diff --git a/README.md b/README.md index 40b0e80beb9e4d0d036c7e8100b2b658a900bf40..2620f497039b8d9f6b52784af93a6ced6fa1c79f 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,8 @@ emoji: ⚡ colorFrom: blue colorTo: indigo sdk: gradio -sdk_version: 4.19.2 +python_version: 3.8 +sdk_version: 4.16.0 app_file: app.py pinned: false --- diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..36162dda72c487b2da63ec6e25ceea722398fa0f --- /dev/null +++ b/app.py @@ -0,0 +1,130 @@ +import os +import sys +import os.path as osp +from pathlib import Path +import cv2 +import gradio as gr +import torch +import math + +try: + import mmpose +except: + os.system('pip install /home/user/app/main/transformer_utils') + +os.system('cp -rf /home/user/app/assets/conversions.py /home/user/.pyenv/versions/3.8.18/lib/python3.8/site-packages/torchgeometry/core/conversions.py') +DEFAULT_MODEL='smpler_x_h32' +OUT_FOLDER = '/home/user/app/demo_out' +os.makedirs(OUT_FOLDER, exist_ok=True) +num_gpus = 1 if torch.cuda.is_available() else -1 +print("!!!", torch.cuda.is_available()) +print(torch.cuda.device_count()) +print(torch.version.cuda) +index = torch.cuda.current_device() +print(index) +print(torch.cuda.get_device_name(index)) +from main.inference import Inferer +inferer = Inferer(DEFAULT_MODEL, num_gpus, OUT_FOLDER) + +def infer(video_input, in_threshold=0.5, num_people="Single person", render_mesh=False): + os.system(f'rm -rf {OUT_FOLDER}/*') + multi_person = False if (num_people == "Single person") else True + cap = cv2.VideoCapture(video_input) + fps = math.ceil(cap.get(5)) + width = int(cap.get(3)) + height = int(cap.get(4)) + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_path = osp.join(OUT_FOLDER, f'out.m4v') + final_video_path = osp.join(OUT_FOLDER, f'out.mp4') + video_output = cv2.VideoWriter(video_path, fourcc, fps, (width, height)) + success = 1 + frame = 0 + while success: + success, original_img = cap.read() + if not success: + break + frame += 1 + img, mesh_paths, smplx_paths = inferer.infer(original_img, in_threshold, frame, multi_person, not(render_mesh)) + video_output.write(img) + cap.release() + video_output.release() + cv2.destroyAllWindows() + os.system(f'ffmpeg -i {video_path} -c copy {final_video_path}') + + #Compress mesh and smplx files + save_path_mesh = os.path.join(OUT_FOLDER, 'mesh') + save_mesh_file = os.path.join(OUT_FOLDER, 'mesh.zip') + os.makedirs(save_path_mesh, exist_ok= True) + save_path_smplx = os.path.join(OUT_FOLDER, 'smplx') + save_smplx_file = os.path.join(OUT_FOLDER, 'smplx.zip') + os.makedirs(save_path_smplx, exist_ok= True) + os.system(f'zip -r {save_mesh_file} {save_path_mesh}') + os.system(f'zip -r {save_smplx_file} {save_path_smplx}') + return video_path, save_mesh_file, save_smplx_file + +TITLE = '''

SMPLer-X: Scaling Up Expressive Human Pose and Shape Estimation

''' +VIDEO = ''' +
+

''' +DESCRIPTION = ''' +Official Gradio demo for SMPLer-X: Scaling Up Expressive Human Pose and Shape Estimation.
+

+Note: You can drop a video onto the panel (or select one of the examples), + then you will get the 3D reconstructions of the detected humans.

+''' + +with gr.Blocks(title="SMPLer-X", css=".gradio-container") as demo: + + gr.Markdown(TITLE) + gr.HTML(VIDEO) + gr.Markdown(DESCRIPTION) + + with gr.Row(): + with gr.Column(): + video_input = gr.Video(label="Input video", elem_classes="video") + threshold = gr.Slider(0, 1.0, value=0.5, label='BBox detection threshold') + with gr.Column(scale=2): + num_people = gr.Radio( + choices=["Single person", "Multiple people"], + value="Single person", + label="Number of people", + info="Choose how many people are in the video. Choose 'Single person' for faster inference.", + interactive=True, + scale=1,) + gr.HTML("""
""") + mesh_as_vertices = gr.Checkbox( + label="Render mesh", + info="By default, the reconstructions of human bodies are expressed as vertices for faster inference. Check this option if you want to render the human body with mesh.", + interactive=True, + scale=1,) + + send_button = gr.Button("Infer") + gr.HTML("""
""") + + with gr.Row(): + with gr.Column(): + video_output = gr.Video(elem_classes="video") + with gr.Column(): + meshes_output = gr.File(label="3D meshes") + smplx_output = gr.File(label= "SMPL-X models") + # example_images = gr.Examples([]) + send_button.click(fn=infer, inputs=[video_input, threshold, num_people, mesh_as_vertices], outputs=[video_output, meshes_output, smplx_output]) + # with gr.Row(): + example_videos = gr.Examples([ + ['/home/user/app/assets/01.mp4'], + ['/home/user/app/assets/02.mp4'], + ['/home/user/app/assets/03.mp4'], + ['/home/user/app/assets/04.mp4'], + ['/home/user/app/assets/05.mp4'], + ['/home/user/app/assets/06.mp4'], + ['/home/user/app/assets/07.mp4'], + ['/home/user/app/assets/08.mp4'], + ['/home/user/app/assets/09.mp4'], + ], + inputs=[video_input, 0.5]) + +#demo.queue() +demo.launch(debug=True) diff --git a/assets/conversions.py b/assets/conversions.py new file mode 100644 index 0000000000000000000000000000000000000000..0e96d56dd58c419037d7565129b66a1f64cfd568 --- /dev/null +++ b/assets/conversions.py @@ -0,0 +1,523 @@ +import torch +import torch.nn as nn + +import torchgeometry as tgm + +__all__ = [ + # functional api + "pi", + "rad2deg", + "deg2rad", + "convert_points_from_homogeneous", + "convert_points_to_homogeneous", + "angle_axis_to_rotation_matrix", + "rotation_matrix_to_angle_axis", + "rotation_matrix_to_quaternion", + "quaternion_to_angle_axis", + "angle_axis_to_quaternion", + "rtvec_to_pose", + # layer api + "RadToDeg", + "DegToRad", + "ConvertPointsFromHomogeneous", + "ConvertPointsToHomogeneous", +] + + +"""Constant with number pi +""" +pi = torch.Tensor([3.14159265358979323846]) + + +def rad2deg(tensor): + r"""Function that converts angles from radians to degrees. + + See :class:`~torchgeometry.RadToDeg` for details. + + Args: + tensor (Tensor): Tensor of arbitrary shape. + + Returns: + Tensor: Tensor with same shape as input. + + Example: + >>> input = tgm.pi * torch.rand(1, 3, 3) + >>> output = tgm.rad2deg(input) + """ + if not torch.is_tensor(tensor): + raise TypeError("Input type is not a torch.Tensor. Got {}" + .format(type(tensor))) + + return 180. * tensor / pi.to(tensor.device).type(tensor.dtype) + + +def deg2rad(tensor): + r"""Function that converts angles from degrees to radians. + + See :class:`~torchgeometry.DegToRad` for details. + + Args: + tensor (Tensor): Tensor of arbitrary shape. + + Returns: + Tensor: Tensor with same shape as input. + + Examples:: + + >>> input = 360. * torch.rand(1, 3, 3) + >>> output = tgm.deg2rad(input) + """ + if not torch.is_tensor(tensor): + raise TypeError("Input type is not a torch.Tensor. Got {}" + .format(type(tensor))) + + return tensor * pi.to(tensor.device).type(tensor.dtype) / 180. + + +def convert_points_from_homogeneous(points): + r"""Function that converts points from homogeneous to Euclidean space. + + See :class:`~torchgeometry.ConvertPointsFromHomogeneous` for details. + + Examples:: + + >>> input = torch.rand(2, 4, 3) # BxNx3 + >>> output = tgm.convert_points_from_homogeneous(input) # BxNx2 + """ + if not torch.is_tensor(points): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(points))) + if len(points.shape) < 2: + raise ValueError("Input must be at least a 2D tensor. Got {}".format( + points.shape)) + + return points[..., :-1] / points[..., -1:] + + +def convert_points_to_homogeneous(points): + r"""Function that converts points from Euclidean to homogeneous space. + + See :class:`~torchgeometry.ConvertPointsToHomogeneous` for details. 
+ + Examples:: + + >>> input = torch.rand(2, 4, 3) # BxNx3 + >>> output = tgm.convert_points_to_homogeneous(input) # BxNx4 + """ + if not torch.is_tensor(points): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(points))) + if len(points.shape) < 2: + raise ValueError("Input must be at least a 2D tensor. Got {}".format( + points.shape)) + + return nn.functional.pad(points, (0, 1), "constant", 1.0) + + +def angle_axis_to_rotation_matrix(angle_axis): + """Convert 3d vector of axis-angle rotation to 4x4 rotation matrix + + Args: + angle_axis (Tensor): tensor of 3d vector of axis-angle rotations. + + Returns: + Tensor: tensor of 4x4 rotation matrices. + + Shape: + - Input: :math:`(N, 3)` + - Output: :math:`(N, 4, 4)` + + Example: + >>> input = torch.rand(1, 3) # Nx3 + >>> output = tgm.angle_axis_to_rotation_matrix(input) # Nx4x4 + """ + def _compute_rotation_matrix(angle_axis, theta2, eps=1e-6): + # We want to be careful to only evaluate the square root if the + # norm of the angle_axis vector is greater than zero. Otherwise + # we get a division by zero. + k_one = 1.0 + theta = torch.sqrt(theta2) + wxyz = angle_axis / (theta + eps) + wx, wy, wz = torch.chunk(wxyz, 3, dim=1) + cos_theta = torch.cos(theta) + sin_theta = torch.sin(theta) + + r00 = cos_theta + wx * wx * (k_one - cos_theta) + r10 = wz * sin_theta + wx * wy * (k_one - cos_theta) + r20 = -wy * sin_theta + wx * wz * (k_one - cos_theta) + r01 = wx * wy * (k_one - cos_theta) - wz * sin_theta + r11 = cos_theta + wy * wy * (k_one - cos_theta) + r21 = wx * sin_theta + wy * wz * (k_one - cos_theta) + r02 = wy * sin_theta + wx * wz * (k_one - cos_theta) + r12 = -wx * sin_theta + wy * wz * (k_one - cos_theta) + r22 = cos_theta + wz * wz * (k_one - cos_theta) + rotation_matrix = torch.cat( + [r00, r01, r02, r10, r11, r12, r20, r21, r22], dim=1) + return rotation_matrix.view(-1, 3, 3) + + def _compute_rotation_matrix_taylor(angle_axis): + rx, ry, rz = torch.chunk(angle_axis, 3, dim=1) + k_one = torch.ones_like(rx) + rotation_matrix = torch.cat( + [k_one, -rz, ry, rz, k_one, -rx, -ry, rx, k_one], dim=1) + return rotation_matrix.view(-1, 3, 3) + + # stolen from ceres/rotation.h + + _angle_axis = torch.unsqueeze(angle_axis, dim=1) + theta2 = torch.matmul(_angle_axis, _angle_axis.transpose(1, 2)) + theta2 = torch.squeeze(theta2, dim=1) + + # compute rotation matrices + rotation_matrix_normal = _compute_rotation_matrix(angle_axis, theta2) + rotation_matrix_taylor = _compute_rotation_matrix_taylor(angle_axis) + + # create mask to handle both cases + eps = 1e-6 + mask = (theta2 > eps).view(-1, 1, 1).to(theta2.device) + mask_pos = (mask).type_as(theta2) + mask_neg = (mask == False).type_as(theta2) # noqa + + # create output pose matrix + batch_size = angle_axis.shape[0] + rotation_matrix = torch.eye(4).to(angle_axis.device).type_as(angle_axis) + rotation_matrix = rotation_matrix.view(1, 4, 4).repeat(batch_size, 1, 1) + # fill output matrix with masked values + rotation_matrix[..., :3, :3] = \ + mask_pos * rotation_matrix_normal + mask_neg * rotation_matrix_taylor + return rotation_matrix # Nx4x4 + + +def rtvec_to_pose(rtvec): + """ + Convert axis-angle rotation and translation vector to 4x4 pose matrix + + Args: + rtvec (Tensor): Rodrigues vector transformations + + Returns: + Tensor: transformation matrices + + Shape: + - Input: :math:`(N, 6)` + - Output: :math:`(N, 4, 4)` + + Example: + >>> input = torch.rand(3, 6) # Nx6 + >>> output = tgm.rtvec_to_pose(input) # Nx4x4 + """ + assert rtvec.shape[-1] == 6, 'rtvec=[rx, ry, 
rz, tx, ty, tz]' + pose = angle_axis_to_rotation_matrix(rtvec[..., :3]) + pose[..., :3, 3] = rtvec[..., 3:] + return pose + + +def rotation_matrix_to_angle_axis(rotation_matrix): + """Convert 3x4 rotation matrix to Rodrigues vector + + Args: + rotation_matrix (Tensor): rotation matrix. + + Returns: + Tensor: Rodrigues vector transformation. + + Shape: + - Input: :math:`(N, 3, 4)` + - Output: :math:`(N, 3)` + + Example: + >>> input = torch.rand(2, 3, 4) # Nx4x4 + >>> output = tgm.rotation_matrix_to_angle_axis(input) # Nx3 + """ + # todo add check that matrix is a valid rotation matrix + quaternion = rotation_matrix_to_quaternion(rotation_matrix) + return quaternion_to_angle_axis(quaternion) + + +def rotation_matrix_to_quaternion(rotation_matrix, eps=1e-6): + """Convert 3x4 rotation matrix to 4d quaternion vector + + This algorithm is based on algorithm described in + https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L201 + + Args: + rotation_matrix (Tensor): the rotation matrix to convert. + + Return: + Tensor: the rotation in quaternion + + Shape: + - Input: :math:`(N, 3, 4)` + - Output: :math:`(N, 4)` + + Example: + >>> input = torch.rand(4, 3, 4) # Nx3x4 + >>> output = tgm.rotation_matrix_to_quaternion(input) # Nx4 + """ + if not torch.is_tensor(rotation_matrix): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(rotation_matrix))) + + if len(rotation_matrix.shape) > 3: + raise ValueError( + "Input size must be a three dimensional tensor. Got {}".format( + rotation_matrix.shape)) + if not rotation_matrix.shape[-2:] == (3, 4): + raise ValueError( + "Input size must be a N x 3 x 4 tensor. Got {}".format( + rotation_matrix.shape)) + + rmat_t = torch.transpose(rotation_matrix, 1, 2) + + mask_d2 = rmat_t[:, 2, 2] < eps + + mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1] + mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1] + + t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2] + q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1], + t0, rmat_t[:, 0, 1] + rmat_t[:, 1, 0], + rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1) + t0_rep = t0.repeat(4, 1).t() + + t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2] + q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2], + rmat_t[:, 0, 1] + rmat_t[:, 1, 0], + t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1) + t1_rep = t1.repeat(4, 1).t() + + t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2] + q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0], + rmat_t[:, 2, 0] + rmat_t[:, 0, 2], + rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2], -1) + t2_rep = t2.repeat(4, 1).t() + + t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2] + q3 = torch.stack([t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1], + rmat_t[:, 2, 0] - rmat_t[:, 0, 2], + rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1) + t3_rep = t3.repeat(4, 1).t() + + mask_c0 = mask_d2 * mask_d0_d1 + mask_c1 = mask_d2 * ~(mask_d0_d1) + mask_c2 = ~(mask_d2) * mask_d0_nd1 + mask_c3 = ~(mask_d2) * ~(mask_d0_nd1) + mask_c0 = mask_c0.view(-1, 1).type_as(q0) + mask_c1 = mask_c1.view(-1, 1).type_as(q1) + mask_c2 = mask_c2.view(-1, 1).type_as(q2) + mask_c3 = mask_c3.view(-1, 1).type_as(q3) + + q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3 + q /= torch.sqrt(t0_rep * mask_c0 + t1_rep * mask_c1 + # noqa + t2_rep * mask_c2 + t3_rep * mask_c3) # noqa + q *= 0.5 + return q + + +def quaternion_to_angle_axis(quaternion: torch.Tensor) -> torch.Tensor: + """Convert quaternion vector to angle axis of rotation. 
+ + Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h + + Args: + quaternion (torch.Tensor): tensor with quaternions. + + Return: + torch.Tensor: tensor with angle axis of rotation. + + Shape: + - Input: :math:`(*, 4)` where `*` means, any number of dimensions + - Output: :math:`(*, 3)` + + Example: + >>> quaternion = torch.rand(2, 4) # Nx4 + >>> angle_axis = tgm.quaternion_to_angle_axis(quaternion) # Nx3 + """ + if not torch.is_tensor(quaternion): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(quaternion))) + + if not quaternion.shape[-1] == 4: + raise ValueError("Input must be a tensor of shape Nx4 or 4. Got {}" + .format(quaternion.shape)) + # unpack input and compute conversion + q1: torch.Tensor = quaternion[..., 1] + q2: torch.Tensor = quaternion[..., 2] + q3: torch.Tensor = quaternion[..., 3] + sin_squared_theta: torch.Tensor = q1 * q1 + q2 * q2 + q3 * q3 + + sin_theta: torch.Tensor = torch.sqrt(sin_squared_theta) + cos_theta: torch.Tensor = quaternion[..., 0] + two_theta: torch.Tensor = 2.0 * torch.where( + cos_theta < 0.0, + torch.atan2(-sin_theta, -cos_theta), + torch.atan2(sin_theta, cos_theta)) + + k_pos: torch.Tensor = two_theta / sin_theta + k_neg: torch.Tensor = 2.0 * torch.ones_like(sin_theta) + k: torch.Tensor = torch.where(sin_squared_theta > 0.0, k_pos, k_neg) + + angle_axis: torch.Tensor = torch.zeros_like(quaternion)[..., :3] + angle_axis[..., 0] += q1 * k + angle_axis[..., 1] += q2 * k + angle_axis[..., 2] += q3 * k + return angle_axis + +# based on: +# https://github.com/facebookresearch/QuaterNet/blob/master/common/quaternion.py#L138 + + +def angle_axis_to_quaternion(angle_axis: torch.Tensor) -> torch.Tensor: + """Convert an angle axis to a quaternion. + + Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h + + Args: + angle_axis (torch.Tensor): tensor with angle axis. + + Return: + torch.Tensor: tensor with quaternion. + + Shape: + - Input: :math:`(*, 3)` where `*` means, any number of dimensions + - Output: :math:`(*, 4)` + + Example: + >>> angle_axis = torch.rand(2, 4) # Nx4 + >>> quaternion = tgm.angle_axis_to_quaternion(angle_axis) # Nx3 + """ + if not torch.is_tensor(angle_axis): + raise TypeError("Input type is not a torch.Tensor. Got {}".format( + type(angle_axis))) + + if not angle_axis.shape[-1] == 3: + raise ValueError("Input must be a tensor of shape Nx3 or 3. Got {}" + .format(angle_axis.shape)) + # unpack input and compute conversion + a0: torch.Tensor = angle_axis[..., 0:1] + a1: torch.Tensor = angle_axis[..., 1:2] + a2: torch.Tensor = angle_axis[..., 2:3] + theta_squared: torch.Tensor = a0 * a0 + a1 * a1 + a2 * a2 + + theta: torch.Tensor = torch.sqrt(theta_squared) + half_theta: torch.Tensor = theta * 0.5 + + mask: torch.Tensor = theta_squared > 0.0 + ones: torch.Tensor = torch.ones_like(half_theta) + + k_neg: torch.Tensor = 0.5 * ones + k_pos: torch.Tensor = torch.sin(half_theta) / theta + k: torch.Tensor = torch.where(mask, k_pos, k_neg) + w: torch.Tensor = torch.where(mask, torch.cos(half_theta), ones) + + quaternion: torch.Tensor = torch.zeros_like(angle_axis) + quaternion[..., 0:1] += a0 * k + quaternion[..., 1:2] += a1 * k + quaternion[..., 2:3] += a2 * k + return torch.cat([w, quaternion], dim=-1) + +# TODO: add below funtionalities +# - pose_to_rtvec + + +# layer api + + +class RadToDeg(nn.Module): + r"""Creates an object that converts angles from radians to degrees. + + Args: + tensor (Tensor): Tensor of arbitrary shape. + + Returns: + Tensor: Tensor with same shape as input. 
+ + Examples:: + + >>> input = tgm.pi * torch.rand(1, 3, 3) + >>> output = tgm.RadToDeg()(input) + """ + + def __init__(self): + super(RadToDeg, self).__init__() + + def forward(self, input): + return rad2deg(input) + + +class DegToRad(nn.Module): + r"""Function that converts angles from degrees to radians. + + Args: + tensor (Tensor): Tensor of arbitrary shape. + + Returns: + Tensor: Tensor with same shape as input. + + Examples:: + + >>> input = 360. * torch.rand(1, 3, 3) + >>> output = tgm.DegToRad()(input) + """ + + def __init__(self): + super(DegToRad, self).__init__() + + def forward(self, input): + return deg2rad(input) + + +class ConvertPointsFromHomogeneous(nn.Module): + r"""Creates a transformation that converts points from homogeneous to + Euclidean space. + + Args: + points (Tensor): tensor of N-dimensional points. + + Returns: + Tensor: tensor of N-1-dimensional points. + + Shape: + - Input: :math:`(B, D, N)` or :math:`(D, N)` + - Output: :math:`(B, D, N + 1)` or :math:`(D, N + 1)` + + Examples:: + + >>> input = torch.rand(2, 4, 3) # BxNx3 + >>> transform = tgm.ConvertPointsFromHomogeneous() + >>> output = transform(input) # BxNx2 + """ + + def __init__(self): + super(ConvertPointsFromHomogeneous, self).__init__() + + def forward(self, input): + return convert_points_from_homogeneous(input) + + +class ConvertPointsToHomogeneous(nn.Module): + r"""Creates a transformation to convert points from Euclidean to + homogeneous space. + + Args: + points (Tensor): tensor of N-dimensional points. + + Returns: + Tensor: tensor of N+1-dimensional points. + + Shape: + - Input: :math:`(B, D, N)` or :math:`(D, N)` + - Output: :math:`(B, D, N + 1)` or :math:`(D, N + 1)` + + Examples:: + + >>> input = torch.rand(2, 4, 3) # BxNx3 + >>> transform = tgm.ConvertPointsToHomogeneous() + >>> output = transform(input) # BxNx4 + """ + + def __init__(self): + super(ConvertPointsToHomogeneous, self).__init__() + + def forward(self, input): + return convert_points_to_homogeneous(input) diff --git a/common/base.py b/common/base.py new file mode 100644 index 0000000000000000000000000000000000000000..21fe9d3f36f39167879aa84c248c32ec80b12927 --- /dev/null +++ b/common/base.py @@ -0,0 +1,86 @@ +import os.path as osp +import math +import abc +from torch.utils.data import DataLoader +import torch.optim +import torchvision.transforms as transforms +from timer import Timer +from logger import colorlogger +from torch.nn.parallel.data_parallel import DataParallel +from config import cfg +from SMPLer_X import get_model + +# ddp +import torch.distributed as dist +from torch.utils.data import DistributedSampler +import torch.utils.data.distributed +from utils.distribute_utils import ( + get_rank, is_main_process, time_synchronized, get_group_idx, get_process_groups +) +from mmcv.runner import get_dist_info + +class Base(object): + __metaclass__ = abc.ABCMeta + + def __init__(self, log_name='logs.txt'): + self.cur_epoch = 0 + + # timer + self.tot_timer = Timer() + self.gpu_timer = Timer() + self.read_timer = Timer() + + # logger + self.logger = colorlogger(cfg.log_dir, log_name=log_name) + + @abc.abstractmethod + def _make_batch_generator(self): + return + + @abc.abstractmethod + def _make_model(self): + return + +class Demoer(Base): + def __init__(self, test_epoch=None): + if test_epoch is not None: + self.test_epoch = int(test_epoch) + super(Demoer, self).__init__(log_name='test_logs.txt') + + def _make_batch_generator(self, demo_scene): + # data load and construct batch generator + self.logger.info("Creating 
dataset...") + from data.UBody.UBody import UBody + testset_loader = UBody(transforms.ToTensor(), "demo", demo_scene) # eval(demoset)(transforms.ToTensor(), "demo") + batch_generator = DataLoader(dataset=testset_loader, batch_size=cfg.num_gpus * cfg.test_batch_size, + shuffle=False, num_workers=cfg.num_thread, pin_memory=True) + + self.testset = testset_loader + self.batch_generator = batch_generator + + def _make_model(self): + self.logger.info('Load checkpoint from {}'.format(cfg.pretrained_model_path)) + + # prepare network + self.logger.info("Creating graph...") + model = get_model('test') + model = DataParallel(model).to(cfg.device) + ckpt = torch.load(cfg.pretrained_model_path, map_location=cfg.device) + + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in ckpt['network'].items(): + if 'module' not in k: + k = 'module.' + k + k = k.replace('module.backbone', 'module.encoder').replace('body_rotation_net', 'body_regressor').replace( + 'hand_rotation_net', 'hand_regressor') + new_state_dict[k] = v + model.load_state_dict(new_state_dict, strict=False) + model.eval() + + self.model = model + + def _evaluate(self, outs, cur_sample_idx): + eval_result = self.testset.evaluate(outs, cur_sample_idx) + return eval_result + diff --git a/common/logger.py b/common/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..a117118df6e29aea3922dccef7901d136f5a52b0 --- /dev/null +++ b/common/logger.py @@ -0,0 +1,50 @@ +import logging +import os + +OK = '\033[92m' +WARNING = '\033[93m' +FAIL = '\033[91m' +END = '\033[0m' + +PINK = '\033[95m' +BLUE = '\033[94m' +GREEN = OK +RED = FAIL +WHITE = END +YELLOW = WARNING + +class colorlogger(): + def __init__(self, log_dir, log_name='train_logs.txt'): + # set log + self._logger = logging.getLogger(log_name) + self._logger.setLevel(logging.INFO) + log_file = os.path.join(log_dir, log_name) + if not os.path.exists(log_dir): + os.makedirs(log_dir) + file_log = logging.FileHandler(log_file, mode='a') + file_log.setLevel(logging.INFO) + console_log = logging.StreamHandler() + console_log.setLevel(logging.INFO) + formatter = logging.Formatter( + "{}%(asctime)s{} %(message)s".format(GREEN, END), + "%m-%d %H:%M:%S") + file_log.setFormatter(formatter) + console_log.setFormatter(formatter) + self._logger.addHandler(file_log) + self._logger.addHandler(console_log) + + def debug(self, msg): + self._logger.debug(str(msg)) + + def info(self, msg): + self._logger.info(str(msg)) + + def warning(self, msg): + self._logger.warning(WARNING + 'WRN: ' + str(msg) + END) + + def critical(self, msg): + self._logger.critical(RED + 'CRI: ' + str(msg) + END) + + def error(self, msg): + self._logger.error(RED + 'ERR: ' + str(msg) + END) + diff --git a/common/nets/layer.py b/common/nets/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..69d61bb01d8b5782b61b7b36b4152a2053bfbb80 --- /dev/null +++ b/common/nets/layer.py @@ -0,0 +1,53 @@ +import torch.nn as nn + +def make_linear_layers(feat_dims, relu_final=True, use_bn=False): + layers = [] + for i in range(len(feat_dims)-1): + layers.append(nn.Linear(feat_dims[i], feat_dims[i+1])) + + # Do not use ReLU for final estimation + if i < len(feat_dims)-2 or (i == len(feat_dims)-2 and relu_final): + if use_bn: + layers.append(nn.BatchNorm1d(feat_dims[i+1])) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + +def make_conv_layers(feat_dims, kernel=3, stride=1, padding=1, bnrelu_final=True): + layers = [] + for i in range(len(feat_dims)-1): + 
layers.append( + nn.Conv2d( + in_channels=feat_dims[i], + out_channels=feat_dims[i+1], + kernel_size=kernel, + stride=stride, + padding=padding + )) + # Do not use BN and ReLU for final estimation + if i < len(feat_dims)-2 or (i == len(feat_dims)-2 and bnrelu_final): + layers.append(nn.BatchNorm2d(feat_dims[i+1])) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + +def make_deconv_layers(feat_dims, bnrelu_final=True): + layers = [] + for i in range(len(feat_dims)-1): + layers.append( + nn.ConvTranspose2d( + in_channels=feat_dims[i], + out_channels=feat_dims[i+1], + kernel_size=4, + stride=2, + padding=1, + output_padding=0, + bias=False)) + + # Do not use BN and ReLU for final estimation + if i < len(feat_dims)-2 or (i == len(feat_dims)-2 and bnrelu_final): + layers.append(nn.BatchNorm2d(feat_dims[i+1])) + layers.append(nn.ReLU(inplace=True)) + + return nn.Sequential(*layers) + diff --git a/common/nets/loss.py b/common/nets/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..7fd3298ef71128d4bfe58099c8ee7f80aa6215a9 --- /dev/null +++ b/common/nets/loss.py @@ -0,0 +1,30 @@ +import torch +import torch.nn as nn + +class CoordLoss(nn.Module): + def __init__(self): + super(CoordLoss, self).__init__() + + def forward(self, coord_out, coord_gt, valid, is_3D=None): + loss = torch.abs(coord_out - coord_gt) * valid + if is_3D is not None: + loss_z = loss[:,:,2:] * is_3D[:,None,None].float() + loss = torch.cat((loss[:,:,:2], loss_z),2) + return loss + +class ParamLoss(nn.Module): + def __init__(self): + super(ParamLoss, self).__init__() + + def forward(self, param_out, param_gt, valid): + loss = torch.abs(param_out - param_gt) * valid + return loss + +class CELoss(nn.Module): + def __init__(self): + super(CELoss, self).__init__() + self.ce_loss = nn.CrossEntropyLoss(reduction='none') + + def forward(self, out, gt_index): + loss = self.ce_loss(out, gt_index) + return loss diff --git a/common/nets/smpler_x.py b/common/nets/smpler_x.py new file mode 100644 index 0000000000000000000000000000000000000000..60610da8528f266bf6c99a50ac11b4549ee77958 --- /dev/null +++ b/common/nets/smpler_x.py @@ -0,0 +1,172 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +from nets.layer import make_conv_layers, make_linear_layers, make_deconv_layers +from utils.transforms import sample_joint_features, soft_argmax_2d, soft_argmax_3d +from utils.human_models import smpl_x +from config import cfg +from mmcv.ops.roi_align import roi_align + +class PositionNet(nn.Module): + def __init__(self, part, feat_dim=768): + super(PositionNet, self).__init__() + if part == 'body': + self.joint_num = len(smpl_x.pos_joint_part['body']) + self.hm_shape = cfg.output_hm_shape + elif part == 'hand': + self.joint_num = len(smpl_x.pos_joint_part['rhand']) + self.hm_shape = cfg.output_hand_hm_shape + self.conv = make_conv_layers([feat_dim, self.joint_num * self.hm_shape[0]], kernel=1, stride=1, padding=0, bnrelu_final=False) + + def forward(self, img_feat): + joint_hm = self.conv(img_feat).view(-1, self.joint_num, self.hm_shape[0], self.hm_shape[1], self.hm_shape[2]) + joint_coord = soft_argmax_3d(joint_hm) + joint_hm = F.softmax(joint_hm.view(-1, self.joint_num, self.hm_shape[0] * self.hm_shape[1] * self.hm_shape[2]), 2) + joint_hm = joint_hm.view(-1, self.joint_num, self.hm_shape[0], self.hm_shape[1], self.hm_shape[2]) + return joint_hm, joint_coord + +class HandRotationNet(nn.Module): + def __init__(self, part, feat_dim = 768): + super(HandRotationNet, self).__init__() 
+ self.part = part + self.joint_num = len(smpl_x.pos_joint_part['rhand']) + self.hand_conv = make_conv_layers([feat_dim, 512], kernel=1, stride=1, padding=0) + self.hand_pose_out = make_linear_layers([self.joint_num * 515, len(smpl_x.orig_joint_part['rhand']) * 6], relu_final=False) + self.feat_dim = feat_dim + + def forward(self, img_feat, joint_coord_img): + batch_size = img_feat.shape[0] + img_feat = self.hand_conv(img_feat) + img_feat_joints = sample_joint_features(img_feat, joint_coord_img[:, :, :2]) + feat = torch.cat((img_feat_joints, joint_coord_img), 2) # batch_size, joint_num, 512+3 + hand_pose = self.hand_pose_out(feat.view(batch_size, -1)) + return hand_pose + +class BodyRotationNet(nn.Module): + def __init__(self, feat_dim = 768): + super(BodyRotationNet, self).__init__() + self.joint_num = len(smpl_x.pos_joint_part['body']) + self.body_conv = make_linear_layers([feat_dim, 512], relu_final=False) + self.root_pose_out = make_linear_layers([self.joint_num * (512+3), 6], relu_final=False) + self.body_pose_out = make_linear_layers( + [self.joint_num * (512+3), (len(smpl_x.orig_joint_part['body']) - 1) * 6], relu_final=False) # without root + self.shape_out = make_linear_layers([feat_dim, smpl_x.shape_param_dim], relu_final=False) + self.cam_out = make_linear_layers([feat_dim, 3], relu_final=False) + self.feat_dim = feat_dim + + def forward(self, body_pose_token, shape_token, cam_token, body_joint_img): + batch_size = body_pose_token.shape[0] + + # shape parameter + shape_param = self.shape_out(shape_token) + + # camera parameter + cam_param = self.cam_out(cam_token) + + # body pose parameter + body_pose_token = self.body_conv(body_pose_token) + body_pose_token = torch.cat((body_pose_token, body_joint_img), 2) + root_pose = self.root_pose_out(body_pose_token.view(batch_size, -1)) + body_pose = self.body_pose_out(body_pose_token.view(batch_size, -1)) + + return root_pose, body_pose, shape_param, cam_param + +class FaceRegressor(nn.Module): + def __init__(self, feat_dim=768): + super(FaceRegressor, self).__init__() + self.expr_out = make_linear_layers([feat_dim, smpl_x.expr_code_dim], relu_final=False) + self.jaw_pose_out = make_linear_layers([feat_dim, 6], relu_final=False) + + def forward(self, expr_token, jaw_pose_token): + expr_param = self.expr_out(expr_token) # expression parameter + jaw_pose = self.jaw_pose_out(jaw_pose_token) # jaw pose parameter + return expr_param, jaw_pose + +class BoxNet(nn.Module): + def __init__(self, feat_dim=768): + super(BoxNet, self).__init__() + self.joint_num = len(smpl_x.pos_joint_part['body']) + self.deconv = make_deconv_layers([feat_dim + self.joint_num * cfg.output_hm_shape[0], 256, 256, 256]) + self.bbox_center = make_conv_layers([256, 3], kernel=1, stride=1, padding=0, bnrelu_final=False) + self.lhand_size = make_linear_layers([256, 256, 2], relu_final=False) + self.rhand_size = make_linear_layers([256, 256, 2], relu_final=False) + self.face_size = make_linear_layers([256, 256, 2], relu_final=False) + + def forward(self, img_feat, joint_hm): + joint_hm = joint_hm.view(joint_hm.shape[0], joint_hm.shape[1] * cfg.output_hm_shape[0], cfg.output_hm_shape[1], cfg.output_hm_shape[2]) + img_feat = torch.cat((img_feat, joint_hm), 1) + img_feat = self.deconv(img_feat) + + # bbox center + bbox_center_hm = self.bbox_center(img_feat) + bbox_center = soft_argmax_2d(bbox_center_hm) + lhand_center, rhand_center, face_center = bbox_center[:, 0, :], bbox_center[:, 1, :], bbox_center[:, 2, :] + + # bbox size + lhand_feat = sample_joint_features(img_feat, 
lhand_center[:, None, :].detach())[:, 0, :] + lhand_size = self.lhand_size(lhand_feat) + rhand_feat = sample_joint_features(img_feat, rhand_center[:, None, :].detach())[:, 0, :] + rhand_size = self.rhand_size(rhand_feat) + face_feat = sample_joint_features(img_feat, face_center[:, None, :].detach())[:, 0, :] + face_size = self.face_size(face_feat) + + lhand_center = lhand_center / 8 + rhand_center = rhand_center / 8 + face_center = face_center / 8 + return lhand_center, lhand_size, rhand_center, rhand_size, face_center, face_size + +class BoxSizeNet(nn.Module): + def __init__(self): + super(BoxSizeNet, self).__init__() + self.lhand_size = make_linear_layers([256, 256, 2], relu_final=False) + self.rhand_size = make_linear_layers([256, 256, 2], relu_final=False) + self.face_size = make_linear_layers([256, 256, 2], relu_final=False) + + def forward(self, box_fea): + # box_fea: [bs, 3, C] + lhand_size = self.lhand_size(box_fea[:, 0]) + rhand_size = self.rhand_size(box_fea[:, 1]) + face_size = self.face_size(box_fea[:, 2]) + return lhand_size, rhand_size, face_size + +class HandRoI(nn.Module): + def __init__(self, feat_dim=768, upscale=4): + super(HandRoI, self).__init__() + self.upscale = upscale + if upscale==1: + self.deconv = make_conv_layers([feat_dim, feat_dim], kernel=1, stride=1, padding=0, bnrelu_final=False) + self.conv = make_conv_layers([feat_dim, feat_dim], kernel=1, stride=1, padding=0, bnrelu_final=False) + elif upscale==2: + self.deconv = make_deconv_layers([feat_dim, feat_dim//2]) + self.conv = make_conv_layers([feat_dim//2, feat_dim], kernel=1, stride=1, padding=0, bnrelu_final=False) + elif upscale==4: + self.deconv = make_deconv_layers([feat_dim, feat_dim//2, feat_dim//4]) + self.conv = make_conv_layers([feat_dim//4, feat_dim], kernel=1, stride=1, padding=0, bnrelu_final=False) + elif upscale==8: + self.deconv = make_deconv_layers([feat_dim, feat_dim//2, feat_dim//4, feat_dim//8]) + self.conv = make_conv_layers([feat_dim//8, feat_dim], kernel=1, stride=1, padding=0, bnrelu_final=False) + + def forward(self, img_feat, lhand_bbox, rhand_bbox): + lhand_bbox = torch.cat((torch.arange(lhand_bbox.shape[0]).float().to(cfg.device)[:, None], lhand_bbox), + 1) # batch_idx, xmin, ymin, xmax, ymax + rhand_bbox = torch.cat((torch.arange(rhand_bbox.shape[0]).float().to(cfg.device)[:, None], rhand_bbox), + 1) # batch_idx, xmin, ymin, xmax, ymax + img_feat = self.deconv(img_feat) + lhand_bbox_roi = lhand_bbox.clone() + lhand_bbox_roi[:, 1] = lhand_bbox_roi[:, 1] / cfg.input_body_shape[1] * cfg.output_hm_shape[2] * self.upscale + lhand_bbox_roi[:, 2] = lhand_bbox_roi[:, 2] / cfg.input_body_shape[0] * cfg.output_hm_shape[1] * self.upscale + lhand_bbox_roi[:, 3] = lhand_bbox_roi[:, 3] / cfg.input_body_shape[1] * cfg.output_hm_shape[2] * self.upscale + lhand_bbox_roi[:, 4] = lhand_bbox_roi[:, 4] / cfg.input_body_shape[0] * cfg.output_hm_shape[1] * self.upscale + assert (cfg.output_hm_shape[1]*self.upscale, cfg.output_hm_shape[2]*self.upscale) == (img_feat.shape[2], img_feat.shape[3]) + lhand_img_feat = roi_align(img_feat, lhand_bbox_roi, (cfg.output_hand_hm_shape[1], cfg.output_hand_hm_shape[2]), 1.0, 0, 'avg', False) + lhand_img_feat = torch.flip(lhand_img_feat, [3]) # flip to the right hand + + rhand_bbox_roi = rhand_bbox.clone() + rhand_bbox_roi[:, 1] = rhand_bbox_roi[:, 1] / cfg.input_body_shape[1] * cfg.output_hm_shape[2] * self.upscale + rhand_bbox_roi[:, 2] = rhand_bbox_roi[:, 2] / cfg.input_body_shape[0] * cfg.output_hm_shape[1] * self.upscale + rhand_bbox_roi[:, 3] = rhand_bbox_roi[:, 
3] / cfg.input_body_shape[1] * cfg.output_hm_shape[2] * self.upscale + rhand_bbox_roi[:, 4] = rhand_bbox_roi[:, 4] / cfg.input_body_shape[0] * cfg.output_hm_shape[1] * self.upscale + rhand_img_feat = roi_align(img_feat, rhand_bbox_roi, (cfg.output_hand_hm_shape[1], cfg.output_hand_hm_shape[2]), 1.0, 0, 'avg', False) + hand_img_feat = torch.cat((lhand_img_feat, rhand_img_feat)) # [bs, c, cfg.output_hand_hm_shape[2]*scale, cfg.output_hand_hm_shape[1]*scale] + hand_img_feat = self.conv(hand_img_feat) + return hand_img_feat \ No newline at end of file diff --git a/common/timer.py b/common/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..7152ae943c94c66d879a6048671c4427100dad42 --- /dev/null +++ b/common/timer.py @@ -0,0 +1,38 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import time + +class Timer(object): + """A simple timer.""" + def __init__(self): + self.total_time = 0. + self.calls = 0 + self.start_time = 0. + self.diff = 0. + self.average_time = 0. + self.warm_up = 0 + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.diff = time.time() - self.start_time + if self.warm_up < 10: + self.warm_up += 1 + return self.diff + else: + self.total_time += self.diff + self.calls += 1 + self.average_time = self.total_time / self.calls + + if average: + return self.average_time + else: + return self.diff diff --git a/common/utils/__init__.py b/common/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/common/utils/dir.py b/common/utils/dir.py new file mode 100644 index 0000000000000000000000000000000000000000..ae29db5b6b5f317ca67c12c897f6daed6a9d749a --- /dev/null +++ b/common/utils/dir.py @@ -0,0 +1,10 @@ +import os +import sys + +def make_folder(folder_name): + os.makedirs(folder_name, exist_ok=True) + +def add_pypath(path): + if path not in sys.path: + sys.path.insert(0, path) + diff --git a/common/utils/distribute_utils.py b/common/utils/distribute_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5b1c71cd3863fe1f99370d7d072d6389f663959e --- /dev/null +++ b/common/utils/distribute_utils.py @@ -0,0 +1,217 @@ +import mmcv +import os +import os.path as osp +import pickle +import shutil +import tempfile +import time +import torch +import torch.distributed as dist +from mmcv.runner import get_dist_info +import random +import numpy as np +import subprocess + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + # torch.set_deterministic(True) + + +def time_synchronized(): + torch.cuda.synchronize() if torch.cuda.is_available() else None + return time.time() + + +def setup_for_distributed(is_master): + """This function disables printing when not in master process.""" + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def init_distributed_mode(port = None, master_port=29500): + """Initialize slurm distributed training environment. 
+ + If argument ``port`` is not specified, then the master port will be system + environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system + environment variable, then a default port ``29500`` will be used. + + Args: + backend (str): Backend of torch.distributed. + port (int, optional): Master port. Defaults to None. + """ + dist_backend = 'nccl' + proc_id = int(os.environ['SLURM_PROCID']) + ntasks = int(os.environ['SLURM_NTASKS']) + node_list = os.environ['SLURM_NODELIST'] + num_gpus = torch.cuda.device_count() + torch.cuda.set_device(proc_id % num_gpus) + addr = subprocess.getoutput( + f'scontrol show hostname {node_list} | head -n1') + # specify master port + if port is not None: + os.environ['MASTER_PORT'] = str(port) + elif 'MASTER_PORT' in os.environ: + pass # use MASTER_PORT in the environment variable + else: + # 29500 is torch.distributed default port + os.environ['MASTER_PORT'] = str(master_port) + # use MASTER_ADDR in the environment variable if it already exists + if 'MASTER_ADDR' not in os.environ: + os.environ['MASTER_ADDR'] = addr + os.environ['WORLD_SIZE'] = str(ntasks) + os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) + os.environ['RANK'] = str(proc_id) + dist.init_process_group(backend=dist_backend) + + distributed = True + gpu_idx = proc_id % num_gpus + + return distributed, gpu_idx + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + +def get_process_groups(): + world_size = int(os.environ['WORLD_SIZE']) + ranks = list(range(world_size)) + num_gpus = torch.cuda.device_count() + num_nodes = world_size // num_gpus + if world_size % num_gpus != 0: + raise NotImplementedError('Not implemented for node not fully used.') + + groups = [] + for node_idx in range(num_nodes): + groups.append(ranks[node_idx*num_gpus : (node_idx+1)*num_gpus]) + process_groups = [torch.distributed.new_group(group) for group in groups] + + return process_groups + +def get_group_idx(): + num_gpus = torch.cuda.device_count() + proc_id = get_rank() + group_idx = proc_id // num_gpus + + return group_idx + + +def is_main_process(): + return get_rank() == 0 + +def cleanup(): + dist.destroy_process_group() + + +def collect_results(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + tmpdir = tempfile.mkdtemp() + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + 
ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: + Any picklable object + Returns: + data_list(list): + List of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to('cuda') + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device='cuda') + size_list = [torch.tensor([0], device='cuda') for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append( + torch.empty((max_size, ), dtype=torch.uint8, device='cuda')) + if local_size != max_size: + padding = torch.empty( + size=(max_size - local_size, ), dtype=torch.uint8, device='cuda') + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list diff --git a/common/utils/human_model_files/smpl/SMPL_FEMALE.pkl b/common/utils/human_model_files/smpl/SMPL_FEMALE.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c7aee61286b91ee83f1dc8846e7bab306882f30f --- /dev/null +++ b/common/utils/human_model_files/smpl/SMPL_FEMALE.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d4a1791b6b94880397e1a3a4539b703a228d2150c57de7b288389a8115f4ef0 +size 247530000 diff --git a/common/utils/human_model_files/smpl/SMPL_MALE.pkl b/common/utils/human_model_files/smpl/SMPL_MALE.pkl new file mode 100644 index 0000000000000000000000000000000000000000..247d55241f21c4190521321279b1dc6f94be02a3 --- /dev/null +++ b/common/utils/human_model_files/smpl/SMPL_MALE.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed4d55bb3041fefc6f73b70694d6c8edc1020c0d07340be5cc651cae2c6a6ae3 +size 247101031 diff --git a/common/utils/human_model_files/smpl/SMPL_NEUTRAL.pkl b/common/utils/human_model_files/smpl/SMPL_NEUTRAL.pkl new file mode 100644 index 0000000000000000000000000000000000000000..65ae47d34e5b26720c9ccdd2614044832f0e30f2 --- /dev/null +++ b/common/utils/human_model_files/smpl/SMPL_NEUTRAL.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4924f235e63f7c5d5b690acedf736419c2edb846a2d69fc0956169615fa75688 +size 247186228 diff --git a/common/utils/human_model_files/smpl/smpl_uv.npz b/common/utils/human_model_files/smpl/smpl_uv.npz new file mode 100644 index 0000000000000000000000000000000000000000..808dd7dc08e09a82564fde4add97bd1d24f6447c --- /dev/null +++ b/common/utils/human_model_files/smpl/smpl_uv.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb2a1aaf8be2091ebc4344daefae0622cc09252b33d4f6c36ea2c6541a01d469 +size 1524004 diff --git a/common/utils/human_model_files/smplx/MANO_SMPLX_vertex_ids.pkl b/common/utils/human_model_files/smplx/MANO_SMPLX_vertex_ids.pkl new file mode 100644 index 
0000000000000000000000000000000000000000..dabec1377a0da4c511a519a00f51f1a3a23f33af --- /dev/null +++ b/common/utils/human_model_files/smplx/MANO_SMPLX_vertex_ids.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5abe70b6574de25470475091e8008314a5b90127eb48c3e63bfa0adf8c04dcf +size 13535 diff --git a/common/utils/human_model_files/smplx/SMPL-X__FLAME_vertex_ids.npy b/common/utils/human_model_files/smplx/SMPL-X__FLAME_vertex_ids.npy new file mode 100644 index 0000000000000000000000000000000000000000..c940d3aa6cb4cbbcc348fd518b15d8777dc350fd --- /dev/null +++ b/common/utils/human_model_files/smplx/SMPL-X__FLAME_vertex_ids.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e70cdc3659aae699b9732e8dd4af49106310c69b90dc83d9f73e96dbf871e49 +size 40312 diff --git a/common/utils/human_model_files/smplx/SMPLX_FEMALE.npz b/common/utils/human_model_files/smplx/SMPLX_FEMALE.npz new file mode 100755 index 0000000000000000000000000000000000000000..da0a200cd85eb10f73aa36d44f1d9c509a82dfcc --- /dev/null +++ b/common/utils/human_model_files/smplx/SMPLX_FEMALE.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2a3686c9d6d218ff6822fba411c607a3c8125a70af340f384ce68bebecabe0e +size 108794146 diff --git a/common/utils/human_model_files/smplx/SMPLX_MALE.npz b/common/utils/human_model_files/smplx/SMPLX_MALE.npz new file mode 100755 index 0000000000000000000000000000000000000000..41fdef3ff2784eb06bb479ebf5fb6887aafbc183 --- /dev/null +++ b/common/utils/human_model_files/smplx/SMPLX_MALE.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab318e3f37d2bfaae26abf4e6fab445c2a610e1d63714794d60379cc263bc2a5 +size 108753445 diff --git a/common/utils/human_model_files/smplx/SMPLX_NEUTRAL.npz b/common/utils/human_model_files/smplx/SMPLX_NEUTRAL.npz new file mode 100755 index 0000000000000000000000000000000000000000..6f42b326bd60123bd813c0fa2df7f4660862a920 --- /dev/null +++ b/common/utils/human_model_files/smplx/SMPLX_NEUTRAL.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:376021446ddc86e99acacd795182bbef903e61d33b76b9d8b359c2b0865bd992 +size 108752058 diff --git a/common/utils/human_model_files/smplx/SMPLX_NEUTRAL.pkl b/common/utils/human_model_files/smplx/SMPLX_NEUTRAL.pkl new file mode 100755 index 0000000000000000000000000000000000000000..c2ef9ea8a36f2bf51256325bc6d24c181975483c --- /dev/null +++ b/common/utils/human_model_files/smplx/SMPLX_NEUTRAL.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381c808965deb4f5e845f8c3eddb0cd69930cc72e5774ce4f34c4ce3cf058361 +size 544173380 diff --git a/common/utils/human_model_files/smplx/SMPLX_to_J14.pkl b/common/utils/human_model_files/smplx/SMPLX_to_J14.pkl new file mode 100644 index 0000000000000000000000000000000000000000..db8aa5c74b860a2b9555383d5ca2a09523851fe4 --- /dev/null +++ b/common/utils/human_model_files/smplx/SMPLX_to_J14.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5df844ddea85b0a400a2e8dbe63d09d19f2b1b7ec0e0e952daeae08f83d82d61 +size 4692193 diff --git a/common/utils/human_models.py b/common/utils/human_models.py new file mode 100644 index 0000000000000000000000000000000000000000..ccc2bc818e471967b6321c96def08c5f5502a985 --- /dev/null +++ b/common/utils/human_models.py @@ -0,0 +1,176 @@ +import numpy as np +import torch +import os.path as osp +from config import cfg +from utils.smplx import smplx +import pickle + +class SMPLX(object): + def __init__(self): + self.layer_arg = 
{'create_global_orient': False, 'create_body_pose': False, 'create_left_hand_pose': False, 'create_right_hand_pose': False, 'create_jaw_pose': False, 'create_leye_pose': False, 'create_reye_pose': False, 'create_betas': False, 'create_expression': False, 'create_transl': False} + self.layer = {'neutral': smplx.create(cfg.human_model_path, 'smplx', gender='NEUTRAL', use_pca=False, use_face_contour=True, **self.layer_arg), + 'male': smplx.create(cfg.human_model_path, 'smplx', gender='MALE', use_pca=False, use_face_contour=True, **self.layer_arg), + 'female': smplx.create(cfg.human_model_path, 'smplx', gender='FEMALE', use_pca=False, use_face_contour=True, **self.layer_arg) + } + self.vertex_num = 10475 + self.face = self.layer['neutral'].faces + self.shape_param_dim = 10 + self.expr_code_dim = 10 + with open(osp.join(cfg.human_model_path, 'smplx', 'SMPLX_to_J14.pkl'), 'rb') as f: + self.j14_regressor = pickle.load(f, encoding='latin1') + with open(osp.join(cfg.human_model_path, 'smplx', 'MANO_SMPLX_vertex_ids.pkl'), 'rb') as f: + self.hand_vertex_idx = pickle.load(f, encoding='latin1') + self.face_vertex_idx = np.load(osp.join(cfg.human_model_path, 'smplx', 'SMPL-X__FLAME_vertex_ids.npy')) + self.J_regressor = self.layer['neutral'].J_regressor.numpy() + self.J_regressor_idx = {'pelvis': 0, 'lwrist': 20, 'rwrist': 21, 'neck': 12} + self.orig_hand_regressor = self.make_hand_regressor() + #self.orig_hand_regressor = {'left': self.layer.J_regressor.numpy()[[20,37,38,39,25,26,27,28,29,30,34,35,36,31,32,33],:], 'right': self.layer.J_regressor.numpy()[[21,52,53,54,40,41,42,43,44,45,49,50,51,46,47,48],:]} + + # original SMPLX joint set + self.orig_joint_num = 53 # 22 (body joints) + 30 (hand joints) + 1 (face jaw joint) + self.orig_joints_name = \ + ('Pelvis', 'L_Hip', 'R_Hip', 'Spine_1', 'L_Knee', 'R_Knee', 'Spine_2', 'L_Ankle', 'R_Ankle', 'Spine_3', 'L_Foot', 'R_Foot', 'Neck', 'L_Collar', 'R_Collar', 'Head', 'L_Shoulder', 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', # body joints + 'L_Index_1', 'L_Index_2', 'L_Index_3', 'L_Middle_1', 'L_Middle_2', 'L_Middle_3', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Ring_1', 'L_Ring_2', 'L_Ring_3', 'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3', # left hand joints + 'R_Index_1', 'R_Index_2', 'R_Index_3', 'R_Middle_1', 'R_Middle_2', 'R_Middle_3', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Ring_1', 'R_Ring_2', 'R_Ring_3', 'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3', # right hand joints + 'Jaw' # face jaw joint + ) + self.orig_flip_pairs = \ + ( (1,2), (4,5), (7,8), (10,11), (13,14), (16,17), (18,19), (20,21), # body joints + (22,37), (23,38), (24,39), (25,40), (26,41), (27,42), (28,43), (29,44), (30,45), (31,46), (32,47), (33,48), (34,49), (35,50), (36,51) # hand joints + ) + self.orig_root_joint_idx = self.orig_joints_name.index('Pelvis') + self.orig_joint_part = \ + {'body': range(self.orig_joints_name.index('Pelvis'), self.orig_joints_name.index('R_Wrist')+1), + 'lhand': range(self.orig_joints_name.index('L_Index_1'), self.orig_joints_name.index('L_Thumb_3')+1), + 'rhand': range(self.orig_joints_name.index('R_Index_1'), self.orig_joints_name.index('R_Thumb_3')+1), + 'face': range(self.orig_joints_name.index('Jaw'), self.orig_joints_name.index('Jaw')+1)} + + # changed SMPLX joint set for the supervision + self.joint_num = 137 # 25 (body joints) + 40 (hand joints) + 72 (face keypoints) + self.joints_name = \ + ('Pelvis', 'L_Hip', 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle', 'R_Ankle', 'Neck', 'L_Shoulder', 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 
'L_Big_toe', 'L_Small_toe', 'L_Heel', 'R_Big_toe', 'R_Small_toe', 'R_Heel', 'L_Ear', 'R_Ear', 'L_Eye', 'R_Eye', 'Nose',# body joints + 'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3', 'L_Thumb_4', 'L_Index_1', 'L_Index_2', 'L_Index_3', 'L_Index_4', 'L_Middle_1', 'L_Middle_2', 'L_Middle_3', 'L_Middle_4', 'L_Ring_1', 'L_Ring_2', 'L_Ring_3', 'L_Ring_4', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Pinky_4', # left hand joints + 'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3', 'R_Thumb_4', 'R_Index_1', 'R_Index_2', 'R_Index_3', 'R_Index_4', 'R_Middle_1', 'R_Middle_2', 'R_Middle_3', 'R_Middle_4', 'R_Ring_1', 'R_Ring_2', 'R_Ring_3', 'R_Ring_4', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Pinky_4', # right hand joints + *['Face_' + str(i) for i in range(1,73)] # face keypoints (too many keypoints... omit real names. have same name of keypoints defined in FLAME class) + ) + self.root_joint_idx = self.joints_name.index('Pelvis') + self.lwrist_idx = self.joints_name.index('L_Wrist') + self.rwrist_idx = self.joints_name.index('R_Wrist') + self.neck_idx = self.joints_name.index('Neck') + self.flip_pairs = \ + ( (1,2), (3,4), (5,6), (8,9), (10,11), (12,13), (14,17), (15,18), (16,19), (20,21), (22,23), # body joints + (25,45), (26,46), (27,47), (28,48), (29,49), (30,50), (31,51), (32,52), (33,53), (34,54), (35,55), (36,56), (37,57), (38,58), (39,59), (40,60), (41,61), (42,62), (43,63), (44,64), # hand joints + (67,68), # face eyeballs + (69,78), (70,77), (71,76), (72,75), (73,74), # face eyebrow + (83,87), (84,86), # face below nose + (88,97), (89,96), (90,95), (91,94), (92,99), (93,98), # face eyes + (100,106), (101,105), (102,104), (107,111), (108,110), # face mouth + (112,116), (113,115), (117,119), # face lip + (120,136), (121,135), (122,134), (123,133), (124,132), (125,131), (126,130), (127,129) # face contours + ) + self.joint_idx = \ + (0,1,2,4,5,7,8,12,16,17,18,19,20,21,60,61,62,63,64,65,59,58,57,56,55, # body joints + 37,38,39,66,25,26,27,67,28,29,30,68,34,35,36,69,31,32,33,70, # left hand joints + 52,53,54,71,40,41,42,72,43,44,45,73,49,50,51,74,46,47,48,75, # right hand joints + 22,15, # jaw, head + 57,56, # eyeballs + 76,77,78,79,80,81,82,83,84,85, # eyebrow + 86,87,88,89, # nose + 90,91,92,93,94, # below nose + 95,96,97,98,99,100,101,102,103,104,105,106, # eyes + 107, # right mouth + 108,109,110,111,112, # upper mouth + 113, # left mouth + 114,115,116,117,118, # lower mouth + 119, # right lip + 120,121,122, # upper lip + 123, # left lip + 124,125,126, # lower lip + 127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143 # face contour + ) + self.joint_part = \ + {'body': range(self.joints_name.index('Pelvis'), self.joints_name.index('Nose')+1), + 'lhand': range(self.joints_name.index('L_Thumb_1'), self.joints_name.index('L_Pinky_4')+1), + 'rhand': range(self.joints_name.index('R_Thumb_1'), self.joints_name.index('R_Pinky_4')+1), + 'hand': range(self.joints_name.index('L_Thumb_1'), self.joints_name.index('R_Pinky_4')+1), + 'face': range(self.joints_name.index('Face_1'), self.joints_name.index('Face_72')+1)} + + # changed SMPLX joint set for PositionNet prediction + self.pos_joint_num = 65 # 25 (body joints) + 40 (hand joints) + self.pos_joints_name = \ + ('Pelvis', 'L_Hip', 'R_Hip', 'L_Knee', 'R_Knee', 'L_Ankle', 'R_Ankle', 'Neck', 'L_Shoulder', 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Big_toe', 'L_Small_toe', 'L_Heel', 'R_Big_toe', 'R_Small_toe', 'R_Heel', 'L_Ear', 'R_Ear', 'L_Eye', 'R_Eye', 'Nose', # body joints + 'L_Thumb_1', 'L_Thumb_2', 'L_Thumb_3', 'L_Thumb_4', 'L_Index_1', 
'L_Index_2', 'L_Index_3', 'L_Index_4', 'L_Middle_1', 'L_Middle_2', 'L_Middle_3', 'L_Middle_4', 'L_Ring_1', 'L_Ring_2', 'L_Ring_3', 'L_Ring_4', 'L_Pinky_1', 'L_Pinky_2', 'L_Pinky_3', 'L_Pinky_4', # left hand joints + 'R_Thumb_1', 'R_Thumb_2', 'R_Thumb_3', 'R_Thumb_4', 'R_Index_1', 'R_Index_2', 'R_Index_3', 'R_Index_4', 'R_Middle_1', 'R_Middle_2', 'R_Middle_3', 'R_Middle_4', 'R_Ring_1', 'R_Ring_2', 'R_Ring_3', 'R_Ring_4', 'R_Pinky_1', 'R_Pinky_2', 'R_Pinky_3', 'R_Pinky_4', # right hand joints + ) + self.pos_joint_part = \ + {'body': range(self.pos_joints_name.index('Pelvis'), self.pos_joints_name.index('Nose')+1), + 'lhand': range(self.pos_joints_name.index('L_Thumb_1'), self.pos_joints_name.index('L_Pinky_4')+1), + 'rhand': range(self.pos_joints_name.index('R_Thumb_1'), self.pos_joints_name.index('R_Pinky_4')+1), + 'hand': range(self.pos_joints_name.index('L_Thumb_1'), self.pos_joints_name.index('R_Pinky_4')+1)} + self.pos_joint_part['L_MCP'] = [self.pos_joints_name.index('L_Index_1') - len(self.pos_joint_part['body']), + self.pos_joints_name.index('L_Middle_1') - len(self.pos_joint_part['body']), + self.pos_joints_name.index('L_Ring_1') - len(self.pos_joint_part['body']), + self.pos_joints_name.index('L_Pinky_1') - len(self.pos_joint_part['body'])] + self.pos_joint_part['R_MCP'] = [self.pos_joints_name.index('R_Index_1') - len(self.pos_joint_part['body']) - len(self.pos_joint_part['lhand']), + self.pos_joints_name.index('R_Middle_1') - len(self.pos_joint_part['body']) - len(self.pos_joint_part['lhand']), + self.pos_joints_name.index('R_Ring_1') - len(self.pos_joint_part['body']) - len(self.pos_joint_part['lhand']), + self.pos_joints_name.index('R_Pinky_1') - len(self.pos_joint_part['body']) - len(self.pos_joint_part['lhand'])] + + def make_hand_regressor(self): + regressor = self.layer['neutral'].J_regressor.numpy() + lhand_regressor = np.concatenate((regressor[[20,37,38,39],:], + np.eye(self.vertex_num)[5361,None], + regressor[[25,26,27],:], + np.eye(self.vertex_num)[4933,None], + regressor[[28,29,30],:], + np.eye(self.vertex_num)[5058,None], + regressor[[34,35,36],:], + np.eye(self.vertex_num)[5169,None], + regressor[[31,32,33],:], + np.eye(self.vertex_num)[5286,None])) + rhand_regressor = np.concatenate((regressor[[21,52,53,54],:], + np.eye(self.vertex_num)[8079,None], + regressor[[40,41,42],:], + np.eye(self.vertex_num)[7669,None], + regressor[[43,44,45],:], + np.eye(self.vertex_num)[7794,None], + regressor[[49,50,51],:], + np.eye(self.vertex_num)[7905,None], + regressor[[46,47,48],:], + np.eye(self.vertex_num)[8022,None])) + hand_regressor = {'left': lhand_regressor, 'right': rhand_regressor} + return hand_regressor + + + def reduce_joint_set(self, joint): + new_joint = [] + for name in self.pos_joints_name: + idx = self.joints_name.index(name) + new_joint.append(joint[:,idx,:]) + new_joint = torch.stack(new_joint,1) + return new_joint + +class SMPL(object): + def __init__(self): + self.layer_arg = {'create_body_pose': False, 'create_betas': False, 'create_global_orient': False, 'create_transl': False} + self.layer = {'neutral': smplx.create(cfg.human_model_path, 'smpl', gender='NEUTRAL', **self.layer_arg), 'male': smplx.create(cfg.human_model_path, 'smpl', gender='MALE', **self.layer_arg), 'female': smplx.create(cfg.human_model_path, 'smpl', gender='FEMALE', **self.layer_arg)} + self.vertex_num = 6890 + self.face = self.layer['neutral'].faces + self.shape_param_dim = 10 + self.vposer_code_dim = 32 + + # original SMPL joint set + self.orig_joint_num = 24 + self.orig_joints_name = 
('Pelvis', 'L_Hip', 'R_Hip', 'Spine_1', 'L_Knee', 'R_Knee', 'Spine_2', 'L_Ankle', 'R_Ankle', 'Spine_3', 'L_Foot', 'R_Foot', 'Neck', 'L_Collar', 'R_Collar', 'Head', 'L_Shoulder', 'R_Shoulder', 'L_Elbow', 'R_Elbow', 'L_Wrist', 'R_Wrist', 'L_Hand', 'R_Hand') + self.orig_flip_pairs = ( (1,2), (4,5), (7,8), (10,11), (13,14), (16,17), (18,19), (20,21), (22,23) ) + self.orig_root_joint_idx = self.orig_joints_name.index('Pelvis') + self.orig_joint_regressor = self.layer['neutral'].J_regressor.numpy().astype(np.float32) + + self.joint_num = self.orig_joint_num + self.joints_name = self.orig_joints_name + self.flip_pairs = self.orig_flip_pairs + self.root_joint_idx = self.orig_root_joint_idx + self.joint_regressor = self.orig_joint_regressor + +smpl_x = SMPLX() +smpl = SMPL() diff --git a/common/utils/inference_utils.py b/common/utils/inference_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3ccc515a3dcc24662005a4e6be0948024e98c995 --- /dev/null +++ b/common/utils/inference_utils.py @@ -0,0 +1,153 @@ +from typing import Literal, Union + +def process_mmdet_results(mmdet_results: list, + cat_id: int = 0, + multi_person: bool = True) -> list: + """Process mmdet results, sort bboxes by area in descending order. + + Args: + mmdet_results (list): + Result of mmdet.apis.inference_detector + when the input is a batch. + Shape of the nested lists is + (n_frame, n_category, n_human, 5). + cat_id (int, optional): + Category ID. This function will only select + the selected category, and drop the others. + Defaults to 0, ID of human category. + multi_person (bool, optional): + Whether to allow multi-person detection, which is + slower than single-person. If false, the function + only assure that the first person of each frame + has the biggest bbox. + Defaults to True. + + Returns: + list: + A list of detected bounding boxes. + Shape of the nested lists is + (n_frame, n_human, 5) + and each bbox is (x, y, x, y, score). + """ + ret_list = [] + only_max_arg = not multi_person + # for _, frame_results in enumerate(mmdet_results): + cat_bboxes = mmdet_results[cat_id] + # import pdb; pdb.set_trace() + sorted_bbox = qsort_bbox_list(cat_bboxes, only_max_arg) + + if only_max_arg: + ret_list.append(sorted_bbox[0:1]) + else: + ret_list.append(sorted_bbox) + return ret_list + + +def qsort_bbox_list(bbox_list: list, + only_max: bool = False, + bbox_convention: Literal['xyxy', 'xywh'] = 'xyxy'): + """Sort a list of bboxes, by their area in pixel(W*H). + + Args: + input_list (list): + A list of bboxes. Each item is a list of (x1, y1, x2, y2) + only_max (bool, optional): + If True, only assure the max element at first place, + others may not be well sorted. + If False, return a well sorted descending list. + Defaults to False. + bbox_convention (str, optional): + Bbox type, xyxy or xywh. Defaults to 'xyxy'. + + Returns: + list: + A sorted(maybe not so well) descending list. 
+ """ + # import pdb; pdb.set_trace() + if len(bbox_list) <= 1: + return bbox_list + else: + bigger_list = [] + less_list = [] + anchor_index = int(len(bbox_list) / 2) + anchor_bbox = bbox_list[anchor_index] + anchor_area = get_area_of_bbox(anchor_bbox, bbox_convention) + for i in range(len(bbox_list)): + if i == anchor_index: + continue + tmp_bbox = bbox_list[i] + tmp_area = get_area_of_bbox(tmp_bbox, bbox_convention) + if tmp_area >= anchor_area: + bigger_list.append(tmp_bbox) + else: + less_list.append(tmp_bbox) + if only_max: + return qsort_bbox_list(bigger_list) + \ + [anchor_bbox, ] + less_list + else: + return qsort_bbox_list(bigger_list) + \ + [anchor_bbox, ] + qsort_bbox_list(less_list) + +def get_area_of_bbox( + bbox: Union[list, tuple], + bbox_convention: Literal['xyxy', 'xywh'] = 'xyxy') -> float: + """Get the area of a bbox_xyxy. + + Args: + (Union[list, tuple]): + A list of [x1, y1, x2, y2]. + bbox_convention (str, optional): + Bbox type, xyxy or xywh. Defaults to 'xyxy'. + + Returns: + float: + Area of the bbox(|y2-y1|*|x2-x1|). + """ + # import pdb;pdb.set_trace() + if bbox_convention == 'xyxy': + return abs(bbox[2] - bbox[0]) * abs(bbox[3] - bbox[1]) + elif bbox_convention == 'xywh': + return abs(bbox[2] * bbox[3]) + else: + raise TypeError(f'Wrong bbox convention: {bbox_convention}') + +def calculate_iou(bbox1, bbox2): + # Calculate the Intersection over Union (IoU) between two bounding boxes + x1 = max(bbox1[0], bbox2[0]) + y1 = max(bbox1[1], bbox2[1]) + x2 = min(bbox1[2], bbox2[2]) + y2 = min(bbox1[3], bbox2[3]) + + intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1) + + bbox1_area = (bbox1[2] - bbox1[0] + 1) * (bbox1[3] - bbox1[1] + 1) + bbox2_area = (bbox2[2] - bbox2[0] + 1) * (bbox2[3] - bbox2[1] + 1) + + union_area = bbox1_area + bbox2_area - intersection_area + + iou = intersection_area / union_area + return iou + + +def non_max_suppression(bboxes, iou_threshold): + # Sort the bounding boxes by their confidence scores (e.g., the probability of containing an object) + bboxes = sorted(bboxes, key=lambda x: x[4], reverse=True) + + # Initialize a list to store the selected bounding boxes + selected_bboxes = [] + + # Perform non-maximum suppression + while len(bboxes) > 0: + current_bbox = bboxes[0] + selected_bboxes.append(current_bbox) + bboxes = bboxes[1:] + + remaining_bboxes = [] + for bbox in bboxes: + iou = calculate_iou(current_bbox, bbox) + if iou < iou_threshold: + remaining_bboxes.append(bbox) + + bboxes = remaining_bboxes + + return selected_bboxes \ No newline at end of file diff --git a/common/utils/preprocessing.py b/common/utils/preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..81d53b0d67fcc59d8c1684519eb8b29cf2bd59e1 --- /dev/null +++ b/common/utils/preprocessing.py @@ -0,0 +1,541 @@ +import numpy as np +import cv2 +import random +from config import cfg +import math +from utils.human_models import smpl_x, smpl +from utils.transforms import cam2pixel, transform_joint_to_other_db +from plyfile import PlyData, PlyElement +import torch + + +def load_img(path, order='RGB'): + img = cv2.imread(path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION) + if not isinstance(img, np.ndarray): + raise IOError("Fail to read %s" % path) + + if order == 'RGB': + img = img[:, :, ::-1].copy() + + img = img.astype(np.float32) + return img + + +def get_bbox(joint_img, joint_valid, extend_ratio=1.2): + x_img, y_img = joint_img[:, 0], joint_img[:, 1] + x_img = x_img[joint_valid == 1]; + y_img = y_img[joint_valid == 1]; + xmin = 
min(x_img); + ymin = min(y_img); + xmax = max(x_img); + ymax = max(y_img); + + x_center = (xmin + xmax) / 2.; + width = xmax - xmin; + xmin = x_center - 0.5 * width * extend_ratio + xmax = x_center + 0.5 * width * extend_ratio + + y_center = (ymin + ymax) / 2.; + height = ymax - ymin; + ymin = y_center - 0.5 * height * extend_ratio + ymax = y_center + 0.5 * height * extend_ratio + + bbox = np.array([xmin, ymin, xmax - xmin, ymax - ymin]).astype(np.float32) + return bbox + + +def sanitize_bbox(bbox, img_width, img_height): + x, y, w, h = bbox + x1 = np.max((0, x)) + y1 = np.max((0, y)) + x2 = np.min((img_width - 1, x1 + np.max((0, w - 1)))) + y2 = np.min((img_height - 1, y1 + np.max((0, h - 1)))) + if w * h > 0 and x2 > x1 and y2 > y1: + bbox = np.array([x1, y1, x2 - x1, y2 - y1]) + else: + bbox = None + + return bbox + + +def process_bbox(bbox, img_width, img_height, ratio=1.25): + bbox = sanitize_bbox(bbox, img_width, img_height) + if bbox is None: + return bbox + + # aspect ratio preserving bbox + w = bbox[2] + h = bbox[3] + c_x = bbox[0] + w / 2. + c_y = bbox[1] + h / 2. + aspect_ratio = cfg.input_img_shape[1] / cfg.input_img_shape[0] + if w > aspect_ratio * h: + h = w / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + bbox[2] = w * ratio + bbox[3] = h * ratio + bbox[0] = c_x - bbox[2] / 2. + bbox[1] = c_y - bbox[3] / 2. + + bbox = bbox.astype(np.float32) + return bbox + + +def get_aug_config(): + scale_factor = 0.25 + rot_factor = 30 + color_factor = 0.2 + + scale = np.clip(np.random.randn(), -1.0, 1.0) * scale_factor + 1.0 + rot = np.clip(np.random.randn(), -2.0, + 2.0) * rot_factor if random.random() <= 0.6 else 0 + c_up = 1.0 + color_factor + c_low = 1.0 - color_factor + color_scale = np.array([random.uniform(c_low, c_up), random.uniform(c_low, c_up), random.uniform(c_low, c_up)]) + do_flip = random.random() <= 0.5 + + return scale, rot, color_scale, do_flip + + +def augmentation(img, bbox, data_split): + if getattr(cfg, 'no_aug', False): + scale, rot, color_scale, do_flip = 1.0, 0.0, np.array([1, 1, 1]), False + elif data_split == 'train': + scale, rot, color_scale, do_flip = get_aug_config() + else: + scale, rot, color_scale, do_flip = 1.0, 0.0, np.array([1, 1, 1]), False + + img, trans, inv_trans = generate_patch_image(img, bbox, scale, rot, do_flip, cfg.input_img_shape) + img = np.clip(img * color_scale[None, None, :], 0, 255) + return img, trans, inv_trans, rot, do_flip + + +def generate_patch_image(cvimg, bbox, scale, rot, do_flip, out_shape): + img = cvimg.copy() + img_height, img_width, img_channels = img.shape + + bb_c_x = float(bbox[0] + 0.5 * bbox[2]) + bb_c_y = float(bbox[1] + 0.5 * bbox[3]) + bb_width = float(bbox[2]) + bb_height = float(bbox[3]) + + if do_flip: + img = img[:, ::-1, :] + bb_c_x = img_width - bb_c_x - 1 + + trans = gen_trans_from_patch_cv(bb_c_x, bb_c_y, bb_width, bb_height, out_shape[1], out_shape[0], scale, rot) + img_patch = cv2.warpAffine(img, trans, (int(out_shape[1]), int(out_shape[0])), flags=cv2.INTER_LINEAR) + img_patch = img_patch.astype(np.float32) + inv_trans = gen_trans_from_patch_cv(bb_c_x, bb_c_y, bb_width, bb_height, out_shape[1], out_shape[0], scale, rot, + inv=True) + + return img_patch, trans, inv_trans + + +def rotate_2d(pt_2d, rot_rad): + x = pt_2d[0] + y = pt_2d[1] + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + xx = x * cs - y * sn + yy = x * sn + y * cs + return np.array([xx, yy], dtype=np.float32) + + +def gen_trans_from_patch_cv(c_x, c_y, src_width, src_height, dst_width, dst_height, scale, rot, inv=False): + # 
augment size with scale + src_w = src_width * scale + src_h = src_height * scale + src_center = np.array([c_x, c_y], dtype=np.float32) + + # augment rotation + rot_rad = np.pi * rot / 180 + src_downdir = rotate_2d(np.array([0, src_h * 0.5], dtype=np.float32), rot_rad) + src_rightdir = rotate_2d(np.array([src_w * 0.5, 0], dtype=np.float32), rot_rad) + + dst_w = dst_width + dst_h = dst_height + dst_center = np.array([dst_w * 0.5, dst_h * 0.5], dtype=np.float32) + dst_downdir = np.array([0, dst_h * 0.5], dtype=np.float32) + dst_rightdir = np.array([dst_w * 0.5, 0], dtype=np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = src_center + src[1, :] = src_center + src_downdir + src[2, :] = src_center + src_rightdir + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = dst_center + dst[1, :] = dst_center + dst_downdir + dst[2, :] = dst_center + dst_rightdir + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + trans = trans.astype(np.float32) + return trans + + +def process_db_coord(joint_img, joint_cam, joint_valid, do_flip, img_shape, flip_pairs, img2bb_trans, rot, + src_joints_name, target_joints_name): + joint_img_original = joint_img.copy() + joint_img, joint_cam, joint_valid = joint_img.copy(), joint_cam.copy(), joint_valid.copy() + + # flip augmentation + if do_flip: + joint_cam[:, 0] = -joint_cam[:, 0] + joint_img[:, 0] = img_shape[1] - 1 - joint_img[:, 0] + for pair in flip_pairs: + joint_img[pair[0], :], joint_img[pair[1], :] = joint_img[pair[1], :].copy(), joint_img[pair[0], :].copy() + joint_cam[pair[0], :], joint_cam[pair[1], :] = joint_cam[pair[1], :].copy(), joint_cam[pair[0], :].copy() + joint_valid[pair[0], :], joint_valid[pair[1], :] = joint_valid[pair[1], :].copy(), joint_valid[pair[0], + :].copy() + + # 3D data rotation augmentation + rot_aug_mat = np.array([[np.cos(np.deg2rad(-rot)), -np.sin(np.deg2rad(-rot)), 0], + [np.sin(np.deg2rad(-rot)), np.cos(np.deg2rad(-rot)), 0], + [0, 0, 1]], dtype=np.float32) + joint_cam = np.dot(rot_aug_mat, joint_cam.transpose(1, 0)).transpose(1, 0) + + # affine transformation + joint_img_xy1 = np.concatenate((joint_img[:, :2], np.ones_like(joint_img[:, :1])), 1) + joint_img[:, :2] = np.dot(img2bb_trans, joint_img_xy1.transpose(1, 0)).transpose(1, 0) + joint_img[:, 0] = joint_img[:, 0] / cfg.input_img_shape[1] * cfg.output_hm_shape[2] + joint_img[:, 1] = joint_img[:, 1] / cfg.input_img_shape[0] * cfg.output_hm_shape[1] + + # check truncation + joint_trunc = joint_valid * ((joint_img_original[:, 0] > 0) * (joint_img[:, 0] >= 0) * (joint_img[:, 0] < cfg.output_hm_shape[2]) * \ + (joint_img_original[:, 1] > 0) *(joint_img[:, 1] >= 0) * (joint_img[:, 1] < cfg.output_hm_shape[1]) * \ + (joint_img_original[:, 2] > 0) *(joint_img[:, 2] >= 0) * (joint_img[:, 2] < cfg.output_hm_shape[0])).reshape(-1, + 1).astype( + np.float32) + + # transform joints to target db joints + joint_img = transform_joint_to_other_db(joint_img, src_joints_name, target_joints_name) + joint_cam_wo_ra = transform_joint_to_other_db(joint_cam, src_joints_name, target_joints_name) + joint_valid = transform_joint_to_other_db(joint_valid, src_joints_name, target_joints_name) + joint_trunc = transform_joint_to_other_db(joint_trunc, src_joints_name, target_joints_name) + + # root-alignment, for joint_cam input wo ra + joint_cam_ra = joint_cam_wo_ra.copy() + joint_cam_ra = joint_cam_ra - joint_cam_ra[smpl_x.root_joint_idx, None, :] # root-relative + 
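+    # the blocks below additionally express the hand and face joints relative to
+    # their own local roots (left/right wrist and neck), so each part's camera-space
+    # targets are centered on that part rather than on the pelvis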
joint_cam_ra[smpl_x.joint_part['lhand'], :] = joint_cam_ra[smpl_x.joint_part['lhand'], :] - joint_cam_ra[ + smpl_x.lwrist_idx, None, + :] # left hand root-relative + joint_cam_ra[smpl_x.joint_part['rhand'], :] = joint_cam_ra[smpl_x.joint_part['rhand'], :] - joint_cam_ra[ + smpl_x.rwrist_idx, None, + :] # right hand root-relative + joint_cam_ra[smpl_x.joint_part['face'], :] = joint_cam_ra[smpl_x.joint_part['face'], :] - joint_cam_ra[smpl_x.neck_idx, + None, + :] # face root-relative + + return joint_img, joint_cam_wo_ra, joint_cam_ra, joint_valid, joint_trunc + + +def process_human_model_output(human_model_param, cam_param, do_flip, img_shape, img2bb_trans, rot, human_model_type, joint_img=None): + if human_model_type == 'smplx': + human_model = smpl_x + rotation_valid = np.ones((smpl_x.orig_joint_num), dtype=np.float32) + coord_valid = np.ones((smpl_x.joint_num), dtype=np.float32) + + root_pose, body_pose, shape, trans = human_model_param['root_pose'], human_model_param['body_pose'], \ + human_model_param['shape'], human_model_param['trans'] + if 'lhand_pose' in human_model_param and human_model_param['lhand_valid']: + lhand_pose = human_model_param['lhand_pose'] + else: + lhand_pose = np.zeros((3 * len(smpl_x.orig_joint_part['lhand'])), dtype=np.float32) + rotation_valid[smpl_x.orig_joint_part['lhand']] = 0 + coord_valid[smpl_x.joint_part['lhand']] = 0 + if 'rhand_pose' in human_model_param and human_model_param['rhand_valid']: + rhand_pose = human_model_param['rhand_pose'] + else: + rhand_pose = np.zeros((3 * len(smpl_x.orig_joint_part['rhand'])), dtype=np.float32) + rotation_valid[smpl_x.orig_joint_part['rhand']] = 0 + coord_valid[smpl_x.joint_part['rhand']] = 0 + if 'jaw_pose' in human_model_param and 'expr' in human_model_param and human_model_param['face_valid']: + jaw_pose = human_model_param['jaw_pose'] + expr = human_model_param['expr'] + expr_valid = True + else: + jaw_pose = np.zeros((3), dtype=np.float32) + expr = np.zeros((smpl_x.expr_code_dim), dtype=np.float32) + rotation_valid[smpl_x.orig_joint_part['face']] = 0 + coord_valid[smpl_x.joint_part['face']] = 0 + expr_valid = False + if 'gender' in human_model_param: + gender = human_model_param['gender'] + else: + gender = 'neutral' + root_pose = torch.FloatTensor(root_pose).view(1, 3) # (1,3) + body_pose = torch.FloatTensor(body_pose).view(-1, 3) # (21,3) + lhand_pose = torch.FloatTensor(lhand_pose).view(-1, 3) # (15,3) + rhand_pose = torch.FloatTensor(rhand_pose).view(-1, 3) # (15,3) + jaw_pose = torch.FloatTensor(jaw_pose).view(-1, 3) # (1,3) + shape = torch.FloatTensor(shape).view(1, -1) # SMPLX shape parameter + expr = torch.FloatTensor(expr).view(1, -1) # SMPLX expression parameter + trans = torch.FloatTensor(trans).view(1, -1) # translation vector + + # apply camera extrinsic (rotation) + # merge root pose and camera rotation + if 'R' in cam_param: + R = np.array(cam_param['R'], dtype=np.float32).reshape(3, 3) + root_pose = root_pose.numpy() + root_pose, _ = cv2.Rodrigues(root_pose) + root_pose, _ = cv2.Rodrigues(np.dot(R, root_pose)) + root_pose = torch.from_numpy(root_pose).view(1, 3) + + # get mesh and joint coordinates + zero_pose = torch.zeros((1, 3)).float() # eye poses + with torch.no_grad(): + output = smpl_x.layer[gender](betas=shape, body_pose=body_pose.view(1, -1), global_orient=root_pose, + transl=trans, left_hand_pose=lhand_pose.view(1, -1), + right_hand_pose=rhand_pose.view(1, -1), jaw_pose=jaw_pose.view(1, -1), + leye_pose=zero_pose, reye_pose=zero_pose, expression=expr) + mesh_cam = 
output.vertices[0].numpy() + joint_cam = output.joints[0].numpy()[smpl_x.joint_idx, :] + + # apply camera exrinsic (translation) + # compenstate rotation (translation from origin to root joint was not cancled) + if 'R' in cam_param and 't' in cam_param: + R, t = np.array(cam_param['R'], dtype=np.float32).reshape(3, 3), np.array(cam_param['t'], + dtype=np.float32).reshape(1, 3) + root_cam = joint_cam[smpl_x.root_joint_idx, None, :] + joint_cam = joint_cam - root_cam + np.dot(R, root_cam.transpose(1, 0)).transpose(1, 0) + t + mesh_cam = mesh_cam - root_cam + np.dot(R, root_cam.transpose(1, 0)).transpose(1, 0) + t + + # concat root, body, two hands, and jaw pose + pose = torch.cat((root_pose, body_pose, lhand_pose, rhand_pose, jaw_pose)) + + # joint coordinates + if 'focal' not in cam_param or 'princpt' not in cam_param: + assert joint_img is not None + else: + joint_img = cam2pixel(joint_cam, cam_param['focal'], cam_param['princpt']) + + joint_img_original = joint_img.copy() + + joint_cam = joint_cam - joint_cam[smpl_x.root_joint_idx, None, :] # root-relative + joint_cam[smpl_x.joint_part['lhand'], :] = joint_cam[smpl_x.joint_part['lhand'], :] - joint_cam[ + smpl_x.lwrist_idx, None, + :] # left hand root-relative + joint_cam[smpl_x.joint_part['rhand'], :] = joint_cam[smpl_x.joint_part['rhand'], :] - joint_cam[ + smpl_x.rwrist_idx, None, + :] # right hand root-relative + joint_cam[smpl_x.joint_part['face'], :] = joint_cam[smpl_x.joint_part['face'], :] - joint_cam[smpl_x.neck_idx, + None, + :] # face root-relative + joint_img[smpl_x.joint_part['body'], 2] = (joint_cam[smpl_x.joint_part['body'], 2].copy() / ( + cfg.body_3d_size / 2) + 1) / 2. * cfg.output_hm_shape[0] # body depth discretize + joint_img[smpl_x.joint_part['lhand'], 2] = (joint_cam[smpl_x.joint_part['lhand'], 2].copy() / ( + cfg.hand_3d_size / 2) + 1) / 2. * cfg.output_hm_shape[0] # left hand depth discretize + joint_img[smpl_x.joint_part['rhand'], 2] = (joint_cam[smpl_x.joint_part['rhand'], 2].copy() / ( + cfg.hand_3d_size / 2) + 1) / 2. * cfg.output_hm_shape[0] # right hand depth discretize + joint_img[smpl_x.joint_part['face'], 2] = (joint_cam[smpl_x.joint_part['face'], 2].copy() / ( + cfg.face_3d_size / 2) + 1) / 2. 
* cfg.output_hm_shape[0] # face depth discretize + + elif human_model_type == 'smpl': + human_model = smpl + pose, shape, trans = human_model_param['pose'], human_model_param['shape'], human_model_param['trans'] + if 'gender' in human_model_param: + gender = human_model_param['gender'] + else: + gender = 'neutral' + pose = torch.FloatTensor(pose).view(-1, 3) + shape = torch.FloatTensor(shape).view(1, -1); + trans = torch.FloatTensor(trans).view(1, -1) # translation vector + + # apply camera extrinsic (rotation) + # merge root pose and camera rotation + if 'R' in cam_param: + R = np.array(cam_param['R'], dtype=np.float32).reshape(3, 3) + root_pose = pose[smpl.orig_root_joint_idx, :].numpy() + root_pose, _ = cv2.Rodrigues(root_pose) + root_pose, _ = cv2.Rodrigues(np.dot(R, root_pose)) + pose[smpl.orig_root_joint_idx] = torch.from_numpy(root_pose).view(3) + + # get mesh and joint coordinates + root_pose = pose[smpl.orig_root_joint_idx].view(1, 3) + body_pose = torch.cat((pose[:smpl.orig_root_joint_idx, :], pose[smpl.orig_root_joint_idx + 1:, :])).view(1, -1) + with torch.no_grad(): + output = smpl.layer[gender](betas=shape, body_pose=body_pose, global_orient=root_pose, transl=trans) + mesh_cam = output.vertices[0].numpy() + joint_cam = np.dot(smpl.joint_regressor, mesh_cam) + + # apply camera exrinsic (translation) + # compenstate rotation (translation from origin to root joint was not cancled) + if 'R' in cam_param and 't' in cam_param: + R, t = np.array(cam_param['R'], dtype=np.float32).reshape(3, 3), np.array(cam_param['t'], + dtype=np.float32).reshape(1, 3) + root_cam = joint_cam[smpl.root_joint_idx, None, :] + joint_cam = joint_cam - root_cam + np.dot(R, root_cam.transpose(1, 0)).transpose(1, 0) + t + mesh_cam = mesh_cam - root_cam + np.dot(R, root_cam.transpose(1, 0)).transpose(1, 0) + t + + # joint coordinates + if 'focal' not in cam_param or 'princpt' not in cam_param: + assert joint_img is not None + else: + joint_img = cam2pixel(joint_cam, cam_param['focal'], cam_param['princpt']) + + joint_img_original = joint_img.copy() + joint_cam = joint_cam - joint_cam[smpl.root_joint_idx, None, :] # body root-relative + joint_img[:, 2] = (joint_cam[:, 2].copy() / (cfg.body_3d_size / 2) + 1) / 2. 
* cfg.output_hm_shape[ + 0] # body depth discretize + + elif human_model_type == 'mano': + human_model = mano + pose, shape, trans = human_model_param['pose'], human_model_param['shape'], human_model_param['trans'] + hand_type = human_model_param['hand_type'] + pose = torch.FloatTensor(pose).view(-1, 3) + shape = torch.FloatTensor(shape).view(1, -1); + trans = torch.FloatTensor(trans).view(1, -1) # translation vector + + # apply camera extrinsic (rotation) + # merge root pose and camera rotation + if 'R' in cam_param: + R = np.array(cam_param['R'], dtype=np.float32).reshape(3, 3) + root_pose = pose[mano.orig_root_joint_idx, :].numpy() + root_pose, _ = cv2.Rodrigues(root_pose) + root_pose, _ = cv2.Rodrigues(np.dot(R, root_pose)) + pose[mano.orig_root_joint_idx] = torch.from_numpy(root_pose).view(3) + + # get mesh and joint coordinates + root_pose = pose[mano.orig_root_joint_idx].view(1, 3) + hand_pose = torch.cat((pose[:mano.orig_root_joint_idx, :], pose[mano.orig_root_joint_idx + 1:, :])).view(1, -1) + with torch.no_grad(): + output = mano.layer[hand_type](betas=shape, hand_pose=hand_pose, global_orient=root_pose, transl=trans) + mesh_cam = output.vertices[0].numpy() + joint_cam = np.dot(mano.joint_regressor, mesh_cam) + + # apply camera exrinsic (translation) + # compenstate rotation (translation from origin to root joint was not cancled) + if 'R' in cam_param and 't' in cam_param: + R, t = np.array(cam_param['R'], dtype=np.float32).reshape(3, 3), np.array(cam_param['t'], + dtype=np.float32).reshape(1, 3) + root_cam = joint_cam[mano.root_joint_idx, None, :] + joint_cam = joint_cam - root_cam + np.dot(R, root_cam.transpose(1, 0)).transpose(1, 0) + t + mesh_cam = mesh_cam - root_cam + np.dot(R, root_cam.transpose(1, 0)).transpose(1, 0) + t + + # joint coordinates + if 'focal' not in cam_param or 'princpt' not in cam_param: + assert joint_img is not None + else: + joint_img = cam2pixel(joint_cam, cam_param['focal'], cam_param['princpt']) + joint_cam = joint_cam - joint_cam[mano.root_joint_idx, None, :] # hand root-relative + joint_img[:, 2] = (joint_cam[:, 2].copy() / (cfg.hand_3d_size / 2) + 1) / 2. 
* cfg.output_hm_shape[ + 0] # hand depth discretize + + mesh_cam_orig = mesh_cam.copy() # back-up the original one + + ## so far, data augmentations are not applied yet + ## now, apply data augmentations + + # image projection + if do_flip: + joint_cam[:, 0] = -joint_cam[:, 0] + joint_img[:, 0] = img_shape[1] - 1 - joint_img[:, 0] + for pair in human_model.flip_pairs: + joint_cam[pair[0], :], joint_cam[pair[1], :] = joint_cam[pair[1], :].copy(), joint_cam[pair[0], :].copy() + joint_img[pair[0], :], joint_img[pair[1], :] = joint_img[pair[1], :].copy(), joint_img[pair[0], :].copy() + if human_model_type == 'smplx': + coord_valid[pair[0]], coord_valid[pair[1]] = coord_valid[pair[1]].copy(), coord_valid[pair[0]].copy() + + # x,y affine transform, root-relative depth + joint_img_xy1 = np.concatenate((joint_img[:, :2], np.ones_like(joint_img[:, 0:1])), 1) + joint_img[:, :2] = np.dot(img2bb_trans, joint_img_xy1.transpose(1, 0)).transpose(1, 0)[:, :2] + joint_img[:, 0] = joint_img[:, 0] / cfg.input_img_shape[1] * cfg.output_hm_shape[2] + joint_img[:, 1] = joint_img[:, 1] / cfg.input_img_shape[0] * cfg.output_hm_shape[1] + + # check truncation + # TODO + joint_trunc = ((joint_img_original[:, 0] > 0) * (joint_img[:, 0] >= 0) * (joint_img[:, 0] < cfg.output_hm_shape[2]) * \ + (joint_img_original[:, 1] > 0) * (joint_img[:, 1] >= 0) * (joint_img[:, 1] < cfg.output_hm_shape[1]) * \ + (joint_img_original[:, 2] > 0) * (joint_img[:, 2] >= 0) * (joint_img[:, 2] < cfg.output_hm_shape[0])).reshape(-1, 1).astype( + np.float32) + + # 3D data rotation augmentation + rot_aug_mat = np.array([[np.cos(np.deg2rad(-rot)), -np.sin(np.deg2rad(-rot)), 0], + [np.sin(np.deg2rad(-rot)), np.cos(np.deg2rad(-rot)), 0], + [0, 0, 1]], dtype=np.float32) + # coordinate + joint_cam = np.dot(rot_aug_mat, joint_cam.transpose(1, 0)).transpose(1, 0) + # parameters + # flip pose parameter (axis-angle) + if do_flip: + for pair in human_model.orig_flip_pairs: + pose[pair[0], :], pose[pair[1], :] = pose[pair[1], :].clone(), pose[pair[0], :].clone() + if human_model_type == 'smplx': + rotation_valid[pair[0]], rotation_valid[pair[1]] = rotation_valid[pair[1]].copy(), rotation_valid[ + pair[0]].copy() + pose[:, 1:3] *= -1 # multiply -1 to y and z axis of axis-angle + + # rotate root pose + pose = pose.numpy() + root_pose = pose[human_model.orig_root_joint_idx, :] + root_pose, _ = cv2.Rodrigues(root_pose) + root_pose, _ = cv2.Rodrigues(np.dot(rot_aug_mat, root_pose)) + pose[human_model.orig_root_joint_idx] = root_pose.reshape(3) + + # change to mean shape if beta is too far from it + shape[(shape.abs() > 3).any(dim=1)] = 0. 
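+    # betas are PCA shape coefficients measured roughly in standard deviations of the
+    # shape space, so any sample whose |beta| exceeds 3 is treated above as an
+    # unreliable fit and reset to the mean shape (all zeros)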
+ shape = shape.numpy().reshape(-1) + + # return results + if human_model_type == 'smplx': + pose = pose.reshape(-1) + expr = expr.numpy().reshape(-1) + + return joint_img, joint_cam, joint_trunc, pose, shape, expr, rotation_valid, coord_valid, expr_valid, mesh_cam_orig + elif human_model_type == 'smpl': + pose = pose.reshape(-1) + return joint_img, joint_cam, joint_trunc, pose, shape, mesh_cam_orig + elif human_model_type == 'mano': + pose = pose.reshape(-1) + return joint_img, joint_cam, joint_trunc, pose, shape, mesh_cam_orig + + +def get_fitting_error_3D(db_joint, db_joint_from_fit, joint_valid): + # mask coordinate + db_joint = db_joint[np.tile(joint_valid, (1, 3)) == 1].reshape(-1, 3) + db_joint_from_fit = db_joint_from_fit[np.tile(joint_valid, (1, 3)) == 1].reshape(-1, 3) + + db_joint_from_fit = db_joint_from_fit - np.mean(db_joint_from_fit, 0)[None, :] + np.mean(db_joint, 0)[None, + :] # translation alignment + error = np.sqrt(np.sum((db_joint - db_joint_from_fit) ** 2, 1)).mean() + return error + + +def load_obj(file_name): + v = [] + obj_file = open(file_name) + for line in obj_file: + words = line.split(' ') + if words[0] == 'v': + x, y, z = float(words[1]), float(words[2]), float(words[3]) + v.append(np.array([x, y, z])) + return np.stack(v) + + +def load_ply(file_name): + plydata = PlyData.read(file_name) + x = plydata['vertex']['x'] + y = plydata['vertex']['y'] + z = plydata['vertex']['z'] + v = np.stack((x, y, z), 1) + return v + +def resize_bbox(bbox, scale=1.2): + if isinstance(bbox, list): + x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3] + else: + x1, y1, x2, y2 = bbox + x_center = (x1+x2)/2.0 + y_center = (y1+y2)/2.0 + x_size, y_size = x2-x1, y2-y1 + x1_resize = x_center-x_size/2.0*scale + x2_resize = x_center+x_size/2.0*scale + y1_resize = y_center - y_size / 2.0 * scale + y2_resize = y_center + y_size / 2.0 * scale + bbox[0], bbox[1], bbox[2], bbox[3] = x1_resize, y1_resize, x2_resize, y2_resize + return bbox \ No newline at end of file diff --git a/common/utils/smplx/LICENSE b/common/utils/smplx/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..3034a97b164d6e006655493e950314ec58e200cd --- /dev/null +++ b/common/utils/smplx/LICENSE @@ -0,0 +1,58 @@ +License + +Software Copyright License for non-commercial scientific research purposes +Please read carefully the following terms and conditions and any accompanying documentation before you download and/or use the SMPL-X/SMPLify-X model, data and software, (the "Model & Software"), including 3D meshes, blend weights, blend shapes, textures, software, scripts, and animations. By downloading and/or using the Model & Software (including downloading, cloning, installing, and any other use of this github repository), you acknowledge that you have read these terms and conditions, understand them, and agree to be bound by them. If you do not agree with these terms and conditions, you must not download and/or use the Model & Software. Any infringement of the terms of this agreement will automatically terminate your rights under this License + +Ownership / Licensees +The Software and the associated materials has been developed at the + +Max Planck Institute for Intelligent Systems (hereinafter "MPI"). + +Any copyright or patent right is owned by and proprietary material of the + +Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (hereinafter “MPG”; MPI and MPG hereinafter collectively “Max-Planck”) + +hereinafter the “Licensor”. 
+ +License Grant +Licensor grants you (Licensee) personally a single-user, non-exclusive, non-transferable, free of charge right: + +To install the Model & Software on computers owned, leased or otherwise controlled by you and/or your organization; +To use the Model & Software for the sole purpose of performing non-commercial scientific research, non-commercial education, or non-commercial artistic projects; +Any other use, in particular any use for commercial purposes, is prohibited. This includes, without limitation, incorporation in a commercial product, use in a commercial service, or production of other artifacts for commercial purposes. The Model & Software may not be reproduced, modified and/or made available in any form to any third party without Max-Planck’s prior written permission. + +The Model & Software may not be used for pornographic purposes or to generate pornographic material whether commercial or not. This license also prohibits the use of the Model & Software to train methods/algorithms/neural networks/etc. for commercial use of any kind. By downloading the Model & Software, you agree not to reverse engineer it. + +No Distribution +The Model & Software and the license herein granted shall not be copied, shared, distributed, re-sold, offered for re-sale, transferred or sub-licensed in whole or in part except that you may make one copy for archive purposes only. + +Disclaimer of Representations and Warranties +You expressly acknowledge and agree that the Model & Software results from basic research, is provided “AS IS”, may contain errors, and that any use of the Model & Software is at your sole risk. LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE MODEL & SOFTWARE, NEITHER EXPRESS NOR IMPLIED, AND THE ABSENCE OF ANY LEGAL OR ACTUAL DEFECTS, WHETHER DISCOVERABLE OR NOT. Specifically, and not to limit the foregoing, licensor makes no representations or warranties (i) regarding the merchantability or fitness for a particular purpose of the Model & Software, (ii) that the use of the Model & Software will not infringe any patents, copyrights or other intellectual property rights of a third party, and (iii) that the use of the Model & Software will not cause any damage of any kind to you or a third party. + +Limitation of Liability +Because this Model & Software License Agreement qualifies as a donation, according to Section 521 of the German Civil Code (Bürgerliches Gesetzbuch – BGB) Licensor as a donor is liable for intent and gross negligence only. If the Licensor fraudulently conceals a legal or material defect, they are obliged to compensate the Licensee for the resulting damage. +Licensor shall be liable for loss of data only up to the amount of typical recovery costs which would have arisen had proper and regular data backup measures been taken. For the avoidance of doubt Licensor shall be liable in accordance with the German Product Liability Act in the event of product liability. The foregoing applies also to Licensor’s legal representatives or assistants in performance. Any further liability shall be excluded. +Patent claims generated through the usage of the Model & Software cannot be directed towards the copyright holders. +The Model & Software is provided in the state of development the licensor defines. If modified or extended by Licensee, the Licensor makes no claims about the fitness of the Model & Software and is not responsible for any problems such modifications cause. 
+ +No Maintenance Services +You understand and agree that Licensor is under no obligation to provide either maintenance services, update services, notices of latent defects, or corrections of defects with regard to the Model & Software. Licensor nevertheless reserves the right to update, modify, or discontinue the Model & Software at any time. + +Defects of the Model & Software must be notified in writing to the Licensor with a comprehensible description of the error symptoms. The notification of the defect should enable the reproduction of the error. The Licensee is encouraged to communicate any use, results, modification or publication. + +Publications using the Model & Software +You acknowledge that the Model & Software is a valuable scientific resource and agree to appropriately reference the following paper in any publication making use of the Model & Software. + +Citation: + + +@inproceedings{SMPL-X:2019, + title = {Expressive Body Capture: 3D Hands, Face, and Body from a Single Image}, + author = {Pavlakos, Georgios and Choutas, Vasileios and Ghorbani, Nima and Bolkart, Timo and Osman, Ahmed A. A. and Tzionas, Dimitrios and Black, Michael J.}, + booktitle = {Proceedings IEEE Conf. on Computer Vision and Pattern Recognition (CVPR)}, + year = {2019} +} +Commercial licensing opportunities +For commercial uses of the Software, please send email to ps-license@tue.mpg.de + +This Agreement shall be governed by the laws of the Federal Republic of Germany except for the UN Sales Convention. diff --git a/common/utils/smplx/README.md b/common/utils/smplx/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb2df07aae5c116d23056b53160505316d72ea5f --- /dev/null +++ b/common/utils/smplx/README.md @@ -0,0 +1,186 @@ +## SMPL-X: A new joint 3D model of the human body, face and hands together + +[[Paper Page](https://smpl-x.is.tue.mpg.de)] [[Paper](https://ps.is.tuebingen.mpg.de/uploads_file/attachment/attachment/497/SMPL-X.pdf)] +[[Supp. Mat.](https://ps.is.tuebingen.mpg.de/uploads_file/attachment/attachment/498/SMPL-X-supp.pdf)] + +![SMPL-X Examples](./images/teaser_fig.png) + +## Table of Contents + * [License](#license) + * [Description](#description) + * [Installation](#installation) + * [Downloading the model](#downloading-the-model) + * [Loading SMPL-X, SMPL+H and SMPL](#loading-smpl-x-smplh-and-smpl) + * [SMPL and SMPL+H setup](#smpl-and-smplh-setup) + * [Model loading](https://github.com/vchoutas/smplx#model-loading) + * [MANO and FLAME correspondences](#mano-and-flame-correspondences) + * [Example](#example) + * [Citation](#citation) + * [Acknowledgments](#acknowledgments) + * [Contact](#contact) + +## License + +Software Copyright License for **non-commercial scientific research purposes**. +Please read carefully the [terms and conditions](https://github.com/vchoutas/smplx/blob/master/LICENSE) and any accompanying documentation before you download and/or use the SMPL-X/SMPLify-X model, data and software, (the "Model & Software"), including 3D meshes, blend weights, blend shapes, textures, software, scripts, and animations. By downloading and/or using the Model & Software (including downloading, cloning, installing, and any other use of this github repository), you acknowledge that you have read these terms and conditions, understand them, and agree to be bound by them. If you do not agree with these terms and conditions, you must not download and/or use the Model & Software. 
Any infringement of the terms of this agreement will automatically terminate your rights under this [License](./LICENSE). + +## Disclaimer + +The original images used for the figures 1 and 2 of the paper can be found in this link. +The images in the paper are used under license from gettyimages.com. +We have acquired the right to use them in the publication, but redistribution is not allowed. +Please follow the instructions on the given link to acquire right of usage. +Our results are obtained on the 483 × 724 pixels resolution of the original images. + +## Description + +*SMPL-X* (SMPL eXpressive) is a unified body model with shape parameters trained jointly for the +face, hands and body. *SMPL-X* uses standard vertex based linear blend skinning with learned corrective blend +shapes, has N = 10, 475 vertices and K = 54 joints, +which include joints for the neck, jaw, eyeballs and fingers. +SMPL-X is defined by a function M(θ, β, ψ), where θ is the pose parameters, β the shape parameters and +ψ the facial expression parameters. + + +## Installation + +To install the model please follow the next steps in the specified order: +1. To install from PyPi simply run: + ```Shell + pip install smplx[all] + ``` +2. Clone this repository and install it using the *setup.py* script: +```Shell +git clone https://github.com/vchoutas/smplx +python setup.py install +``` + +## Downloading the model + +To download the *SMPL-X* model go to [this project website](https://smpl-x.is.tue.mpg.de) and register to get access to the downloads section. + +To download the *SMPL+H* model go to [this project website](http://mano.is.tue.mpg.de) and register to get access to the downloads section. + +To download the *SMPL* model go to [this](http://smpl.is.tue.mpg.de) (male and female models) and [this](http://smplify.is.tue.mpg.de) (gender neutral model) project website and register to get access to the downloads section. + +## Loading SMPL-X, SMPL+H and SMPL + +### SMPL and SMPL+H setup + +The loader gives the option to use any of the SMPL-X, SMPL+H, SMPL, and MANO models. Depending on the model you want to use, please follow the respective download instructions. To switch between MANO, SMPL, SMPL+H and SMPL-X just change the *model_path* or *model_type* parameters. For more details please check the docs of the model classes. +Before using SMPL and SMPL+H you should follow the instructions in [tools/README.md](./tools/README.md) to remove the +Chumpy objects from both model pkls, as well as merge the MANO parameters with SMPL+H. + +### Model loading + +You can either use the [create](https://github.com/vchoutas/smplx/blob/c63c02b478c5c6f696491ed9167e3af6b08d89b1/smplx/body_models.py#L54) +function from [body_models](./smplx/body_models.py) or directly call the constructor for the +[SMPL](https://github.com/vchoutas/smplx/blob/c63c02b478c5c6f696491ed9167e3af6b08d89b1/smplx/body_models.py#L106), +[SMPL+H](https://github.com/vchoutas/smplx/blob/c63c02b478c5c6f696491ed9167e3af6b08d89b1/smplx/body_models.py#L395) and +[SMPL-X](https://github.com/vchoutas/smplx/blob/c63c02b478c5c6f696491ed9167e3af6b08d89b1/smplx/body_models.py#L628) model. 
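+
+For example, a minimal sketch using the `create` function (the `models` folder below is a placeholder; the model files must already be downloaded as described above):
+
+```python
+import torch
+import smplx
+
+# build a gender-neutral SMPL-X model from a local model folder (placeholder path)
+model = smplx.create('models', model_type='smplx', gender='neutral', ext='npz')
+
+# zero coefficients correspond to the mean shape and a neutral expression
+betas = torch.zeros([1, model.num_betas], dtype=torch.float32)
+expression = torch.zeros([1, model.num_expression_coeffs], dtype=torch.float32)
+
+output = model(betas=betas, expression=expression, return_verts=True)
+print(output.vertices.shape)  # torch.Size([1, 10475, 3])
+print(output.joints.shape)
+```
+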
The path to the model can either be the path to the file with the parameters or a directory with the following structure: +```bash +models +├── smpl +│   ├── SMPL_FEMALE.pkl +│   └── SMPL_MALE.pkl +│   └── SMPL_NEUTRAL.pkl +├── smplh +│   ├── SMPLH_FEMALE.pkl +│   └── SMPLH_MALE.pkl +├── mano +| ├── MANO_RIGHT.pkl +| └── MANO_LEFT.pkl +└── smplx + ├── SMPLX_FEMALE.npz + ├── SMPLX_FEMALE.pkl + ├── SMPLX_MALE.npz + ├── SMPLX_MALE.pkl + ├── SMPLX_NEUTRAL.npz + └── SMPLX_NEUTRAL.pkl +``` + + +## MANO and FLAME correspondences + +The vertex correspondences between SMPL-X and MANO, FLAME can be downloaded +from [the project website](https://smpl-x.is.tue.mpg.de). If you have extracted +the correspondence data in the folder *correspondences*, then use the following +scripts to visualize them: + +1. To view MANO correspondences run the following command: + +``` +python examples/vis_mano_vertices.py --model-folder $SMPLX_FOLDER --corr-fname correspondences/MANO_SMPLX_vertex_ids.pkl +``` + +2. To view FLAME correspondences run the following command: + +``` +python examples/vis_flame_vertices.py --model-folder $SMPLX_FOLDER --corr-fname correspondences/SMPL-X__FLAME_vertex_ids.npy +``` + +## Example + +After installing the *smplx* package and downloading the model parameters you should be able to run the *demo.py* +script to visualize the results. For this step you have to install the [pyrender](https://pyrender.readthedocs.io/en/latest/index.html) and [trimesh](https://trimsh.org/) packages. + +`python examples/demo.py --model-folder $SMPLX_FOLDER --plot-joints=True --gender="neutral"` + +![SMPL-X Examples](./images/example.png) + +## Citation + +Depending on which model is loaded for your project, i.e. SMPL-X or SMPL+H or SMPL, please cite the most relevant work below, listed in the same order: + +``` +@inproceedings{SMPL-X:2019, + title = {Expressive Body Capture: 3D Hands, Face, and Body from a Single Image}, + author = {Pavlakos, Georgios and Choutas, Vasileios and Ghorbani, Nima and Bolkart, Timo and Osman, Ahmed A. A. and Tzionas, Dimitrios and Black, Michael J.}, + booktitle = {Proceedings IEEE Conf. on Computer Vision and Pattern Recognition (CVPR)}, + year = {2019} +} +``` + +``` +@article{MANO:SIGGRAPHASIA:2017, + title = {Embodied Hands: Modeling and Capturing Hands and Bodies Together}, + author = {Romero, Javier and Tzionas, Dimitrios and Black, Michael J.}, + journal = {ACM Transactions on Graphics, (Proc. SIGGRAPH Asia)}, + volume = {36}, + number = {6}, + series = {245:1--245:17}, + month = nov, + year = {2017}, + month_numeric = {11} + } +``` + +``` +@article{SMPL:2015, + author = {Loper, Matthew and Mahmood, Naureen and Romero, Javier and Pons-Moll, Gerard and Black, Michael J.}, + title = {{SMPL}: A Skinned Multi-Person Linear Model}, + journal = {ACM Transactions on Graphics, (Proc. SIGGRAPH Asia)}, + month = oct, + number = {6}, + pages = {248:1--248:16}, + publisher = {ACM}, + volume = {34}, + year = {2015} +} +``` + +This repository was originally developed for SMPL-X / SMPLify-X (CVPR 2019), you might be interested in having a look: [https://smpl-x.is.tue.mpg.de](https://smpl-x.is.tue.mpg.de). + +## Acknowledgments + +### Facial Contour + +Special thanks to [Soubhik Sanyal](https://github.com/soubhiksanyal) for sharing the Tensorflow code used for the facial +landmarks. + +## Contact +The code of this repository was implemented by [Vassilis Choutas](vassilis.choutas@tuebingen.mpg.de). + +For questions, please contact [smplx@tue.mpg.de](smplx@tue.mpg.de). 
+ +For commercial licensing (and all related questions for business applications), please contact [ps-licensing@tue.mpg.de](ps-licensing@tue.mpg.de). diff --git a/common/utils/smplx/examples/demo.py b/common/utils/smplx/examples/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..7a6fd5024f4ac05d9f5db336b769d84836b51c18 --- /dev/null +++ b/common/utils/smplx/examples/demo.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import os.path as osp +import argparse + +import numpy as np +import torch + +import smplx + + +def main(model_folder, + model_type='smplx', + ext='npz', + gender='neutral', + plot_joints=False, + num_betas=10, + sample_shape=True, + sample_expression=True, + num_expression_coeffs=10, + plotting_module='pyrender', + use_face_contour=False): + + model = smplx.create(model_folder, model_type=model_type, + gender=gender, use_face_contour=use_face_contour, + num_betas=num_betas, + num_expression_coeffs=num_expression_coeffs, + ext=ext) + print(model) + + betas, expression = None, None + if sample_shape: + betas = torch.randn([1, model.num_betas], dtype=torch.float32) + if sample_expression: + expression = torch.randn( + [1, model.num_expression_coeffs], dtype=torch.float32) + + output = model(betas=betas, expression=expression, + return_verts=True) + vertices = output.vertices.detach().cpu().numpy().squeeze() + joints = output.joints.detach().cpu().numpy().squeeze() + + print('Vertices shape =', vertices.shape) + print('Joints shape =', joints.shape) + + if plotting_module == 'pyrender': + import pyrender + import trimesh + vertex_colors = np.ones([vertices.shape[0], 4]) * [0.3, 0.3, 0.3, 0.8] + tri_mesh = trimesh.Trimesh(vertices, model.faces, + vertex_colors=vertex_colors) + + mesh = pyrender.Mesh.from_trimesh(tri_mesh) + + scene = pyrender.Scene() + scene.add(mesh) + + if plot_joints: + sm = trimesh.creation.uv_sphere(radius=0.005) + sm.visual.vertex_colors = [0.9, 0.1, 0.1, 1.0] + tfs = np.tile(np.eye(4), (len(joints), 1, 1)) + tfs[:, :3, 3] = joints + joints_pcl = pyrender.Mesh.from_trimesh(sm, poses=tfs) + scene.add(joints_pcl) + + pyrender.Viewer(scene, use_raymond_lighting=True) + elif plotting_module == 'matplotlib': + from matplotlib import pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + from mpl_toolkits.mplot3d.art3d import Poly3DCollection + + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + mesh = Poly3DCollection(vertices[model.faces], alpha=0.1) + face_color = (1.0, 1.0, 0.9) + edge_color = (0, 0, 0) + mesh.set_edgecolor(edge_color) + mesh.set_facecolor(face_color) + ax.add_collection3d(mesh) + ax.scatter(joints[:, 0], joints[:, 1], joints[:, 2], color='r') + + if plot_joints: + ax.scatter(joints[:, 0], joints[:, 1], joints[:, 2], alpha=0.1) + plt.show() + elif plotting_module == 'open3d': + import open3d as o3d + + mesh = o3d.geometry.TriangleMesh() 
+ mesh.vertices = o3d.utility.Vector3dVector( + vertices) + mesh.triangles = o3d.utility.Vector3iVector(model.faces) + mesh.compute_vertex_normals() + mesh.paint_uniform_color([0.3, 0.3, 0.3]) + + geometry = [mesh] + if plot_joints: + joints_pcl = o3d.geometry.PointCloud() + joints_pcl.points = o3d.utility.Vector3dVector(joints) + joints_pcl.paint_uniform_color([0.7, 0.3, 0.3]) + geometry.append(joints_pcl) + + o3d.visualization.draw_geometries(geometry) + else: + raise ValueError('Unknown plotting_module: {}'.format(plotting_module)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='SMPL-X Demo') + + parser.add_argument('--model-folder', required=True, type=str, + help='The path to the model folder') + parser.add_argument('--model-type', default='smplx', type=str, + choices=['smpl', 'smplh', 'smplx', 'mano', 'flame'], + help='The type of model to load') + parser.add_argument('--gender', type=str, default='neutral', + help='The gender of the model') + parser.add_argument('--num-betas', default=10, type=int, + dest='num_betas', + help='Number of shape coefficients.') + parser.add_argument('--num-expression-coeffs', default=10, type=int, + dest='num_expression_coeffs', + help='Number of expression coefficients.') + parser.add_argument('--plotting-module', type=str, default='pyrender', + dest='plotting_module', + choices=['pyrender', 'matplotlib', 'open3d'], + help='The module to use for plotting the result') + parser.add_argument('--ext', type=str, default='npz', + help='Which extension to use for loading') + parser.add_argument('--plot-joints', default=False, + type=lambda arg: arg.lower() in ['true', '1'], + help='The path to the model folder') + parser.add_argument('--sample-shape', default=True, + dest='sample_shape', + type=lambda arg: arg.lower() in ['true', '1'], + help='Sample a random shape') + parser.add_argument('--sample-expression', default=True, + dest='sample_expression', + type=lambda arg: arg.lower() in ['true', '1'], + help='Sample a random expression') + parser.add_argument('--use-face-contour', default=False, + type=lambda arg: arg.lower() in ['true', '1'], + help='Compute the contour of the face') + + args = parser.parse_args() + + model_folder = osp.expanduser(osp.expandvars(args.model_folder)) + model_type = args.model_type + plot_joints = args.plot_joints + use_face_contour = args.use_face_contour + gender = args.gender + ext = args.ext + plotting_module = args.plotting_module + num_betas = args.num_betas + num_expression_coeffs = args.num_expression_coeffs + sample_shape = args.sample_shape + sample_expression = args.sample_expression + + main(model_folder, model_type, ext=ext, + gender=gender, plot_joints=plot_joints, + num_betas=num_betas, + num_expression_coeffs=num_expression_coeffs, + sample_shape=sample_shape, + sample_expression=sample_expression, + plotting_module=plotting_module, + use_face_contour=use_face_contour) diff --git a/common/utils/smplx/examples/demo_layers.py b/common/utils/smplx/examples/demo_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..8d4e18226c02a6c06c5158dc66276598ba96163a --- /dev/null +++ b/common/utils/smplx/examples/demo_layers.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. 
+# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import os.path as osp +import argparse + +import numpy as np +import torch + +import smplx + + +def main(model_folder, + model_type='smplx', + ext='npz', + gender='neutral', + plot_joints=False, + num_betas=10, + sample_shape=True, + sample_expression=True, + num_expression_coeffs=10, + plotting_module='pyrender', + use_face_contour=False): + + model = smplx.build_layer( + model_folder, model_type=model_type, + gender=gender, use_face_contour=use_face_contour, + num_betas=num_betas, + num_expression_coeffs=num_expression_coeffs, + ext=ext) + print(model) + + betas, expression = None, None + if sample_shape: + betas = torch.randn([1, model.num_betas], dtype=torch.float32) + if sample_expression: + expression = torch.randn( + [1, model.num_expression_coeffs], dtype=torch.float32) + + output = model(betas=betas, expression=expression, + return_verts=True) + vertices = output.vertices.detach().cpu().numpy().squeeze() + joints = output.joints.detach().cpu().numpy().squeeze() + + print('Vertices shape =', vertices.shape) + print('Joints shape =', joints.shape) + + if plotting_module == 'pyrender': + import pyrender + import trimesh + vertex_colors = np.ones([vertices.shape[0], 4]) * [0.3, 0.3, 0.3, 0.8] + tri_mesh = trimesh.Trimesh(vertices, model.faces, + vertex_colors=vertex_colors) + + mesh = pyrender.Mesh.from_trimesh(tri_mesh) + + scene = pyrender.Scene() + scene.add(mesh) + + if plot_joints: + sm = trimesh.creation.uv_sphere(radius=0.005) + sm.visual.vertex_colors = [0.9, 0.1, 0.1, 1.0] + tfs = np.tile(np.eye(4), (len(joints), 1, 1)) + tfs[:, :3, 3] = joints + joints_pcl = pyrender.Mesh.from_trimesh(sm, poses=tfs) + scene.add(joints_pcl) + + pyrender.Viewer(scene, use_raymond_lighting=True) + elif plotting_module == 'matplotlib': + from matplotlib import pyplot as plt + from mpl_toolkits.mplot3d import Axes3D + from mpl_toolkits.mplot3d.art3d import Poly3DCollection + + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + mesh = Poly3DCollection(vertices[model.faces], alpha=0.1) + face_color = (1.0, 1.0, 0.9) + edge_color = (0, 0, 0) + mesh.set_edgecolor(edge_color) + mesh.set_facecolor(face_color) + ax.add_collection3d(mesh) + ax.scatter(joints[:, 0], joints[:, 1], joints[:, 2], color='r') + + if plot_joints: + ax.scatter(joints[:, 0], joints[:, 1], joints[:, 2], alpha=0.1) + plt.show() + elif plotting_module == 'open3d': + import open3d as o3d + + mesh = o3d.geometry.TriangleMesh() + mesh.vertices = o3d.utility.Vector3dVector( + vertices) + mesh.triangles = o3d.utility.Vector3iVector(model.faces) + mesh.compute_vertex_normals() + mesh.paint_uniform_color([0.3, 0.3, 0.3]) + + geometry = [mesh] + if plot_joints: + joints_pcl = o3d.geometry.PointCloud() + joints_pcl.points = o3d.utility.Vector3dVector(joints) + joints_pcl.paint_uniform_color([0.7, 0.3, 0.3]) + geometry.append(joints_pcl) + + o3d.visualization.draw_geometries(geometry) + else: + raise ValueError('Unknown plotting_module: {}'.format(plotting_module)) + + +if __name__ == '__main__': + 
parser = argparse.ArgumentParser(description='SMPL-X Demo') + + parser.add_argument('--model-folder', required=True, type=str, + help='The path to the model folder') + parser.add_argument('--model-type', default='smplx', type=str, + choices=['smpl', 'smplh', 'smplx', 'mano', 'flame'], + help='The type of model to load') + parser.add_argument('--gender', type=str, default='neutral', + help='The gender of the model') + parser.add_argument('--num-betas', default=10, type=int, + dest='num_betas', + help='Number of shape coefficients.') + parser.add_argument('--num-expression-coeffs', default=10, type=int, + dest='num_expression_coeffs', + help='Number of expression coefficients.') + parser.add_argument('--plotting-module', type=str, default='pyrender', + dest='plotting_module', + choices=['pyrender', 'matplotlib', 'open3d'], + help='The module to use for plotting the result') + parser.add_argument('--ext', type=str, default='npz', + help='Which extension to use for loading') + parser.add_argument('--plot-joints', default=False, + type=lambda arg: arg.lower() in ['true', '1'], + help='The path to the model folder') + parser.add_argument('--sample-shape', default=True, + dest='sample_shape', + type=lambda arg: arg.lower() in ['true', '1'], + help='Sample a random shape') + parser.add_argument('--sample-expression', default=True, + dest='sample_expression', + type=lambda arg: arg.lower() in ['true', '1'], + help='Sample a random expression') + parser.add_argument('--use-face-contour', default=False, + type=lambda arg: arg.lower() in ['true', '1'], + help='Compute the contour of the face') + + args = parser.parse_args() + + model_folder = osp.expanduser(osp.expandvars(args.model_folder)) + model_type = args.model_type + plot_joints = args.plot_joints + use_face_contour = args.use_face_contour + gender = args.gender + ext = args.ext + plotting_module = args.plotting_module + num_betas = args.num_betas + num_expression_coeffs = args.num_expression_coeffs + sample_shape = args.sample_shape + sample_expression = args.sample_expression + + main(model_folder, model_type, ext=ext, + gender=gender, plot_joints=plot_joints, + num_betas=num_betas, + num_expression_coeffs=num_expression_coeffs, + sample_shape=sample_shape, + sample_expression=sample_expression, + plotting_module=plotting_module, + use_face_contour=use_face_contour) diff --git a/common/utils/smplx/examples/vis_flame_vertices.py b/common/utils/smplx/examples/vis_flame_vertices.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d6b9b33610876a9d555f87492b326b172692a7 --- /dev/null +++ b/common/utils/smplx/examples/vis_flame_vertices.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
+# +# Contact: ps-license@tuebingen.mpg.de + +import os.path as osp +import argparse +import pickle + +import numpy as np +import torch +import open3d as o3d + +import smplx + + +def main(model_folder, corr_fname, ext='npz', + head_color=(0.3, 0.3, 0.6), + gender='neutral'): + + head_idxs = np.load(corr_fname) + + model = smplx.create(model_folder, model_type='smplx', + gender=gender, + ext=ext) + betas = torch.zeros([1, 10], dtype=torch.float32) + expression = torch.zeros([1, 10], dtype=torch.float32) + + output = model(betas=betas, expression=expression, + return_verts=True) + vertices = output.vertices.detach().cpu().numpy().squeeze() + joints = output.joints.detach().cpu().numpy().squeeze() + + print('Vertices shape =', vertices.shape) + print('Joints shape =', joints.shape) + + mesh = o3d.geometry.TriangleMesh() + mesh.vertices = o3d.utility.Vector3dVector(vertices) + mesh.triangles = o3d.utility.Vector3iVector(model.faces) + mesh.compute_vertex_normals() + + colors = np.ones_like(vertices) * [0.3, 0.3, 0.3] + colors[head_idxs] = head_color + + mesh.vertex_colors = o3d.utility.Vector3dVector(colors) + + o3d.visualization.draw_geometries([mesh]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='SMPL-X Demo') + + parser.add_argument('--model-folder', required=True, type=str, + help='The path to the model folder') + parser.add_argument('--corr-fname', required=True, type=str, + dest='corr_fname', + help='Filename with the head correspondences') + parser.add_argument('--gender', type=str, default='neutral', + help='The gender of the model') + parser.add_argument('--ext', type=str, default='npz', + help='Which extension to use for loading') + parser.add_argument('--head', default='right', + choices=['right', 'left'], + type=str, help='Which head to plot') + parser.add_argument('--head-color', type=float, nargs=3, dest='head_color', + default=(0.3, 0.3, 0.6), + help='Color for the head vertices') + + args = parser.parse_args() + + model_folder = osp.expanduser(osp.expandvars(args.model_folder)) + corr_fname = args.corr_fname + gender = args.gender + ext = args.ext + head = args.head + head_color = args.head_color + + main(model_folder, corr_fname, ext=ext, + head_color=head_color, + gender=gender + ) diff --git a/common/utils/smplx/examples/vis_mano_vertices.py b/common/utils/smplx/examples/vis_mano_vertices.py new file mode 100644 index 0000000000000000000000000000000000000000..1741542a1808071cc35fa1fcdef01a869885ec7e --- /dev/null +++ b/common/utils/smplx/examples/vis_mano_vertices.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
+# +# Contact: ps-license@tuebingen.mpg.de + +import os.path as osp +import argparse +import pickle + +import numpy as np +import torch +import open3d as o3d + +import smplx + + +def main(model_folder, corr_fname, ext='npz', + hand_color=(0.3, 0.3, 0.6), + gender='neutral', hand='right'): + + with open(corr_fname, 'rb') as f: + idxs_data = pickle.load(f) + if hand == 'both': + hand_idxs = np.concatenate( + [idxs_data['left_hand'], idxs_data['right_hand']] + ) + else: + hand_idxs = idxs_data[f'{hand}_hand'] + + model = smplx.create(model_folder, model_type='smplx', + gender=gender, + ext=ext) + betas = torch.zeros([1, 10], dtype=torch.float32) + expression = torch.zeros([1, 10], dtype=torch.float32) + + output = model(betas=betas, expression=expression, + return_verts=True) + vertices = output.vertices.detach().cpu().numpy().squeeze() + joints = output.joints.detach().cpu().numpy().squeeze() + + print('Vertices shape =', vertices.shape) + print('Joints shape =', joints.shape) + + mesh = o3d.geometry.TriangleMesh() + mesh.vertices = o3d.utility.Vector3dVector(vertices) + mesh.triangles = o3d.utility.Vector3iVector(model.faces) + mesh.compute_vertex_normals() + + colors = np.ones_like(vertices) * [0.3, 0.3, 0.3] + colors[hand_idxs] = hand_color + + mesh.vertex_colors = o3d.utility.Vector3dVector(colors) + + o3d.visualization.draw_geometries([mesh]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='SMPL-X Demo') + + parser.add_argument('--model-folder', required=True, type=str, + help='The path to the model folder') + parser.add_argument('--corr-fname', required=True, type=str, + dest='corr_fname', + help='Filename with the hand correspondences') + parser.add_argument('--gender', type=str, default='neutral', + help='The gender of the model') + parser.add_argument('--ext', type=str, default='npz', + help='Which extension to use for loading') + parser.add_argument('--hand', default='right', + choices=['right', 'left', 'both'], + type=str, help='Which hand to plot') + parser.add_argument('--hand-color', type=float, nargs=3, dest='hand_color', + default=(0.3, 0.3, 0.6), + help='Color for the hand vertices') + + args = parser.parse_args() + + model_folder = osp.expanduser(osp.expandvars(args.model_folder)) + corr_fname = args.corr_fname + gender = args.gender + ext = args.ext + hand = args.hand + hand_color = args.hand_color + + main(model_folder, corr_fname, ext=ext, + hand_color=hand_color, + gender=gender, hand=hand + ) diff --git a/common/utils/smplx/setup.py b/common/utils/smplx/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..0496b2a2ae47157e60c6f1a1b6766404df9c7e16 --- /dev/null +++ b/common/utils/smplx/setup.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems and the Max Planck Institute for Biological +# Cybernetics. All rights reserved. 
+# +# Contact: ps-license@tuebingen.mpg.de + +import io +import os + +from setuptools import setup + +# Package meta-data. +NAME = 'smplx' +DESCRIPTION = 'PyTorch module for loading the SMPLX body model' +URL = 'http://smpl-x.is.tuebingen.mpg.de' +EMAIL = 'vassilis.choutas@tuebingen.mpg.de' +AUTHOR = 'Vassilis Choutas' +REQUIRES_PYTHON = '>=3.6.0' +VERSION = '0.1.21' + +here = os.path.abspath(os.path.dirname(__file__)) + +try: + FileNotFoundError +except NameError: + FileNotFoundError = IOError + +# Import the README and use it as the long-description. +# Note: this will only work if 'README.md' is present in your MANIFEST.in file! +try: + with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = '\n' + f.read() +except FileNotFoundError: + long_description = DESCRIPTION + +# Load the package's __version__.py module as a dictionary. +about = {} +if not VERSION: + with open(os.path.join(here, NAME, '__version__.py')) as f: + exec(f.read(), about) +else: + about['__version__'] = VERSION + +pyrender_reqs = ['pyrender>=0.1.23', 'trimesh>=2.37.6', 'shapely'] +matplotlib_reqs = ['matplotlib'] +open3d_reqs = ['open3d-python'] + +setup(name=NAME, + version=about['__version__'], + description=DESCRIPTION, + long_description=long_description, + long_description_content_type='text/markdown', + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + install_requires=[ + 'numpy>=1.16.2', + 'torch>=1.0.1.post2', + 'torchgeometry>=0.1.2' + ], + extras_require={ + 'pyrender': pyrender_reqs, + 'open3d': open3d_reqs, + 'matplotlib': matplotlib_reqs, + 'all': pyrender_reqs + matplotlib_reqs + open3d_reqs + }, + packages=['smplx', 'tools']) diff --git a/common/utils/smplx/smplx/__init__.py b/common/utils/smplx/smplx/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..886949df670691d1ef5995737cafa285224826c4 --- /dev/null +++ b/common/utils/smplx/smplx/__init__.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +from .body_models import ( + create, + SMPL, + SMPLH, + SMPLX, + MANO, + FLAME, + build_layer, + SMPLLayer, + SMPLHLayer, + SMPLXLayer, + MANOLayer, + FLAMELayer, +) diff --git a/common/utils/smplx/smplx/body_models.py b/common/utils/smplx/smplx/body_models.py new file mode 100644 index 0000000000000000000000000000000000000000..0b46353a5dc96cdaa53a8a25dafd0d15a25aaef2 --- /dev/null +++ b/common/utils/smplx/smplx/body_models.py @@ -0,0 +1,2331 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. 
+# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +from typing import Optional, Dict, Union +import os +import os.path as osp + +import pickle + +import numpy as np + +import torch +import torch.nn as nn + +from .lbs import ( + lbs, vertices2landmarks, find_dynamic_lmk_idx_and_bcoords) + +from .vertex_ids import vertex_ids as VERTEX_IDS +from .utils import ( + Struct, to_np, to_tensor, Tensor, Array, + SMPLOutput, + SMPLHOutput, + SMPLXOutput, + MANOOutput, + FLAMEOutput, + find_joint_kin_chain) +from .vertex_joint_selector import VertexJointSelector +from config import cfg + +class SMPL(nn.Module): + + NUM_JOINTS = 23 + NUM_BODY_JOINTS = 23 + SHAPE_SPACE_DIM = 300 + + def __init__( + self, model_path: str, + data_struct: Optional[Struct] = None, + create_betas: bool = True, + betas: Optional[Tensor] = None, + num_betas: int = 10, + create_global_orient: bool = True, + global_orient: Optional[Tensor] = None, + create_body_pose: bool = True, + body_pose: Optional[Tensor] = None, + create_transl: bool = True, + transl: Optional[Tensor] = None, + dtype=torch.float32, + batch_size: int = 1, + joint_mapper=None, + gender: str = 'neutral', + vertex_ids: Dict[str, int] = None, + v_template: Optional[Union[Tensor, Array]] = None, + **kwargs + ) -> None: + ''' SMPL model constructor + + Parameters + ---------- + model_path: str + The path to the folder or to the file where the model + parameters are stored + data_struct: Strct + A struct object. If given, then the parameters of the model are + read from the object. Otherwise, the model tries to read the + parameters from the given `model_path`. (default = None) + create_global_orient: bool, optional + Flag for creating a member variable for the global orientation + of the body. (default = True) + global_orient: torch.tensor, optional, Bx3 + The default value for the global orientation variable. + (default = None) + create_body_pose: bool, optional + Flag for creating a member variable for the pose of the body. + (default = True) + body_pose: torch.tensor, optional, Bx(Body Joints * 3) + The default value for the body pose variable. + (default = None) + num_betas: int, optional + Number of shape components to use + (default = 10). + create_betas: bool, optional + Flag for creating a member variable for the shape space + (default = True). + betas: torch.tensor, optional, Bx10 + The default value for the shape member variable. + (default = None) + create_transl: bool, optional + Flag for creating a member variable for the translation + of the body. (default = True) + transl: torch.tensor, optional, Bx3 + The default value for the transl variable. + (default = None) + dtype: torch.dtype, optional + The data type for the created variables + batch_size: int, optional + The batch size used for creating the member variables + joint_mapper: object, optional + An object that re-maps the joints. Useful if one wants to + re-order the SMPL joints to some other convention (e.g. 
MSCOCO) + (default = None) + gender: str, optional + Which gender to load + vertex_ids: dict, optional + A dictionary containing the indices of the extra vertices that + will be selected + ''' + + self.gender = gender + + if data_struct is None: + if osp.isdir(model_path): + model_fn = 'SMPL_{}.{ext}'.format(gender.upper(), ext='pkl') + smpl_path = os.path.join(model_path, model_fn) + else: + smpl_path = model_path + assert osp.exists(smpl_path), 'Path {} does not exist!'.format( + smpl_path) + + with open(smpl_path, 'rb') as smpl_file: + data_struct = Struct(**pickle.load(smpl_file, + encoding='latin1')) + + super(SMPL, self).__init__() + self.batch_size = batch_size + shapedirs = data_struct.shapedirs + if (shapedirs.shape[-1] < self.SHAPE_SPACE_DIM): + print(f'WARNING: You are using a {self.name()} model, with only' + ' 10 shape coefficients.') + num_betas = min(num_betas, 10) + else: + num_betas = min(num_betas, self.SHAPE_SPACE_DIM) + + self._num_betas = num_betas + shapedirs = shapedirs[:, :, :num_betas] + # The shape components + self.register_buffer( + 'shapedirs', + to_tensor(to_np(shapedirs), dtype=dtype)) + + if vertex_ids is None: + # SMPL and SMPL-H share the same topology, so any extra joints can + # be drawn from the same place + vertex_ids = VERTEX_IDS['smplh'] + + self.dtype = dtype + + self.joint_mapper = joint_mapper + + self.vertex_joint_selector = VertexJointSelector( + vertex_ids=vertex_ids, **kwargs) + + self.faces = data_struct.f + self.register_buffer('faces_tensor', + to_tensor(to_np(self.faces, dtype=np.int64), + dtype=torch.long)) + + if create_betas: + if betas is None: + default_betas = torch.zeros( + [batch_size, self.num_betas], dtype=dtype) + else: + if torch.is_tensor(betas): + default_betas = betas.clone().detach() + else: + default_betas = torch.tensor(betas, dtype=dtype) + + self.register_parameter( + 'betas', nn.Parameter(default_betas, requires_grad=True)) + + # The tensor that contains the global rotation of the model + # It is separated from the pose of the joints in case we wish to + # optimize only over one of them + if create_global_orient: + if global_orient is None: + default_global_orient = torch.zeros( + [batch_size, 3], dtype=dtype) + else: + if torch.is_tensor(global_orient): + default_global_orient = global_orient.clone().detach() + else: + default_global_orient = torch.tensor( + global_orient, dtype=dtype) + + global_orient = nn.Parameter(default_global_orient, + requires_grad=True) + self.register_parameter('global_orient', global_orient) + + if create_body_pose: + if body_pose is None: + default_body_pose = torch.zeros( + [batch_size, self.NUM_BODY_JOINTS * 3], dtype=dtype) + else: + if torch.is_tensor(body_pose): + default_body_pose = body_pose.clone().detach() + else: + default_body_pose = torch.tensor(body_pose, + dtype=dtype) + self.register_parameter( + 'body_pose', + nn.Parameter(default_body_pose, requires_grad=True)) + + if create_transl: + if transl is None: + default_transl = torch.zeros([batch_size, 3], + dtype=dtype, + requires_grad=True) + else: + default_transl = torch.tensor(transl, dtype=dtype) + self.register_parameter( + 'transl', nn.Parameter(default_transl, requires_grad=True)) + + if v_template is None: + v_template = data_struct.v_template + if not torch.is_tensor(v_template): + v_template = to_tensor(to_np(v_template), dtype=dtype) + # The vertices of the template model + self.register_buffer('v_template', v_template) + + j_regressor = to_tensor(to_np( + data_struct.J_regressor), dtype=dtype) + 
self.register_buffer('J_regressor', j_regressor) + + # Pose blend shape basis: 6890 x 3 x 207, reshaped to 6890*3 x 207 + num_pose_basis = data_struct.posedirs.shape[-1] + # 207 x 20670 + posedirs = np.reshape(data_struct.posedirs, [-1, num_pose_basis]).T + self.register_buffer('posedirs', + to_tensor(to_np(posedirs), dtype=dtype)) + + # indices of parents for each joints + parents = to_tensor(to_np(data_struct.kintree_table[0])).long() + parents[0] = -1 + self.register_buffer('parents', parents) + + self.register_buffer( + 'lbs_weights', to_tensor(to_np(data_struct.weights), dtype=dtype)) + + @property + def num_betas(self): + return self._num_betas + + @property + def num_expression_coeffs(self): + return 0 + + def create_mean_pose(self, data_struct) -> Tensor: + pass + + def name(self) -> str: + return 'SMPL' + + @torch.no_grad() + def reset_params(self, **params_dict) -> None: + for param_name, param in self.named_parameters(): + if param_name in params_dict: + param[:] = torch.tensor(params_dict[param_name]) + else: + param.fill_(0) + + def get_num_verts(self) -> int: + return self.v_template.shape[0] + + def get_num_faces(self) -> int: + return self.faces.shape[0] + + def extra_repr(self) -> str: + msg = [ + f'Gender: {self.gender.upper()}', + f'Number of joints: {self.J_regressor.shape[0]}', + f'Betas: {self.num_betas}', + ] + return '\n'.join(msg) + + def forward( + self, + betas: Optional[Tensor] = None, + body_pose: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + return_verts=True, + return_full_pose: bool = False, + pose2rot: bool = True, + **kwargs + ) -> SMPLOutput: + ''' Forward pass for the SMPL model + + Parameters + ---------- + global_orient: torch.tensor, optional, shape Bx3 + If given, ignore the member variable and use it as the global + rotation of the body. Useful if someone wishes to predicts this + with an external model. (default=None) + betas: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `betas` and use it + instead. For example, it can used if shape parameters + `betas` are predicted from some external model. + (default=None) + body_pose: torch.tensor, optional, shape Bx(J*3) + If given, ignore the member variable `body_pose` and use it + instead. For example, it can used if someone predicts the + pose of the body joints are predicted from some external model. + It should be a tensor that contains joint rotations in + axis-angle format. (default=None) + transl: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `transl` and use it + instead. For example, it can used if the translation + `transl` is predicted from some external model. + (default=None) + return_verts: bool, optional + Return the vertices. 
(default=True) + return_full_pose: bool, optional + Returns the full axis-angle pose vector (default=False) + + Returns + ------- + ''' + # If no shape and pose parameters are passed along, then use the + # ones from the module + global_orient = (global_orient if global_orient is not None else + self.global_orient) + body_pose = body_pose if body_pose is not None else self.body_pose + betas = betas if betas is not None else self.betas + + apply_trans = transl is not None or hasattr(self, 'transl') + if transl is None and hasattr(self, 'transl'): + transl = self.transl + + full_pose = torch.cat([global_orient, body_pose], dim=1) + + batch_size = max(betas.shape[0], global_orient.shape[0], + body_pose.shape[0]) + + if betas.shape[0] != batch_size: + num_repeats = int(batch_size / betas.shape[0]) + betas = betas.expand(num_repeats, -1) + + vertices, joints = lbs(betas, full_pose, self.v_template, + self.shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, pose2rot=pose2rot) + + joints = self.vertex_joint_selector(vertices, joints) + # Map the joints to the current dataset + if self.joint_mapper is not None: + joints = self.joint_mapper(joints) + + if apply_trans: + joints += transl.unsqueeze(dim=1) + vertices += transl.unsqueeze(dim=1) + + output = SMPLOutput(vertices=vertices if return_verts else None, + global_orient=global_orient, + body_pose=body_pose, + joints=joints, + betas=betas, + full_pose=full_pose if return_full_pose else None) + + return output + + +class SMPLLayer(SMPL): + def __init__( + self, + *args, + **kwargs + ) -> None: + # Just create a SMPL module without any member variables + super(SMPLLayer, self).__init__( + create_body_pose=False, + create_betas=False, + create_global_orient=False, + create_transl=False, + *args, + **kwargs, + ) + + def forward( + self, + betas: Optional[Tensor] = None, + body_pose: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + return_verts=True, + return_full_pose: bool = False, + pose2rot: bool = True, + **kwargs + ) -> SMPLOutput: + ''' Forward pass for the SMPL model + + Parameters + ---------- + global_orient: torch.tensor, optional, shape Bx3 + If given, ignore the member variable and use it as the global + rotation of the body. Useful if someone wishes to predicts this + with an external model. (default=None) + betas: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `betas` and use it + instead. For example, it can used if shape parameters + `betas` are predicted from some external model. + (default=None) + body_pose: torch.tensor, optional, shape Bx(J*3) + If given, ignore the member variable `body_pose` and use it + instead. For example, it can used if someone predicts the + pose of the body joints are predicted from some external model. + It should be a tensor that contains joint rotations in + axis-angle format. (default=None) + transl: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `transl` and use it + instead. For example, it can used if the translation + `transl` is predicted from some external model. + (default=None) + return_verts: bool, optional + Return the vertices. 
(default=True) + return_full_pose: bool, optional + Returns the full axis-angle pose vector (default=False) + + Returns + ------- + ''' + device, dtype = self.shapedirs.device, self.shapedirs.dtype + if global_orient is None: + batch_size = 1 + global_orient = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, 1, 1).contiguous() + else: + batch_size = global_orient.shape[0] + if body_pose is None: + body_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand( + batch_size, self.NUM_BODY_JOINTS, 1).contiguous() + if betas is None: + betas = torch.zeros([batch_size, self.num_betas], + dtype=dtype, device=device) + if transl is None: + transl = torch.zeros([batch_size, 3], dtype=dtype, device=device) + full_pose = torch.cat( + [global_orient.reshape(-1, 1, 3), + body_pose.reshape(-1, self.NUM_BODY_JOINTS, 3)], + dim=1) + + vertices, joints = lbs(betas, full_pose, self.v_template, + self.shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, + pose2rot=True) + + joints = self.vertex_joint_selector(vertices, joints) + # Map the joints to the current dataset + if self.joint_mapper is not None: + joints = self.joint_mapper(joints) + + if transl is not None: + joints += transl.unsqueeze(dim=1) + vertices += transl.unsqueeze(dim=1) + + output = SMPLOutput(vertices=vertices if return_verts else None, + global_orient=global_orient, + body_pose=body_pose, + joints=joints, + betas=betas, + full_pose=full_pose if return_full_pose else None) + + return output + + +class SMPLH(SMPL): + + # The hand joints are replaced by MANO + NUM_BODY_JOINTS = SMPL.NUM_JOINTS - 2 + NUM_HAND_JOINTS = 15 + NUM_JOINTS = NUM_BODY_JOINTS + 2 * NUM_HAND_JOINTS + + def __init__( + self, model_path, + data_struct: Optional[Struct] = None, + create_left_hand_pose: bool = True, + left_hand_pose: Optional[Tensor] = None, + create_right_hand_pose: bool = True, + right_hand_pose: Optional[Tensor] = None, + use_pca: bool = True, + num_pca_comps: int = 6, + flat_hand_mean: bool = False, + batch_size: int = 1, + gender: str = 'neutral', + dtype=torch.float32, + vertex_ids=None, + use_compressed: bool = True, + ext: str = 'pkl', + **kwargs + ) -> None: + ''' SMPLH model constructor + + Parameters + ---------- + model_path: str + The path to the folder or to the file where the model + parameters are stored + data_struct: Strct + A struct object. If given, then the parameters of the model are + read from the object. Otherwise, the model tries to read the + parameters from the given `model_path`. (default = None) + create_left_hand_pose: bool, optional + Flag for creating a member variable for the pose of the left + hand. (default = True) + left_hand_pose: torch.tensor, optional, BxP + The default value for the left hand pose member variable. + (default = None) + create_right_hand_pose: bool, optional + Flag for creating a member variable for the pose of the right + hand. (default = True) + right_hand_pose: torch.tensor, optional, BxP + The default value for the right hand pose member variable. + (default = None) + num_pca_comps: int, optional + The number of PCA components to use for each hand. + (default = 6) + flat_hand_mean: bool, optional + If False, then the pose of the hand is initialized to False. 
+ batch_size: int, optional + The batch size used for creating the member variables + gender: str, optional + Which gender to load + dtype: torch.dtype, optional + The data type for the created variables + vertex_ids: dict, optional + A dictionary containing the indices of the extra vertices that + will be selected + ''' + + self.num_pca_comps = num_pca_comps + # If no data structure is passed, then load the data from the given + # model folder + if data_struct is None: + # Load the model + if osp.isdir(model_path): + model_fn = 'SMPLH_{}.{ext}'.format(gender.upper(), ext=ext) + smplh_path = os.path.join(model_path, model_fn) + else: + smplh_path = model_path + assert osp.exists(smplh_path), 'Path {} does not exist!'.format( + smplh_path) + + if ext == 'pkl': + with open(smplh_path, 'rb') as smplh_file: + model_data = pickle.load(smplh_file, encoding='latin1') + elif ext == 'npz': + model_data = np.load(smplh_path, allow_pickle=True) + else: + raise ValueError('Unknown extension: {}'.format(ext)) + data_struct = Struct(**model_data) + + if vertex_ids is None: + vertex_ids = VERTEX_IDS['smplh'] + + super(SMPLH, self).__init__( + model_path=model_path, + data_struct=data_struct, + batch_size=batch_size, vertex_ids=vertex_ids, gender=gender, + use_compressed=use_compressed, dtype=dtype, ext=ext, **kwargs) + + self.use_pca = use_pca + self.num_pca_comps = num_pca_comps + self.flat_hand_mean = flat_hand_mean + + left_hand_components = data_struct.hands_componentsl[:num_pca_comps] + right_hand_components = data_struct.hands_componentsr[:num_pca_comps] + + self.np_left_hand_components = left_hand_components + self.np_right_hand_components = right_hand_components + if self.use_pca: + self.register_buffer( + 'left_hand_components', + torch.tensor(left_hand_components, dtype=dtype)) + self.register_buffer( + 'right_hand_components', + torch.tensor(right_hand_components, dtype=dtype)) + + if self.flat_hand_mean: + left_hand_mean = np.zeros_like(data_struct.hands_meanl) + else: + left_hand_mean = data_struct.hands_meanl + + if self.flat_hand_mean: + right_hand_mean = np.zeros_like(data_struct.hands_meanr) + else: + right_hand_mean = data_struct.hands_meanr + + self.register_buffer('left_hand_mean', + to_tensor(left_hand_mean, dtype=self.dtype)) + self.register_buffer('right_hand_mean', + to_tensor(right_hand_mean, dtype=self.dtype)) + + # Create the buffers for the pose of the left hand + hand_pose_dim = num_pca_comps if use_pca else 3 * self.NUM_HAND_JOINTS + if create_left_hand_pose: + if left_hand_pose is None: + default_lhand_pose = torch.zeros([batch_size, hand_pose_dim], + dtype=dtype) + else: + default_lhand_pose = torch.tensor(left_hand_pose, dtype=dtype) + + left_hand_pose_param = nn.Parameter(default_lhand_pose, + requires_grad=True) + self.register_parameter('left_hand_pose', + left_hand_pose_param) + + if create_right_hand_pose: + if right_hand_pose is None: + default_rhand_pose = torch.zeros([batch_size, hand_pose_dim], + dtype=dtype) + else: + default_rhand_pose = torch.tensor(right_hand_pose, dtype=dtype) + + right_hand_pose_param = nn.Parameter(default_rhand_pose, + requires_grad=True) + self.register_parameter('right_hand_pose', + right_hand_pose_param) + + # Create the buffer for the mean pose. 
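+        # Note (descriptive comment, not in the upstream source): when
+        # flat_hand_mean is False this mean pose carries the MANO hand means
+        # (hands_meanl / hands_meanr) loaded above, and forward() adds it back
+        # onto the concatenated pose after projecting the PCA hand
+        # coefficients through left_hand_components / right_hand_components.
+        # With flat_hand_mean=True the hand means were zeroed above, so the
+        # rest pose corresponds to a flat open hand.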
+ pose_mean_tensor = self.create_mean_pose( + data_struct, flat_hand_mean=flat_hand_mean) + if not torch.is_tensor(pose_mean_tensor): + pose_mean_tensor = torch.tensor(pose_mean_tensor, dtype=dtype) + self.register_buffer('pose_mean', pose_mean_tensor) + + def create_mean_pose(self, data_struct, flat_hand_mean=False): + # Create the array for the mean pose. If flat_hand is false, then use + # the mean that is given by the data, rather than the flat open hand + global_orient_mean = torch.zeros([3], dtype=self.dtype) + body_pose_mean = torch.zeros([self.NUM_BODY_JOINTS * 3], + dtype=self.dtype) + + pose_mean = torch.cat([global_orient_mean, body_pose_mean, + self.left_hand_mean, + self.right_hand_mean], dim=0) + return pose_mean + + def name(self) -> str: + return 'SMPL+H' + + def extra_repr(self): + msg = super(SMPLH, self).extra_repr() + msg = [msg] + if self.use_pca: + msg.append(f'Number of PCA components: {self.num_pca_comps}') + msg.append(f'Flat hand mean: {self.flat_hand_mean}') + return '\n'.join(msg) + + def forward( + self, + betas: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + body_pose: Optional[Tensor] = None, + left_hand_pose: Optional[Tensor] = None, + right_hand_pose: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + return_verts: bool = True, + return_full_pose: bool = False, + pose2rot: bool = True, + **kwargs + ) -> SMPLHOutput: + ''' + ''' + # If no shape and pose parameters are passed along, then use the + # ones from the module + global_orient = (global_orient if global_orient is not None else + self.global_orient) + body_pose = body_pose if body_pose is not None else self.body_pose + betas = betas if betas is not None else self.betas + left_hand_pose = (left_hand_pose if left_hand_pose is not None else + self.left_hand_pose) + right_hand_pose = (right_hand_pose if right_hand_pose is not None else + self.right_hand_pose) + + apply_trans = transl is not None or hasattr(self, 'transl') + if transl is None: + if hasattr(self, 'transl'): + transl = self.transl + + if self.use_pca: + left_hand_pose = torch.einsum( + 'bi,ij->bj', [left_hand_pose, self.left_hand_components]) + right_hand_pose = torch.einsum( + 'bi,ij->bj', [right_hand_pose, self.right_hand_components]) + + full_pose = torch.cat([global_orient, body_pose, + left_hand_pose, + right_hand_pose], dim=1) + full_pose += self.pose_mean + + vertices, joints = lbs(self.betas, full_pose, self.v_template, + self.shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, pose2rot=pose2rot) + + # Add any extra joints that might be needed + joints = self.vertex_joint_selector(vertices, joints) + if self.joint_mapper is not None: + joints = self.joint_mapper(joints) + + if apply_trans: + joints += transl.unsqueeze(dim=1) + vertices += transl.unsqueeze(dim=1) + + output = SMPLHOutput(vertices=vertices if return_verts else None, + joints=joints, + betas=betas, + global_orient=global_orient, + body_pose=body_pose, + left_hand_pose=left_hand_pose, + right_hand_pose=right_hand_pose, + full_pose=full_pose if return_full_pose else None) + + return output + + +class SMPLHLayer(SMPLH): + + def __init__( + self, *args, **kwargs + ) -> None: + ''' SMPL+H as a layer model constructor + ''' + super(SMPLHLayer, self).__init__( + create_global_orient=False, + create_body_pose=False, + create_left_hand_pose=False, + create_right_hand_pose=False, + create_betas=False, + create_transl=False, + *args, + **kwargs) + + def forward( + self, + betas: Optional[Tensor] = None, + global_orient: 
Optional[Tensor] = None, + body_pose: Optional[Tensor] = None, + left_hand_pose: Optional[Tensor] = None, + right_hand_pose: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + return_verts: bool = True, + return_full_pose: bool = False, + pose2rot: bool = True, + **kwargs + ) -> SMPLHOutput: + ''' + ''' + device, dtype = self.shapedirs.device, self.shapedirs.dtype + if global_orient is None: + batch_size = 1 + global_orient = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + else: + batch_size = global_orient.shape[0] + if body_pose is None: + body_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, 21, -1).contiguous() + if left_hand_pose is None: + left_hand_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, 15, -1).contiguous() + if right_hand_pose is None: + right_hand_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, 15, -1).contiguous() + if betas is None: + betas = torch.zeros([batch_size, self.num_betas], + dtype=dtype, device=device) + if transl is None: + transl = torch.zeros([batch_size, 3], dtype=dtype, device=device) + + # Concatenate all pose vectors + full_pose = torch.cat( + [global_orient.reshape(-1, 1, 3), + body_pose.reshape(-1, self.NUM_BODY_JOINTS, 3), + left_hand_pose.reshape(-1, self.NUM_HAND_JOINTS, 3), + right_hand_pose.reshape(-1, self.NUM_HAND_JOINTS, 3)], + dim=1) + + vertices, joints = lbs(betas, full_pose, self.v_template, + self.shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, pose2rot=True) + + # Add any extra joints that might be needed + joints = self.vertex_joint_selector(vertices, joints) + if self.joint_mapper is not None: + joints = self.joint_mapper(joints) + + if transl is not None: + joints += transl.unsqueeze(dim=1) + vertices += transl.unsqueeze(dim=1) + + output = SMPLHOutput(vertices=vertices if return_verts else None, + joints=joints, + betas=betas, + global_orient=global_orient, + body_pose=body_pose, + left_hand_pose=left_hand_pose, + right_hand_pose=right_hand_pose, + full_pose=full_pose if return_full_pose else None) + + return output + + +class SMPLX(SMPLH): + ''' + SMPL-X (SMPL eXpressive) is a unified body model, with shape parameters + trained jointly for the face, hands and body. + SMPL-X uses standard vertex based linear blend skinning with learned + corrective blend shapes, has N=10475 vertices and K=54 joints, + which includes joints for the neck, jaw, eyeballs and fingers. + ''' + + NUM_BODY_JOINTS = SMPLH.NUM_BODY_JOINTS + NUM_HAND_JOINTS = 15 + NUM_FACE_JOINTS = 3 + NUM_JOINTS = NUM_BODY_JOINTS + 2 * NUM_HAND_JOINTS + NUM_FACE_JOINTS + EXPRESSION_SPACE_DIM = 100 + NECK_IDX = 12 + + def __init__( + self, model_path: str, + num_expression_coeffs: int = 10, + create_expression: bool = True, + expression: Optional[Tensor] = None, + create_jaw_pose: bool = True, + jaw_pose: Optional[Tensor] = None, + create_leye_pose: bool = True, + leye_pose: Optional[Tensor] = None, + create_reye_pose=True, + reye_pose: Optional[Tensor] = None, + use_face_contour: bool = False, + batch_size: int = 1, + gender: str = 'neutral', + dtype=torch.float32, + ext: str = 'npz', + **kwargs + ) -> None: + ''' SMPLX model constructor + + Parameters + ---------- + model_path: str + The path to the folder or to the file where the model + parameters are stored + num_expression_coeffs: int, optional + Number of expression components to use + (default = 10). 
+ create_expression: bool, optional + Flag for creating a member variable for the expression space + (default = True). + expression: torch.tensor, optional, Bx10 + The default value for the expression member variable. + (default = None) + create_jaw_pose: bool, optional + Flag for creating a member variable for the jaw pose. + (default = False) + jaw_pose: torch.tensor, optional, Bx3 + The default value for the jaw pose variable. + (default = None) + create_leye_pose: bool, optional + Flag for creating a member variable for the left eye pose. + (default = False) + leye_pose: torch.tensor, optional, Bx10 + The default value for the left eye pose variable. + (default = None) + create_reye_pose: bool, optional + Flag for creating a member variable for the right eye pose. + (default = False) + reye_pose: torch.tensor, optional, Bx10 + The default value for the right eye pose variable. + (default = None) + use_face_contour: bool, optional + Whether to compute the keypoints that form the facial contour + batch_size: int, optional + The batch size used for creating the member variables + gender: str, optional + Which gender to load + dtype: torch.dtype + The data type for the created variables + ''' + + # Load the model + if osp.isdir(model_path): + model_fn = 'SMPLX_{}.{ext}'.format(gender.upper(), ext=ext) + smplx_path = os.path.join(model_path, model_fn) + else: + smplx_path = model_path + assert osp.exists(smplx_path), 'Path {} does not exist!'.format(smplx_path) + if ext == 'pkl': + with open(smplx_path, 'rb') as smplx_file: + model_data = pickle.load(smplx_file, encoding='latin1') + elif ext == 'npz': + model_data = np.load(smplx_path, allow_pickle=True) + else: + raise ValueError('Unknown extension: {}'.format(ext)) + + data_struct = Struct(**model_data) + + super(SMPLX, self).__init__( + model_path=model_path, + data_struct=data_struct, + dtype=dtype, + batch_size=batch_size, + vertex_ids=VERTEX_IDS['smplx'], + gender=gender, ext=ext, + **kwargs) + + lmk_faces_idx = data_struct.lmk_faces_idx + self.register_buffer('lmk_faces_idx', + torch.tensor(lmk_faces_idx, dtype=torch.long)) + lmk_bary_coords = data_struct.lmk_bary_coords + self.register_buffer('lmk_bary_coords', + torch.tensor(lmk_bary_coords, dtype=dtype)) + + self.use_face_contour = use_face_contour + if self.use_face_contour: + dynamic_lmk_faces_idx = data_struct.dynamic_lmk_faces_idx + dynamic_lmk_faces_idx = torch.tensor( + dynamic_lmk_faces_idx, + dtype=torch.long) + self.register_buffer('dynamic_lmk_faces_idx', + dynamic_lmk_faces_idx) + + dynamic_lmk_bary_coords = data_struct.dynamic_lmk_bary_coords + dynamic_lmk_bary_coords = torch.tensor( + dynamic_lmk_bary_coords, dtype=dtype) + self.register_buffer('dynamic_lmk_bary_coords', + dynamic_lmk_bary_coords) + + neck_kin_chain = find_joint_kin_chain(self.NECK_IDX, self.parents) + self.register_buffer( + 'neck_kin_chain', + torch.tensor(neck_kin_chain, dtype=torch.long)) + + if create_jaw_pose: + if jaw_pose is None: + default_jaw_pose = torch.zeros([batch_size, 3], dtype=dtype) + else: + default_jaw_pose = torch.tensor(jaw_pose, dtype=dtype) + jaw_pose_param = nn.Parameter(default_jaw_pose, + requires_grad=True) + self.register_parameter('jaw_pose', jaw_pose_param) + + if create_leye_pose: + if leye_pose is None: + default_leye_pose = torch.zeros([batch_size, 3], dtype=dtype) + else: + default_leye_pose = torch.tensor(leye_pose, dtype=dtype) + leye_pose_param = nn.Parameter(default_leye_pose, + requires_grad=True) + self.register_parameter('leye_pose', leye_pose_param) + + if 
create_reye_pose: + if reye_pose is None: + default_reye_pose = torch.zeros([batch_size, 3], dtype=dtype) + else: + default_reye_pose = torch.tensor(reye_pose, dtype=dtype) + reye_pose_param = nn.Parameter(default_reye_pose, + requires_grad=True) + self.register_parameter('reye_pose', reye_pose_param) + + shapedirs = data_struct.shapedirs + if len(shapedirs.shape) < 3: + shapedirs = shapedirs[:, :, None] + if (shapedirs.shape[-1] < self.SHAPE_SPACE_DIM + + self.EXPRESSION_SPACE_DIM): + print(f'WARNING: You are using a {self.name()} model, with only' + ' 10 shape and 10 expression coefficients.') + expr_start_idx = 10 + expr_end_idx = 20 + num_expression_coeffs = min(num_expression_coeffs, 10) + else: + expr_start_idx = self.SHAPE_SPACE_DIM + expr_end_idx = self.SHAPE_SPACE_DIM + num_expression_coeffs + num_expression_coeffs = min( + num_expression_coeffs, self.EXPRESSION_SPACE_DIM) + + self._num_expression_coeffs = num_expression_coeffs + + expr_dirs = shapedirs[:, :, expr_start_idx:expr_end_idx] + self.register_buffer( + 'expr_dirs', to_tensor(to_np(expr_dirs), dtype=dtype)) + + if create_expression: + if expression is None: + default_expression = torch.zeros( + [batch_size, self.num_expression_coeffs], dtype=dtype) + else: + default_expression = torch.tensor(expression, dtype=dtype) + expression_param = nn.Parameter(default_expression, + requires_grad=True) + self.register_parameter('expression', expression_param) + + def name(self) -> str: + return 'SMPL-X' + + @property + def num_expression_coeffs(self): + return self._num_expression_coeffs + + def create_mean_pose(self, data_struct, flat_hand_mean=False): + # Create the array for the mean pose. If flat_hand is false, then use + # the mean that is given by the data, rather than the flat open hand + global_orient_mean = torch.zeros([3], dtype=self.dtype) + body_pose_mean = torch.zeros([self.NUM_BODY_JOINTS * 3], + dtype=self.dtype) + jaw_pose_mean = torch.zeros([3], dtype=self.dtype) + leye_pose_mean = torch.zeros([3], dtype=self.dtype) + reye_pose_mean = torch.zeros([3], dtype=self.dtype) + + pose_mean = np.concatenate([global_orient_mean, body_pose_mean, + jaw_pose_mean, + leye_pose_mean, reye_pose_mean, + self.left_hand_mean, self.right_hand_mean], + axis=0) + + return pose_mean + + def extra_repr(self): + msg = super(SMPLX, self).extra_repr() + msg = [ + msg, + f'Number of Expression Coefficients: {self.num_expression_coeffs}' + ] + return '\n'.join(msg) + + def forward( + self, + betas: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + body_pose: Optional[Tensor] = None, + left_hand_pose: Optional[Tensor] = None, + right_hand_pose: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + expression: Optional[Tensor] = None, + jaw_pose: Optional[Tensor] = None, + leye_pose: Optional[Tensor] = None, + reye_pose: Optional[Tensor] = None, + return_verts: bool = True, + return_full_pose: bool = False, + pose2rot: bool = True, + **kwargs + ) -> SMPLXOutput: + ''' + Forward pass for the SMPLX model + + Parameters + ---------- + global_orient: torch.tensor, optional, shape Bx3 + If given, ignore the member variable and use it as the global + rotation of the body. Useful if someone wishes to predicts this + with an external model. (default=None) + betas: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `betas` and use it + instead. For example, it can used if shape parameters + `betas` are predicted from some external model. 
+ (default=None) + expression: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `expression` and use it + instead. For example, it can used if expression parameters + `expression` are predicted from some external model. + body_pose: torch.tensor, optional, shape Bx(J*3) + If given, ignore the member variable `body_pose` and use it + instead. For example, it can used if someone predicts the + pose of the body joints are predicted from some external model. + It should be a tensor that contains joint rotations in + axis-angle format. (default=None) + left_hand_pose: torch.tensor, optional, shape BxP + If given, ignore the member variable `left_hand_pose` and + use this instead. It should either contain PCA coefficients or + joint rotations in axis-angle format. + right_hand_pose: torch.tensor, optional, shape BxP + If given, ignore the member variable `right_hand_pose` and + use this instead. It should either contain PCA coefficients or + joint rotations in axis-angle format. + jaw_pose: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `jaw_pose` and + use this instead. It should either joint rotations in + axis-angle format. + transl: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `transl` and use it + instead. For example, it can used if the translation + `transl` is predicted from some external model. + (default=None) + return_verts: bool, optional + Return the vertices. (default=True) + return_full_pose: bool, optional + Returns the full axis-angle pose vector (default=False) + + Returns + ------- + output: ModelOutput + A named tuple of type `ModelOutput` + ''' + + # If no shape and pose parameters are passed along, then use the + # ones from the module + global_orient = (global_orient if global_orient is not None else + self.global_orient) + body_pose = body_pose if body_pose is not None else self.body_pose + betas = betas if betas is not None else self.betas + + left_hand_pose = (left_hand_pose if left_hand_pose is not None else + self.left_hand_pose) + right_hand_pose = (right_hand_pose if right_hand_pose is not None else + self.right_hand_pose) + jaw_pose = jaw_pose if jaw_pose is not None else self.jaw_pose + leye_pose = leye_pose if leye_pose is not None else self.leye_pose + reye_pose = reye_pose if reye_pose is not None else self.reye_pose + expression = expression if expression is not None else self.expression + + apply_trans = transl is not None or hasattr(self, 'transl') + if transl is None: + if hasattr(self, 'transl'): + transl = self.transl + + if self.use_pca: + left_hand_pose = torch.einsum( + 'bi,ij->bj', [left_hand_pose, self.left_hand_components]) + right_hand_pose = torch.einsum( + 'bi,ij->bj', [right_hand_pose, self.right_hand_components]) + + full_pose = torch.cat([global_orient, body_pose, + jaw_pose, leye_pose, reye_pose, + left_hand_pose, + right_hand_pose], dim=1) + + # Add the mean pose of the model. 
Does not affect the body, only the + # hands when flat_hand_mean == False + full_pose += self.pose_mean + + batch_size = max(betas.shape[0], global_orient.shape[0], + body_pose.shape[0]) + # Concatenate the shape and expression coefficients + scale = int(batch_size / betas.shape[0]) + if scale > 1: + betas = betas.expand(scale, -1) + shape_components = torch.cat([betas, expression], dim=-1) + + shapedirs = torch.cat([self.shapedirs, self.expr_dirs], dim=-1) + + vertices, joints = lbs(shape_components, full_pose, self.v_template, + shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, pose2rot=pose2rot, + ) + + lmk_faces_idx = self.lmk_faces_idx.unsqueeze( + dim=0).expand(batch_size, -1).contiguous() + lmk_bary_coords = self.lmk_bary_coords.unsqueeze(dim=0).repeat( + self.batch_size, 1, 1) + if self.use_face_contour: + lmk_idx_and_bcoords = find_dynamic_lmk_idx_and_bcoords( + vertices, full_pose, self.dynamic_lmk_faces_idx, + self.dynamic_lmk_bary_coords, + self.neck_kin_chain, + pose2rot=True, + ) + dyn_lmk_faces_idx, dyn_lmk_bary_coords = lmk_idx_and_bcoords + + lmk_faces_idx = torch.cat([lmk_faces_idx, + dyn_lmk_faces_idx], 1) + lmk_bary_coords = torch.cat( + [lmk_bary_coords.expand(batch_size, -1, -1), + dyn_lmk_bary_coords], 1) + + landmarks = vertices2landmarks(vertices, self.faces_tensor, + lmk_faces_idx, + lmk_bary_coords) + + # Add any extra joints that might be needed + joints = self.vertex_joint_selector(vertices, joints) + # Add the landmarks to the joints + joints = torch.cat([joints, landmarks], dim=1) + # Map the joints to the current dataset + + if self.joint_mapper is not None: + joints = self.joint_mapper(joints=joints, vertices=vertices) + + if apply_trans: + joints += transl.unsqueeze(dim=1) + vertices += transl.unsqueeze(dim=1) + + output = SMPLXOutput(vertices=vertices if return_verts else None, + joints=joints, + betas=betas, + expression=expression, + global_orient=global_orient, + body_pose=body_pose, + left_hand_pose=left_hand_pose, + right_hand_pose=right_hand_pose, + jaw_pose=jaw_pose, + full_pose=full_pose if return_full_pose else None) + return output + + +class SMPLXLayer(SMPLX): + def __init__( + self, + *args, + **kwargs + ) -> None: + # Just create a SMPLX module without any member variables + super(SMPLXLayer, self).__init__( + create_global_orient=False, + create_body_pose=False, + create_left_hand_pose=False, + create_right_hand_pose=False, + create_jaw_pose=False, + create_leye_pose=False, + create_reye_pose=False, + create_betas=False, + create_expression=False, + create_transl=False, + *args, **kwargs, + ) + + def forward( + self, + betas: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + body_pose: Optional[Tensor] = None, + left_hand_pose: Optional[Tensor] = None, + right_hand_pose: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + expression: Optional[Tensor] = None, + jaw_pose: Optional[Tensor] = None, + leye_pose: Optional[Tensor] = None, + reye_pose: Optional[Tensor] = None, + return_verts: bool = True, + return_full_pose: bool = False, + **kwargs + ) -> SMPLXOutput: + ''' + Forward pass for the SMPLX model + + Parameters + ---------- + global_orient: torch.tensor, optional, shape Bx3 + If given, ignore the member variable and use it as the global + rotation of the body. Useful if someone wishes to predicts this + with an external model. (default=None) + betas: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `betas` and use it + instead. 
For example, it can used if shape parameters + `betas` are predicted from some external model. + (default=None) + expression: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `expression` and use it + instead. For example, it can used if expression parameters + `expression` are predicted from some external model. + body_pose: torch.tensor, optional, shape Bx(J*3) + If given, ignore the member variable `body_pose` and use it + instead. For example, it can used if someone predicts the + pose of the body joints are predicted from some external model. + It should be a tensor that contains joint rotations in + axis-angle format. (default=None) + left_hand_pose: torch.tensor, optional, shape BxP + If given, ignore the member variable `left_hand_pose` and + use this instead. It should either contain PCA coefficients or + joint rotations in axis-angle format. + right_hand_pose: torch.tensor, optional, shape BxP + If given, ignore the member variable `right_hand_pose` and + use this instead. It should either contain PCA coefficients or + joint rotations in axis-angle format. + jaw_pose: torch.tensor, optional, shape Bx3x3 + If given, ignore the member variable `jaw_pose` and + use this instead. It should either joint rotations in + axis-angle format. + transl: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `transl` and use it + instead. For example, it can used if the translation + `transl` is predicted from some external model. + (default=None) + return_verts: bool, optional + Return the vertices. (default=True) + return_full_pose: bool, optional + Returns the full pose vector (default=False) + Returns + ------- + output: ModelOutput + A data class that contains the posed vertices and joints + ''' + device, dtype = self.shapedirs.device, self.shapedirs.dtype + + if global_orient is None: + batch_size = 1 + global_orient = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + else: + batch_size = global_orient.shape[0] + if body_pose is None: + body_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand( + batch_size, self.NUM_BODY_JOINTS, -1).contiguous() + if left_hand_pose is None: + left_hand_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, 15, -1).contiguous() + if right_hand_pose is None: + right_hand_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, 15, -1).contiguous() + if jaw_pose is None: + jaw_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + if leye_pose is None: + leye_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + if reye_pose is None: + reye_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + if expression is None: + expression = torch.zeros([batch_size, self.num_expression_coeffs], + dtype=dtype, device=device) + if betas is None: + betas = torch.zeros([batch_size, self.num_betas], + dtype=dtype, device=device) + if transl is None: + transl = torch.zeros([batch_size, 3], dtype=dtype, device=device) + + # Concatenate all pose vectors + full_pose = torch.cat( + [global_orient.reshape(-1, 1, 3), + body_pose.reshape(-1, self.NUM_BODY_JOINTS, 3), + jaw_pose.reshape(-1, 1, 3), + leye_pose.reshape(-1, 1, 3), + reye_pose.reshape(-1, 1, 3), + left_hand_pose.reshape(-1, self.NUM_HAND_JOINTS, 3), + right_hand_pose.reshape(-1, 
self.NUM_HAND_JOINTS, 3)], + dim=1) + shape_components = torch.cat([betas, expression], dim=-1) + + shapedirs = torch.cat([self.shapedirs, self.expr_dirs], dim=-1) + + vertices, joints = lbs(shape_components, full_pose, self.v_template, + shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, pose2rot=True) + + lmk_faces_idx = self.lmk_faces_idx.unsqueeze( + dim=0).expand(batch_size, -1).contiguous() + lmk_bary_coords = self.lmk_bary_coords.unsqueeze(dim=0).repeat( + self.batch_size, 1, 1) + if self.use_face_contour: + lmk_idx_and_bcoords = find_dynamic_lmk_idx_and_bcoords( + vertices, full_pose, + self.dynamic_lmk_faces_idx, + self.dynamic_lmk_bary_coords, + self.neck_kin_chain, + pose2rot=False, + ) + dyn_lmk_faces_idx, dyn_lmk_bary_coords = lmk_idx_and_bcoords + + lmk_faces_idx = torch.cat([lmk_faces_idx, dyn_lmk_faces_idx], 1) + lmk_bary_coords = torch.cat( + [lmk_bary_coords.expand(batch_size, -1, -1), + dyn_lmk_bary_coords], 1) + + landmarks = vertices2landmarks(vertices, self.faces_tensor, + lmk_faces_idx, + lmk_bary_coords) + + # Add any extra joints that might be needed + joints = self.vertex_joint_selector(vertices, joints) + # Add the landmarks to the joints + joints = torch.cat([joints, landmarks], dim=1) + # Map the joints to the current dataset + + if self.joint_mapper is not None: + joints = self.joint_mapper(joints=joints, vertices=vertices) + + if transl is not None: + joints += transl.unsqueeze(dim=1) + vertices += transl.unsqueeze(dim=1) + + output = SMPLXOutput(vertices=vertices if return_verts else None, + joints=joints, + betas=betas, + expression=expression, + global_orient=global_orient, + body_pose=body_pose, + left_hand_pose=left_hand_pose, + right_hand_pose=right_hand_pose, + jaw_pose=jaw_pose, + transl=transl, + full_pose=full_pose if return_full_pose else None) + return output + + +class MANO(SMPL): + # The hand joints are replaced by MANO + NUM_BODY_JOINTS = 1 + NUM_HAND_JOINTS = 15 + NUM_JOINTS = NUM_BODY_JOINTS + NUM_HAND_JOINTS + + def __init__( + self, + model_path: str, + is_rhand: bool = True, + data_struct: Optional[Struct] = None, + create_hand_pose: bool = True, + hand_pose: Optional[Tensor] = None, + use_pca: bool = True, + num_pca_comps: int = 6, + flat_hand_mean: bool = False, + batch_size: int = 1, + dtype=torch.float32, + vertex_ids=None, + use_compressed: bool = True, + ext: str = 'pkl', + **kwargs + ) -> None: + ''' MANO model constructor + + Parameters + ---------- + model_path: str + The path to the folder or to the file where the model + parameters are stored + data_struct: Strct + A struct object. If given, then the parameters of the model are + read from the object. Otherwise, the model tries to read the + parameters from the given `model_path`. (default = None) + create_hand_pose: bool, optional + Flag for creating a member variable for the pose of the right + hand. (default = True) + hand_pose: torch.tensor, optional, BxP + The default value for the right hand pose member variable. + (default = None) + num_pca_comps: int, optional + The number of PCA components to use for each hand. + (default = 6) + flat_hand_mean: bool, optional + If False, then the pose of the hand is initialized to False. 
+ batch_size: int, optional + The batch size used for creating the member variables + dtype: torch.dtype, optional + The data type for the created variables + vertex_ids: dict, optional + A dictionary containing the indices of the extra vertices that + will be selected + ''' + + self.num_pca_comps = num_pca_comps + self.is_rhand = is_rhand + # If no data structure is passed, then load the data from the given + # model folder + if data_struct is None: + # Load the model + if osp.isdir(model_path): + model_fn = 'MANO_{}.{ext}'.format( + 'RIGHT' if is_rhand else 'LEFT', ext=ext) + mano_path = os.path.join(model_path, model_fn) + else: + mano_path = model_path + self.is_rhand = True if 'RIGHT' in os.path.basename( + model_path) else False + assert osp.exists(mano_path), 'Path {} does not exist!'.format( + mano_path) + + if ext == 'pkl': + with open(mano_path, 'rb') as mano_file: + model_data = pickle.load(mano_file, encoding='latin1') + elif ext == 'npz': + model_data = np.load(mano_path, allow_pickle=True) + else: + raise ValueError('Unknown extension: {}'.format(ext)) + data_struct = Struct(**model_data) + + if vertex_ids is None: + vertex_ids = VERTEX_IDS['smplh'] + + super(MANO, self).__init__( + model_path=model_path, data_struct=data_struct, + batch_size=batch_size, vertex_ids=vertex_ids, + use_compressed=use_compressed, dtype=dtype, ext=ext, **kwargs) + + # add only MANO tips to the extra joints + self.vertex_joint_selector.extra_joints_idxs = to_tensor( + list(VERTEX_IDS['mano'].values()), dtype=torch.long) + + self.use_pca = use_pca + self.num_pca_comps = num_pca_comps + if self.num_pca_comps == 45: + self.use_pca = False + self.flat_hand_mean = flat_hand_mean + + hand_components = data_struct.hands_components[:num_pca_comps] + + self.np_hand_components = hand_components + + if self.use_pca: + self.register_buffer( + 'hand_components', + torch.tensor(hand_components, dtype=dtype)) + + if self.flat_hand_mean: + hand_mean = np.zeros_like(data_struct.hands_mean) + else: + hand_mean = data_struct.hands_mean + + self.register_buffer('hand_mean', + to_tensor(hand_mean, dtype=self.dtype)) + + # Create the buffers for the pose of the left hand + hand_pose_dim = num_pca_comps if use_pca else 3 * self.NUM_HAND_JOINTS + if create_hand_pose: + if hand_pose is None: + default_hand_pose = torch.zeros([batch_size, hand_pose_dim], + dtype=dtype) + else: + default_hand_pose = torch.tensor(hand_pose, dtype=dtype) + + hand_pose_param = nn.Parameter(default_hand_pose, + requires_grad=True) + self.register_parameter('hand_pose', + hand_pose_param) + + # Create the buffer for the mean pose. + pose_mean = self.create_mean_pose( + data_struct, flat_hand_mean=flat_hand_mean) + pose_mean_tensor = pose_mean.clone().to(dtype) + # pose_mean_tensor = torch.tensor(pose_mean, dtype=dtype) + self.register_buffer('pose_mean', pose_mean_tensor) + + def name(self) -> str: + return 'MANO' + + def create_mean_pose(self, data_struct, flat_hand_mean=False): + # Create the array for the mean pose. 
If flat_hand is false, then use + # the mean that is given by the data, rather than the flat open hand + global_orient_mean = torch.zeros([3], dtype=self.dtype) + pose_mean = torch.cat([global_orient_mean, self.hand_mean], dim=0) + return pose_mean + + def extra_repr(self): + msg = [super(MANO, self).extra_repr()] + if self.use_pca: + msg.append(f'Number of PCA components: {self.num_pca_comps}') + msg.append(f'Flat hand mean: {self.flat_hand_mean}') + return '\n'.join(msg) + + def forward( + self, + betas: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + hand_pose: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + return_verts: bool = True, + return_full_pose: bool = False, + **kwargs + ) -> MANOOutput: + ''' Forward pass for the MANO model + ''' + # If no shape and pose parameters are passed along, then use the + # ones from the module + global_orient = (global_orient if global_orient is not None else + self.global_orient) + betas = betas if betas is not None else self.betas + hand_pose = (hand_pose if hand_pose is not None else + self.hand_pose) + + apply_trans = transl is not None or hasattr(self, 'transl') + if transl is None: + if hasattr(self, 'transl'): + transl = self.transl + + if self.use_pca: + hand_pose = torch.einsum( + 'bi,ij->bj', [hand_pose, self.hand_components]) + + full_pose = torch.cat([global_orient, hand_pose], dim=1) + full_pose += self.pose_mean + + vertices, joints = lbs(betas, full_pose, self.v_template, + self.shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, pose2rot=True, + ) + + # # Add pre-selected extra joints that might be needed + # joints = self.vertex_joint_selector(vertices, joints) + + if self.joint_mapper is not None: + joints = self.joint_mapper(joints) + + if apply_trans: + joints = joints + transl.unsqueeze(dim=1) + vertices = vertices + transl.unsqueeze(dim=1) + + output = MANOOutput(vertices=vertices if return_verts else None, + joints=joints if return_verts else None, + betas=betas, + global_orient=global_orient, + hand_pose=hand_pose, + full_pose=full_pose if return_full_pose else None) + + return output + + +class MANOLayer(MANO): + def __init__(self, *args, **kwargs) -> None: + ''' MANO as a layer model constructor + ''' + super(MANOLayer, self).__init__( + create_global_orient=False, + create_hand_pose=False, + create_betas=False, + create_transl=False, + *args, **kwargs) + + def name(self) -> str: + return 'MANO' + + def forward( + self, + betas: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + hand_pose: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + return_verts: bool = True, + return_full_pose: bool = False, + **kwargs + ) -> MANOOutput: + ''' Forward pass for the MANO model + ''' + device, dtype = self.shapedirs.device, self.shapedirs.dtype + if global_orient is None: + batch_size = 1 + global_orient = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + else: + batch_size = global_orient.shape[0] + if hand_pose is None: + hand_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, 15, -1).contiguous() + if betas is None: + betas = torch.zeros( + [batch_size, self.num_betas], dtype=dtype, device=device) + if transl is None: + transl = torch.zeros([batch_size, 3], dtype=dtype, device=device) + + full_pose = torch.cat([global_orient, hand_pose], dim=1) + vertices, joints = lbs(betas, full_pose, self.v_template, + self.shapedirs, self.posedirs, + self.J_regressor, 
self.parents, + self.lbs_weights, pose2rot=True) + + if self.joint_mapper is not None: + joints = self.joint_mapper(joints) + + if transl is not None: + joints = joints + transl.unsqueeze(dim=1) + vertices = vertices + transl.unsqueeze(dim=1) + + output = MANOOutput( + vertices=vertices if return_verts else None, + joints=joints if return_verts else None, + betas=betas, + global_orient=global_orient, + hand_pose=hand_pose, + full_pose=full_pose if return_full_pose else None) + + return output + + +class FLAME(SMPL): + NUM_JOINTS = 5 + SHAPE_SPACE_DIM = 300 + EXPRESSION_SPACE_DIM = 100 + NECK_IDX = 0 + + def __init__( + self, + model_path: str, + data_struct=None, + num_expression_coeffs=10, + create_expression: bool = True, + expression: Optional[Tensor] = None, + create_neck_pose: bool = True, + neck_pose: Optional[Tensor] = None, + create_jaw_pose: bool = True, + jaw_pose: Optional[Tensor] = None, + create_leye_pose: bool = True, + leye_pose: Optional[Tensor] = None, + create_reye_pose=True, + reye_pose: Optional[Tensor] = None, + use_face_contour=False, + batch_size: int = 1, + gender: str = 'neutral', + dtype: torch.dtype = torch.float32, + ext='pkl', + **kwargs + ) -> None: + ''' FLAME model constructor + + Parameters + ---------- + model_path: str + The path to the folder or to the file where the model + parameters are stored + num_expression_coeffs: int, optional + Number of expression components to use + (default = 10). + create_expression: bool, optional + Flag for creating a member variable for the expression space + (default = True). + expression: torch.tensor, optional, Bx10 + The default value for the expression member variable. + (default = None) + create_neck_pose: bool, optional + Flag for creating a member variable for the neck pose. + (default = False) + neck_pose: torch.tensor, optional, Bx3 + The default value for the neck pose variable. + (default = None) + create_jaw_pose: bool, optional + Flag for creating a member variable for the jaw pose. + (default = False) + jaw_pose: torch.tensor, optional, Bx3 + The default value for the jaw pose variable. + (default = None) + create_leye_pose: bool, optional + Flag for creating a member variable for the left eye pose. + (default = False) + leye_pose: torch.tensor, optional, Bx10 + The default value for the left eye pose variable. + (default = None) + create_reye_pose: bool, optional + Flag for creating a member variable for the right eye pose. + (default = False) + reye_pose: torch.tensor, optional, Bx10 + The default value for the right eye pose variable. 
+ (default = None) + use_face_contour: bool, optional + Whether to compute the keypoints that form the facial contour + batch_size: int, optional + The batch size used for creating the member variables + gender: str, optional + Which gender to load + dtype: torch.dtype + The data type for the created variables + ''' + model_fn = f'FLAME_{gender.upper()}.{ext}' + flame_path = os.path.join(model_path, model_fn) + assert osp.exists(flame_path), 'Path {} does not exist!'.format( + flame_path) + if ext == 'npz': + file_data = np.load(flame_path, allow_pickle=True) + elif ext == 'pkl': + with open(flame_path, 'rb') as smpl_file: + file_data = pickle.load(smpl_file, encoding='latin1') + else: + raise ValueError('Unknown extension: {}'.format(ext)) + data_struct = Struct(**file_data) + + super(FLAME, self).__init__( + model_path=model_path, + data_struct=data_struct, + dtype=dtype, + batch_size=batch_size, + gender=gender, + ext=ext, + **kwargs) + + self.use_face_contour = use_face_contour + + self.vertex_joint_selector.extra_joints_idxs = to_tensor( + [], dtype=torch.long) + + if create_neck_pose: + if neck_pose is None: + default_neck_pose = torch.zeros([batch_size, 3], dtype=dtype) + else: + default_neck_pose = torch.tensor(neck_pose, dtype=dtype) + neck_pose_param = nn.Parameter( + default_neck_pose, requires_grad=True) + self.register_parameter('neck_pose', neck_pose_param) + + if create_jaw_pose: + if jaw_pose is None: + default_jaw_pose = torch.zeros([batch_size, 3], dtype=dtype) + else: + default_jaw_pose = torch.tensor(jaw_pose, dtype=dtype) + jaw_pose_param = nn.Parameter(default_jaw_pose, + requires_grad=True) + self.register_parameter('jaw_pose', jaw_pose_param) + + if create_leye_pose: + if leye_pose is None: + default_leye_pose = torch.zeros([batch_size, 3], dtype=dtype) + else: + default_leye_pose = torch.tensor(leye_pose, dtype=dtype) + leye_pose_param = nn.Parameter(default_leye_pose, + requires_grad=True) + self.register_parameter('leye_pose', leye_pose_param) + + if create_reye_pose: + if reye_pose is None: + default_reye_pose = torch.zeros([batch_size, 3], dtype=dtype) + else: + default_reye_pose = torch.tensor(reye_pose, dtype=dtype) + reye_pose_param = nn.Parameter(default_reye_pose, + requires_grad=True) + self.register_parameter('reye_pose', reye_pose_param) + + shapedirs = data_struct.shapedirs + if len(shapedirs.shape) < 3: + shapedirs = shapedirs[:, :, None] + if (shapedirs.shape[-1] < self.SHAPE_SPACE_DIM + + self.EXPRESSION_SPACE_DIM): + print(f'WARNING: You are using a {self.name()} model, with only' + ' 10 shape and 10 expression coefficients.') + expr_start_idx = 10 + expr_end_idx = 20 + num_expression_coeffs = min(num_expression_coeffs, 10) + else: + expr_start_idx = self.SHAPE_SPACE_DIM + expr_end_idx = self.SHAPE_SPACE_DIM + num_expression_coeffs + num_expression_coeffs = min( + num_expression_coeffs, self.EXPRESSION_SPACE_DIM) + + self._num_expression_coeffs = num_expression_coeffs + + expr_dirs = shapedirs[:, :, expr_start_idx:expr_end_idx] + self.register_buffer( + 'expr_dirs', to_tensor(to_np(expr_dirs), dtype=dtype)) + + if create_expression: + if expression is None: + default_expression = torch.zeros( + [batch_size, self.num_expression_coeffs], dtype=dtype) + else: + default_expression = torch.tensor(expression, dtype=dtype) + expression_param = nn.Parameter(default_expression, + requires_grad=True) + self.register_parameter('expression', expression_param) + + # The pickle file that contains the barycentric coordinates for + # regressing the landmarks + 
landmark_bcoord_filename = osp.join( + model_path, 'flame_static_embedding.pkl') + + with open(landmark_bcoord_filename, 'rb') as fp: + landmarks_data = pickle.load(fp, encoding='latin1') + + lmk_faces_idx = landmarks_data['lmk_face_idx'].astype(np.int64) + self.register_buffer('lmk_faces_idx', + torch.tensor(lmk_faces_idx, dtype=torch.long)) + lmk_bary_coords = landmarks_data['lmk_b_coords'] + self.register_buffer('lmk_bary_coords', + torch.tensor(lmk_bary_coords, dtype=dtype)) + if self.use_face_contour: + face_contour_path = os.path.join( + model_path, 'flame_dynamic_embedding.npy') + contour_embeddings = np.load(face_contour_path, + allow_pickle=True, + encoding='latin1')[()] + + dynamic_lmk_faces_idx = np.array( + contour_embeddings['lmk_face_idx'], dtype=np.int64) + dynamic_lmk_faces_idx = torch.tensor( + dynamic_lmk_faces_idx, + dtype=torch.long) + self.register_buffer('dynamic_lmk_faces_idx', + dynamic_lmk_faces_idx) + + dynamic_lmk_b_coords = torch.tensor( + contour_embeddings['lmk_b_coords'], dtype=dtype) + self.register_buffer( + 'dynamic_lmk_bary_coords', dynamic_lmk_b_coords) + + neck_kin_chain = find_joint_kin_chain(self.NECK_IDX, self.parents) + self.register_buffer( + 'neck_kin_chain', + torch.tensor(neck_kin_chain, dtype=torch.long)) + + @property + def num_expression_coeffs(self): + return self._num_expression_coeffs + + def name(self) -> str: + return 'FLAME' + + def extra_repr(self): + msg = [ + super(FLAME, self).extra_repr(), + f'Number of Expression Coefficients: {self.num_expression_coeffs}', + f'Use face contour: {self.use_face_contour}', + ] + return '\n'.join(msg) + + def forward( + self, + betas: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + neck_pose: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + expression: Optional[Tensor] = None, + jaw_pose: Optional[Tensor] = None, + leye_pose: Optional[Tensor] = None, + reye_pose: Optional[Tensor] = None, + return_verts: bool = True, + return_full_pose: bool = False, + pose2rot: bool = True, + **kwargs + ) -> FLAMEOutput: + ''' + Forward pass for the SMPLX model + + Parameters + ---------- + global_orient: torch.tensor, optional, shape Bx3 + If given, ignore the member variable and use it as the global + rotation of the body. Useful if someone wishes to predicts this + with an external model. (default=None) + betas: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `betas` and use it + instead. For example, it can used if shape parameters + `betas` are predicted from some external model. + (default=None) + expression: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `expression` and use it + instead. For example, it can used if expression parameters + `expression` are predicted from some external model. + jaw_pose: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `jaw_pose` and + use this instead. It should either joint rotations in + axis-angle format. + jaw_pose: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `jaw_pose` and + use this instead. It should either joint rotations in + axis-angle format. + transl: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `transl` and use it + instead. For example, it can used if the translation + `transl` is predicted from some external model. + (default=None) + return_verts: bool, optional + Return the vertices. 
(default=True) + return_full_pose: bool, optional + Returns the full axis-angle pose vector (default=False) + + Returns + ------- + output: ModelOutput + A named tuple of type `ModelOutput` + ''' + + # If no shape and pose parameters are passed along, then use the + # ones from the module + global_orient = (global_orient if global_orient is not None else + self.global_orient) + jaw_pose = jaw_pose if jaw_pose is not None else self.jaw_pose + neck_pose = neck_pose if neck_pose is not None else self.neck_pose + + leye_pose = leye_pose if leye_pose is not None else self.leye_pose + reye_pose = reye_pose if reye_pose is not None else self.reye_pose + + betas = betas if betas is not None else self.betas + expression = expression if expression is not None else self.expression + + apply_trans = transl is not None or hasattr(self, 'transl') + if transl is None: + if hasattr(self, 'transl'): + transl = self.transl + + full_pose = torch.cat( + [global_orient, neck_pose, jaw_pose, leye_pose, reye_pose], dim=1) + + batch_size = max(betas.shape[0], global_orient.shape[0], + jaw_pose.shape[0]) + # Concatenate the shape and expression coefficients + scale = int(batch_size / betas.shape[0]) + if scale > 1: + betas = betas.expand(scale, -1) + shape_components = torch.cat([betas, expression], dim=-1) + shapedirs = torch.cat([self.shapedirs, self.expr_dirs], dim=-1) + + vertices, joints = lbs(shape_components, full_pose, self.v_template, + shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, pose2rot=pose2rot, + ) + + lmk_faces_idx = self.lmk_faces_idx.unsqueeze( + dim=0).expand(batch_size, -1).contiguous() + lmk_bary_coords = self.lmk_bary_coords.unsqueeze(dim=0).repeat( + self.batch_size, 1, 1) + if self.use_face_contour: + lmk_idx_and_bcoords = find_dynamic_lmk_idx_and_bcoords( + vertices, full_pose, self.dynamic_lmk_faces_idx, + self.dynamic_lmk_bary_coords, + self.neck_kin_chain, + pose2rot=True, + ) + dyn_lmk_faces_idx, dyn_lmk_bary_coords = lmk_idx_and_bcoords + lmk_faces_idx = torch.cat([lmk_faces_idx, + dyn_lmk_faces_idx], 1) + lmk_bary_coords = torch.cat( + [lmk_bary_coords.expand(batch_size, -1, -1), + dyn_lmk_bary_coords], 1) + + landmarks = vertices2landmarks(vertices, self.faces_tensor, + lmk_faces_idx, + lmk_bary_coords) + + # Add any extra joints that might be needed + joints = self.vertex_joint_selector(vertices, joints) + # Add the landmarks to the joints + joints = torch.cat([joints, landmarks], dim=1) + + # Map the joints to the current dataset + if self.joint_mapper is not None: + joints = self.joint_mapper(joints=joints, vertices=vertices) + + if apply_trans: + joints += transl.unsqueeze(dim=1) + vertices += transl.unsqueeze(dim=1) + + output = FLAMEOutput(vertices=vertices if return_verts else None, + joints=joints, + betas=betas, + expression=expression, + global_orient=global_orient, + neck_pose=neck_pose, + jaw_pose=jaw_pose, + full_pose=full_pose if return_full_pose else None) + return output + + +class FLAMELayer(FLAME): + def __init__(self, *args, **kwargs) -> None: + ''' FLAME as a layer model constructor ''' + super(FLAMELayer, self).__init__( + create_betas=False, + create_expression=False, + create_global_orient=False, + create_neck_pose=False, + create_jaw_pose=False, + create_leye_pose=False, + create_reye_pose=False, + *args, + **kwargs) + + def forward( + self, + betas: Optional[Tensor] = None, + global_orient: Optional[Tensor] = None, + neck_pose: Optional[Tensor] = None, + transl: Optional[Tensor] = None, + expression: Optional[Tensor] = None, 
+ jaw_pose: Optional[Tensor] = None, + leye_pose: Optional[Tensor] = None, + reye_pose: Optional[Tensor] = None, + return_verts: bool = True, + return_full_pose: bool = False, + pose2rot: bool = True, + **kwargs + ) -> FLAMEOutput: + ''' + Forward pass for the SMPLX model + + Parameters + ---------- + global_orient: torch.tensor, optional, shape Bx3 + If given, ignore the member variable and use it as the global + rotation of the body. Useful if someone wishes to predicts this + with an external model. (default=None) + betas: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `betas` and use it + instead. For example, it can used if shape parameters + `betas` are predicted from some external model. + (default=None) + expression: torch.tensor, optional, shape Bx10 + If given, ignore the member variable `expression` and use it + instead. For example, it can used if expression parameters + `expression` are predicted from some external model. + jaw_pose: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `jaw_pose` and + use this instead. It should either joint rotations in + axis-angle format. + jaw_pose: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `jaw_pose` and + use this instead. It should either joint rotations in + axis-angle format. + transl: torch.tensor, optional, shape Bx3 + If given, ignore the member variable `transl` and use it + instead. For example, it can used if the translation + `transl` is predicted from some external model. + (default=None) + return_verts: bool, optional + Return the vertices. (default=True) + return_full_pose: bool, optional + Returns the full axis-angle pose vector (default=False) + + Returns + ------- + output: ModelOutput + A named tuple of type `ModelOutput` + ''' + device, dtype = self.shapedirs.device, self.shapedirs.dtype + if global_orient is None: + batch_size = 1 + global_orient = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + else: + batch_size = global_orient.shape[0] + if neck_pose is None: + neck_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, 1, -1).contiguous() + if jaw_pose is None: + jaw_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + if leye_pose is None: + leye_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + if reye_pose is None: + reye_pose = torch.zeros(3, device=device, dtype=dtype).view( + 1, 1, 3).expand(batch_size, -1, -1).contiguous() + if betas is None: + betas = torch.zeros([batch_size, self.num_betas], + dtype=dtype, device=device) + if expression is None: + expression = torch.zeros([batch_size, self.num_expression_coeffs], + dtype=dtype, device=device) + if transl is None: + transl = torch.zeros([batch_size, 3], dtype=dtype, device=device) + + full_pose = torch.cat( + [global_orient, neck_pose, jaw_pose, leye_pose, reye_pose], dim=1) + + shape_components = torch.cat([betas, expression], dim=-1) + shapedirs = torch.cat([self.shapedirs, self.expr_dirs], dim=-1) + + vertices, joints = lbs(shape_components, full_pose, self.v_template, + shapedirs, self.posedirs, + self.J_regressor, self.parents, + self.lbs_weights, pose2rot=True, + ) + + lmk_faces_idx = self.lmk_faces_idx.unsqueeze( + dim=0).expand(batch_size, -1).contiguous() + lmk_bary_coords = self.lmk_bary_coords.unsqueeze(dim=0).repeat( + self.batch_size, 1, 1) + if self.use_face_contour: + 
lmk_idx_and_bcoords = find_dynamic_lmk_idx_and_bcoords( + vertices, full_pose, self.dynamic_lmk_faces_idx, + self.dynamic_lmk_bary_coords, + self.neck_kin_chain, + pose2rot=False, + ) + dyn_lmk_faces_idx, dyn_lmk_bary_coords = lmk_idx_and_bcoords + lmk_faces_idx = torch.cat([lmk_faces_idx, + dyn_lmk_faces_idx], 1) + lmk_bary_coords = torch.cat( + [lmk_bary_coords.expand(batch_size, -1, -1), + dyn_lmk_bary_coords], 1) + + landmarks = vertices2landmarks(vertices, self.faces_tensor, + lmk_faces_idx, + lmk_bary_coords) + + # Add any extra joints that might be needed + joints = self.vertex_joint_selector(vertices, joints) + # Add the landmarks to the joints + joints = torch.cat([joints, landmarks], dim=1) + + # Map the joints to the current dataset + if self.joint_mapper is not None: + joints = self.joint_mapper(joints=joints, vertices=vertices) + + joints += transl.unsqueeze(dim=1) + vertices += transl.unsqueeze(dim=1) + + output = FLAMEOutput(vertices=vertices if return_verts else None, + joints=joints, + betas=betas, + expression=expression, + global_orient=global_orient, + neck_pose=neck_pose, + jaw_pose=jaw_pose, + full_pose=full_pose if return_full_pose else None) + return output + + +def build_layer( + model_path: str, + model_type: str = 'smpl', + **kwargs +) -> Union[SMPLLayer, SMPLHLayer, SMPLXLayer, MANOLayer, FLAMELayer]: + ''' Method for creating a model from a path and a model type + + Parameters + ---------- + model_path: str + Either the path to the model you wish to load or a folder, + where each subfolder contains the differents types, i.e.: + model_path: + | + |-- smpl + |-- SMPL_FEMALE + |-- SMPL_NEUTRAL + |-- SMPL_MALE + |-- smplh + |-- SMPLH_FEMALE + |-- SMPLH_MALE + |-- smplx + |-- SMPLX_FEMALE + |-- SMPLX_NEUTRAL + |-- SMPLX_MALE + |-- mano + |-- MANO RIGHT + |-- MANO LEFT + |-- flame + |-- FLAME_FEMALE + |-- FLAME_MALE + |-- FLAME_NEUTRAL + + model_type: str, optional + When model_path is a folder, then this parameter specifies the + type of model to be loaded + **kwargs: dict + Keyword arguments + + Returns + ------- + body_model: nn.Module + The PyTorch module that implements the corresponding body model + Raises + ------ + ValueError: In case the model type is not one of SMPL, SMPLH, + SMPLX, MANO or FLAME + ''' + + if osp.isdir(model_path): + model_path = os.path.join(model_path, model_type) + else: + model_type = osp.basename(model_path).split('_')[0].lower() + + if model_type.lower() == 'smpl': + return SMPLLayer(model_path, **kwargs) + elif model_type.lower() == 'smplh': + return SMPLHLayer(model_path, **kwargs) + elif model_type.lower() == 'smplx': + return SMPLXLayer(model_path, **kwargs) + elif 'mano' in model_type.lower(): + return MANOLayer(model_path, **kwargs) + elif 'flame' in model_type.lower(): + return FLAMELayer(model_path, **kwargs) + else: + raise ValueError(f'Unknown model type {model_type}, exiting!') + + +def create( + model_path: str, + model_type: str = 'smpl', + **kwargs +) -> Union[SMPL, SMPLH, SMPLX, MANO, FLAME]: + ''' Method for creating a model from a path and a model type + + Parameters + ---------- + model_path: str + Either the path to the model you wish to load or a folder, + where each subfolder contains the differents types, i.e.: + model_path: + | + |-- smpl + |-- SMPL_FEMALE + |-- SMPL_NEUTRAL + |-- SMPL_MALE + |-- smplh + |-- SMPLH_FEMALE + |-- SMPLH_MALE + |-- smplx + |-- SMPLX_FEMALE + |-- SMPLX_NEUTRAL + |-- SMPLX_MALE + |-- mano + |-- MANO RIGHT + |-- MANO LEFT + + model_type: str, optional + When model_path is a folder, then 
this parameter specifies the + type of model to be loaded + **kwargs: dict + Keyword arguments + + Returns + ------- + body_model: nn.Module + The PyTorch module that implements the corresponding body model + Raises + ------ + ValueError: In case the model type is not one of SMPL, SMPLH, + SMPLX, MANO or FLAME + ''' + + # If it's a folder, assume + if osp.isdir(model_path): + model_path = os.path.join(model_path, model_type) + else: + model_type = osp.basename(model_path).split('_')[0].lower() + + if model_type.lower() == 'smpl': + return SMPL(model_path, **kwargs) + elif model_type.lower() == 'smplh': + return SMPLH(model_path, **kwargs) + elif model_type.lower() == 'smplx': + return SMPLX(model_path, **kwargs) + elif 'mano' in model_type.lower(): + return MANO(model_path, **kwargs) + elif 'flame' in model_type.lower(): + return FLAME(model_path, **kwargs) + else: + raise ValueError(f'Unknown model type {model_type}, exiting!') diff --git a/common/utils/smplx/smplx/joint_names.py b/common/utils/smplx/smplx/joint_names.py new file mode 100644 index 0000000000000000000000000000000000000000..0a3a10f8cef8b50075dc9f680459fc5d596a0013 --- /dev/null +++ b/common/utils/smplx/smplx/joint_names.py @@ -0,0 +1,163 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
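> A minimal usage sketch for the `create` factory defined above. It assumes the vendored package under `common/utils/smplx/smplx` is importable as `smplx` and that SMPL-X model files have already been downloaded into a local `models/smplx/` folder; the path and keyword choices are illustrative, not part of the source.

```python
import torch
import smplx  # assumes the vendored package is on PYTHONPATH as `smplx`

# Build a neutral SMPL-X model; `model_path` must point at a folder that
# contains an `smplx/` subfolder with SMPLX_NEUTRAL.{npz,pkl} (assumed path).
model = smplx.create(model_path='models', model_type='smplx',
                     gender='neutral', use_face_contour=False)

# With the default `create_*` flags the module owns zero-initialised pose,
# shape and expression parameters, so a forward pass needs no explicit inputs.
output = model(return_verts=True)
print(output.vertices.shape)  # (1, num_vertices, 3)
print(output.joints.shape)    # (1, num_joints, 3)
```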
+# +# Contact: ps-license@tuebingen.mpg.de + +JOINT_NAMES = [ + 'pelvis', + 'left_hip', + 'right_hip', + 'spine1', + 'left_knee', + 'right_knee', + 'spine2', + 'left_ankle', + 'right_ankle', + 'spine3', + 'left_foot', + 'right_foot', + 'neck', + 'left_collar', + 'right_collar', + 'head', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'jaw', + 'left_eye_smplhf', + 'right_eye_smplhf', + 'left_index1', + 'left_index2', + 'left_index3', + 'left_middle1', + 'left_middle2', + 'left_middle3', + 'left_pinky1', + 'left_pinky2', + 'left_pinky3', + 'left_ring1', + 'left_ring2', + 'left_ring3', + 'left_thumb1', + 'left_thumb2', + 'left_thumb3', + 'right_index1', + 'right_index2', + 'right_index3', + 'right_middle1', + 'right_middle2', + 'right_middle3', + 'right_pinky1', + 'right_pinky2', + 'right_pinky3', + 'right_ring1', + 'right_ring2', + 'right_ring3', + 'right_thumb1', + 'right_thumb2', + 'right_thumb3', + 'nose', + 'right_eye', + 'left_eye', + 'right_ear', + 'left_ear', + 'left_big_toe', + 'left_small_toe', + 'left_heel', + 'right_big_toe', + 'right_small_toe', + 'right_heel', + 'left_thumb', + 'left_index', + 'left_middle', + 'left_ring', + 'left_pinky', + 'right_thumb', + 'right_index', + 'right_middle', + 'right_ring', + 'right_pinky', + 'right_eye_brow1', + 'right_eye_brow2', + 'right_eye_brow3', + 'right_eye_brow4', + 'right_eye_brow5', + 'left_eye_brow5', + 'left_eye_brow4', + 'left_eye_brow3', + 'left_eye_brow2', + 'left_eye_brow1', + 'nose1', + 'nose2', + 'nose3', + 'nose4', + 'right_nose_2', + 'right_nose_1', + 'nose_middle', + 'left_nose_1', + 'left_nose_2', + 'right_eye1', + 'right_eye2', + 'right_eye3', + 'right_eye4', + 'right_eye5', + 'right_eye6', + 'left_eye4', + 'left_eye3', + 'left_eye2', + 'left_eye1', + 'left_eye6', + 'left_eye5', + 'right_mouth_1', + 'right_mouth_2', + 'right_mouth_3', + 'mouth_top', + 'left_mouth_3', + 'left_mouth_2', + 'left_mouth_1', + 'left_mouth_5', # 59 in OpenPose output + 'left_mouth_4', # 58 in OpenPose output + 'mouth_bottom', + 'right_mouth_4', + 'right_mouth_5', + 'right_lip_1', + 'right_lip_2', + 'lip_top', + 'left_lip_2', + 'left_lip_1', + 'left_lip_3', + 'lip_bottom', + 'right_lip_3', + # Face contour + 'right_contour_1', + 'right_contour_2', + 'right_contour_3', + 'right_contour_4', + 'right_contour_5', + 'right_contour_6', + 'right_contour_7', + 'right_contour_8', + 'contour_middle', + 'left_contour_8', + 'left_contour_7', + 'left_contour_6', + 'left_contour_5', + 'left_contour_4', + 'left_contour_3', + 'left_contour_2', + 'left_contour_1', +] diff --git a/common/utils/smplx/smplx/lbs.py b/common/utils/smplx/smplx/lbs.py new file mode 100644 index 0000000000000000000000000000000000000000..e4d8bb266dac88dda10a2eeb536076533604ff52 --- /dev/null +++ b/common/utils/smplx/smplx/lbs.py @@ -0,0 +1,404 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
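> `JOINT_NAMES` above fixes the row order of the joints returned by the SMPL-X forward pass (body and hand joints first, then face keypoints, feet, fingertips, facial landmarks and, optionally, the face contour). A small sketch of the lookup this enables, assuming the default output order with no `joint_mapper` applied:

```python
from smplx.joint_names import JOINT_NAMES  # assumed import path

# Name -> row index into the BxJx3 `joints` tensor of a model output.
JOINT_IDX = {name: idx for idx, name in enumerate(JOINT_NAMES)}

def select_joint(joints, name):
    """Pick a single named joint (Bx3) out of a BxJx3 joints tensor."""
    return joints[:, JOINT_IDX[name], :]

# e.g. select_joint(output.joints, 'left_wrist') on the output of a forward pass.
```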
+# +# Contact: ps-license@tuebingen.mpg.de + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +from typing import Tuple, List +import numpy as np + +import torch +import torch.nn.functional as F + +from .utils import rot_mat_to_euler, Tensor + + +def find_dynamic_lmk_idx_and_bcoords( + vertices: Tensor, + pose: Tensor, + dynamic_lmk_faces_idx: Tensor, + dynamic_lmk_b_coords: Tensor, + neck_kin_chain: List[int], + pose2rot: bool = True, +) -> Tuple[Tensor, Tensor]: + ''' Compute the faces, barycentric coordinates for the dynamic landmarks + + + To do so, we first compute the rotation of the neck around the y-axis + and then use a pre-computed look-up table to find the faces and the + barycentric coordinates that will be used. + + Special thanks to Soubhik Sanyal (soubhik.sanyal@tuebingen.mpg.de) + for providing the original TensorFlow implementation and for the LUT. + + Parameters + ---------- + vertices: torch.tensor BxVx3, dtype = torch.float32 + The tensor of input vertices + pose: torch.tensor Bx(Jx3), dtype = torch.float32 + The current pose of the body model + dynamic_lmk_faces_idx: torch.tensor L, dtype = torch.long + The look-up table from neck rotation to faces + dynamic_lmk_b_coords: torch.tensor Lx3, dtype = torch.float32 + The look-up table from neck rotation to barycentric coordinates + neck_kin_chain: list + A python list that contains the indices of the joints that form the + kinematic chain of the neck. + dtype: torch.dtype, optional + + Returns + ------- + dyn_lmk_faces_idx: torch.tensor, dtype = torch.long + A tensor of size BxL that contains the indices of the faces that + will be used to compute the current dynamic landmarks. + dyn_lmk_b_coords: torch.tensor, dtype = torch.float32 + A tensor of size BxL that contains the indices of the faces that + will be used to compute the current dynamic landmarks. + ''' + + dtype = vertices.dtype + batch_size = vertices.shape[0] + + if pose2rot: + aa_pose = torch.index_select(pose.view(batch_size, -1, 3), 1, + neck_kin_chain) + rot_mats = batch_rodrigues( + aa_pose.view(-1, 3)).view(batch_size, -1, 3, 3) + else: + rot_mats = torch.index_select( + pose.view(batch_size, -1, 3, 3), 1, neck_kin_chain) + + rel_rot_mat = torch.eye( + 3, device=vertices.device, dtype=dtype).unsqueeze_(dim=0).repeat( + batch_size, 1, 1) + for idx in range(len(neck_kin_chain)): + rel_rot_mat = torch.bmm(rot_mats[:, idx], rel_rot_mat) + + y_rot_angle = torch.round( + torch.clamp(-rot_mat_to_euler(rel_rot_mat) * 180.0 / np.pi, + max=39)).to(dtype=torch.long) + neg_mask = y_rot_angle.lt(0).to(dtype=torch.long) + mask = y_rot_angle.lt(-39).to(dtype=torch.long) + neg_vals = mask * 78 + (1 - mask) * (39 - y_rot_angle) + y_rot_angle = (neg_mask * neg_vals + + (1 - neg_mask) * y_rot_angle) + + dyn_lmk_faces_idx = torch.index_select(dynamic_lmk_faces_idx, + 0, y_rot_angle) + dyn_lmk_b_coords = torch.index_select(dynamic_lmk_b_coords, + 0, y_rot_angle) + + return dyn_lmk_faces_idx, dyn_lmk_b_coords + + +def vertices2landmarks( + vertices: Tensor, + faces: Tensor, + lmk_faces_idx: Tensor, + lmk_bary_coords: Tensor +) -> Tensor: + ''' Calculates landmarks by barycentric interpolation + + Parameters + ---------- + vertices: torch.tensor BxVx3, dtype = torch.float32 + The tensor of input vertices + faces: torch.tensor Fx3, dtype = torch.long + The faces of the mesh + lmk_faces_idx: torch.tensor L, dtype = torch.long + The tensor with the indices of the faces used to calculate the + landmarks. 
+ lmk_bary_coords: torch.tensor Lx3, dtype = torch.float32 + The tensor of barycentric coordinates that are used to interpolate + the landmarks + + Returns + ------- + landmarks: torch.tensor BxLx3, dtype = torch.float32 + The coordinates of the landmarks for each mesh in the batch + ''' + # Extract the indices of the vertices for each face + # BxLx3 + batch_size, num_verts = vertices.shape[:2] + device = vertices.device + + lmk_faces = torch.index_select(faces, 0, lmk_faces_idx.view(-1)).view( + batch_size, -1, 3) + + lmk_faces += torch.arange( + batch_size, dtype=torch.long, device=device).view(-1, 1, 1) * num_verts + + lmk_vertices = vertices.view(-1, 3)[lmk_faces].view( + batch_size, -1, 3, 3) + + landmarks = torch.einsum('blfi,blf->bli', [lmk_vertices, lmk_bary_coords]) + return landmarks + + +def lbs( + betas: Tensor, + pose: Tensor, + v_template: Tensor, + shapedirs: Tensor, + posedirs: Tensor, + J_regressor: Tensor, + parents: Tensor, + lbs_weights: Tensor, + pose2rot: bool = True, +) -> Tuple[Tensor, Tensor]: + ''' Performs Linear Blend Skinning with the given shape and pose parameters + + Parameters + ---------- + betas : torch.tensor BxNB + The tensor of shape parameters + pose : torch.tensor Bx(J + 1) * 3 + The pose parameters in axis-angle format + v_template torch.tensor BxVx3 + The template mesh that will be deformed + shapedirs : torch.tensor 1xNB + The tensor of PCA shape displacements + posedirs : torch.tensor Px(V * 3) + The pose PCA coefficients + J_regressor : torch.tensor JxV + The regressor array that is used to calculate the joints from + the position of the vertices + parents: torch.tensor J + The array that describes the kinematic tree for the model + lbs_weights: torch.tensor N x V x (J + 1) + The linear blend skinning weights that represent how much the + rotation matrix of each part affects each vertex + pose2rot: bool, optional + Flag on whether to convert the input pose tensor to rotation + matrices. The default value is True. If False, then the pose tensor + should already contain rotation matrices and have a size of + Bx(J + 1)x9 + dtype: torch.dtype, optional + + Returns + ------- + verts: torch.tensor BxVx3 + The vertices of the mesh after applying the shape and pose + displacements. + joints: torch.tensor BxJx3 + The joints of the model + ''' + + batch_size = max(betas.shape[0], pose.shape[0]) + device, dtype = betas.device, betas.dtype + + # Add shape contribution + v_shaped = v_template + blend_shapes(betas, shapedirs) + + # Get the joints + # NxJx3 array + J = vertices2joints(J_regressor, v_shaped) + + # 3. Add pose blend shapes + # N x J x 3 x 3 + ident = torch.eye(3, dtype=dtype, device=device) + if pose2rot: + rot_mats = batch_rodrigues(pose.view(-1, 3)).view( + [batch_size, -1, 3, 3]) + + pose_feature = (rot_mats[:, 1:, :, :] - ident).view([batch_size, -1]) + # (N x P) x (P, V * 3) -> N x V x 3 + pose_offsets = torch.matmul( + pose_feature, posedirs).view(batch_size, -1, 3) + else: + pose_feature = pose[:, 1:].view(batch_size, -1, 3, 3) - ident + rot_mats = pose.view(batch_size, -1, 3, 3) + + pose_offsets = torch.matmul(pose_feature.view(batch_size, -1), + posedirs).view(batch_size, -1, 3) + + v_posed = pose_offsets + v_shaped + # 4. Get the global joint location + J_transformed, A = batch_rigid_transform(rot_mats, J, parents, dtype=dtype) + + # 5. 
Do skinning: + # W is N x V x (J + 1) + W = lbs_weights.unsqueeze(dim=0).expand([batch_size, -1, -1]) + # (N x V x (J + 1)) x (N x (J + 1) x 16) + num_joints = J_regressor.shape[0] + T = torch.matmul(W, A.view(batch_size, num_joints, 16)) \ + .view(batch_size, -1, 4, 4) + + homogen_coord = torch.ones([batch_size, v_posed.shape[1], 1], + dtype=dtype, device=device) + v_posed_homo = torch.cat([v_posed, homogen_coord], dim=2) + v_homo = torch.matmul(T, torch.unsqueeze(v_posed_homo, dim=-1)) + + verts = v_homo[:, :, :3, 0] + + return verts, J_transformed + + +def vertices2joints(J_regressor: Tensor, vertices: Tensor) -> Tensor: + ''' Calculates the 3D joint locations from the vertices + + Parameters + ---------- + J_regressor : torch.tensor JxV + The regressor array that is used to calculate the joints from the + position of the vertices + vertices : torch.tensor BxVx3 + The tensor of mesh vertices + + Returns + ------- + torch.tensor BxJx3 + The location of the joints + ''' + + return torch.einsum('bik,ji->bjk', [vertices, J_regressor]) + + +def blend_shapes(betas: Tensor, shape_disps: Tensor) -> Tensor: + ''' Calculates the per vertex displacement due to the blend shapes + + + Parameters + ---------- + betas : torch.tensor Bx(num_betas) + Blend shape coefficients + shape_disps: torch.tensor Vx3x(num_betas) + Blend shapes + + Returns + ------- + torch.tensor BxVx3 + The per-vertex displacement due to shape deformation + ''' + + # Displacement[b, m, k] = sum_{l} betas[b, l] * shape_disps[m, k, l] + # i.e. Multiply each shape displacement by its corresponding beta and + # then sum them. + blend_shape = torch.einsum('bl,mkl->bmk', [betas, shape_disps]) + return blend_shape + + +def batch_rodrigues( + rot_vecs: Tensor, + epsilon: float = 1e-8, +) -> Tensor: + ''' Calculates the rotation matrices for a batch of rotation vectors + Parameters + ---------- + rot_vecs: torch.tensor Nx3 + array of N axis-angle vectors + Returns + ------- + R: torch.tensor Nx3x3 + The rotation matrices for the given axis-angle parameters + ''' + + batch_size = rot_vecs.shape[0] + device, dtype = rot_vecs.device, rot_vecs.dtype + + angle = torch.norm(rot_vecs + 1e-8, dim=1, keepdim=True) + rot_dir = rot_vecs / angle + + cos = torch.unsqueeze(torch.cos(angle), dim=1) + sin = torch.unsqueeze(torch.sin(angle), dim=1) + + # Bx1 arrays + rx, ry, rz = torch.split(rot_dir, 1, dim=1) + K = torch.zeros((batch_size, 3, 3), dtype=dtype, device=device) + + zeros = torch.zeros((batch_size, 1), dtype=dtype, device=device) + K = torch.cat([zeros, -rz, ry, rz, zeros, -rx, -ry, rx, zeros], dim=1) \ + .view((batch_size, 3, 3)) + + ident = torch.eye(3, dtype=dtype, device=device).unsqueeze(dim=0) + rot_mat = ident + sin * K + (1 - cos) * torch.bmm(K, K) + return rot_mat + + +def transform_mat(R: Tensor, t: Tensor) -> Tensor: + ''' Creates a batch of transformation matrices + Args: + - R: Bx3x3 array of a batch of rotation matrices + - t: Bx3x1 array of a batch of translation vectors + Returns: + - T: Bx4x4 Transformation matrix + ''' + # No padding left or right, only add an extra row + return torch.cat([F.pad(R, [0, 0, 0, 1]), + F.pad(t, [0, 0, 0, 1], value=1)], dim=2) + + +def batch_rigid_transform( + rot_mats: Tensor, + joints: Tensor, + parents: Tensor, + dtype=torch.float32 +) -> Tensor: + """ + Applies a batch of rigid transformations to the joints + + Parameters + ---------- + rot_mats : torch.tensor BxNx3x3 + Tensor of rotation matrices + joints : torch.tensor BxNx3 + Locations of joints + parents : torch.tensor BxN + The kinematic 
tree of each object + dtype : torch.dtype, optional: + The data type of the created tensors, the default is torch.float32 + + Returns + ------- + posed_joints : torch.tensor BxNx3 + The locations of the joints after applying the pose rotations + rel_transforms : torch.tensor BxNx4x4 + The relative (with respect to the root joint) rigid transformations + for all the joints + """ + + joints = torch.unsqueeze(joints, dim=-1) + + rel_joints = joints.clone() + rel_joints[:, 1:] -= joints[:, parents[1:]] + + transforms_mat = transform_mat( + rot_mats.reshape(-1, 3, 3), + rel_joints.reshape(-1, 3, 1)).reshape(-1, joints.shape[1], 4, 4) + + transform_chain = [transforms_mat[:, 0]] + for i in range(1, parents.shape[0]): + # Subtract the joint location at the rest pose + # No need for rotation, since it's identity when at rest + curr_res = torch.matmul(transform_chain[parents[i]], + transforms_mat[:, i]) + transform_chain.append(curr_res) + + transforms = torch.stack(transform_chain, dim=1) + + # The last column of the transformations contains the posed joints + posed_joints = transforms[:, :, :3, 3] + + # The last column of the transformations contains the posed joints + posed_joints = transforms[:, :, :3, 3] + + joints_homogen = F.pad(joints, [0, 0, 0, 1]) + + rel_transforms = transforms - F.pad( + torch.matmul(transforms, joints_homogen), [3, 0, 0, 0, 0, 0, 0, 0]) + + return posed_joints, rel_transforms diff --git a/common/utils/smplx/smplx/utils.py b/common/utils/smplx/smplx/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..55dd1ed6d94582c744d0b99bc4150b1040cf58a0 --- /dev/null +++ b/common/utils/smplx/smplx/utils.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
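> A quick sanity check (not part of the source) for `batch_rodrigues` and `transform_mat` defined in `lbs.py` above: an axis-angle vector of magnitude π/2 about the z-axis should map the x unit vector onto the y unit vector, and `transform_mat` should pack the rotation and a translation into a 4x4 homogeneous matrix.

```python
import math
import torch
from smplx.lbs import batch_rodrigues, transform_mat  # assumed import path

# pi/2 rotation about z in axis-angle form, batch of one.
rot_vec = torch.tensor([[0.0, 0.0, math.pi / 2]])
R = batch_rodrigues(rot_vec)                    # 1x3x3 rotation matrix
x_axis = torch.tensor([[[1.0], [0.0], [0.0]]])  # 1x3x1 column vector
print(torch.bmm(R, x_axis).squeeze())           # ~ tensor([0., 1., 0.])

# Pack rotation + translation into a homogeneous transform.
t = torch.tensor([[[0.1], [0.2], [0.3]]])       # Bx3x1 translation
T = transform_mat(R, t)                         # Bx4x4, last row [0, 0, 0, 1]
print(T.shape)
```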
+# +# Contact: ps-license@tuebingen.mpg.de + +from typing import NewType, Union, Optional +from dataclasses import dataclass, asdict, fields +import numpy as np +import torch + +Tensor = NewType('Tensor', torch.Tensor) +Array = NewType('Array', np.ndarray) + + +@dataclass +class ModelOutput: + vertices: Optional[Tensor] = None + joints: Optional[Tensor] = None + full_pose: Optional[Tensor] = None + global_orient: Optional[Tensor] = None + transl: Optional[Tensor] = None + + def __getitem__(self, key): + return getattr(self, key) + + def get(self, key, default=None): + return getattr(self, key, default) + + def __iter__(self): + return self.keys() + + def keys(self): + keys = [t.name for t in fields(self)] + return iter(keys) + + def values(self): + values = [getattr(self, t.name) for t in fields(self)] + return iter(values) + + def items(self): + data = [(t.name, getattr(self, t.name)) for t in fields(self)] + return iter(data) + + +@dataclass +class SMPLOutput(ModelOutput): + betas: Optional[Tensor] = None + body_pose: Optional[Tensor] = None + + +@dataclass +class SMPLHOutput(SMPLOutput): + left_hand_pose: Optional[Tensor] = None + right_hand_pose: Optional[Tensor] = None + transl: Optional[Tensor] = None + + +@dataclass +class SMPLXOutput(SMPLHOutput): + expression: Optional[Tensor] = None + jaw_pose: Optional[Tensor] = None + + +@dataclass +class MANOOutput(ModelOutput): + betas: Optional[Tensor] = None + hand_pose: Optional[Tensor] = None + + +@dataclass +class FLAMEOutput(ModelOutput): + betas: Optional[Tensor] = None + expression: Optional[Tensor] = None + jaw_pose: Optional[Tensor] = None + neck_pose: Optional[Tensor] = None + + +def find_joint_kin_chain(joint_id, kinematic_tree): + kin_chain = [] + curr_idx = joint_id + while curr_idx != -1: + kin_chain.append(curr_idx) + curr_idx = kinematic_tree[curr_idx] + return kin_chain + + +def to_tensor( + array: Union[Array, Tensor], dtype=torch.float32 +) -> Tensor: + if torch.is_tensor(array): + return array + else: + return torch.tensor(array, dtype=dtype) + + +class Struct(object): + def __init__(self, **kwargs): + for key, val in kwargs.items(): + setattr(self, key, val) + + +def to_np(array, dtype=np.float32): + if 'scipy.sparse' in str(type(array)): + array = array.todense() + return np.array(array, dtype=dtype) + + +def rot_mat_to_euler(rot_mats): + # Calculates rotation matrix to euler angles + # Careful for extreme cases of eular angles like [0.0, pi, 0.0] + + sy = torch.sqrt(rot_mats[:, 0, 0] * rot_mats[:, 0, 0] + + rot_mats[:, 1, 0] * rot_mats[:, 1, 0]) + return torch.atan2(-rot_mats[:, 2, 0], sy) diff --git a/common/utils/smplx/smplx/vertex_ids.py b/common/utils/smplx/smplx/vertex_ids.py new file mode 100644 index 0000000000000000000000000000000000000000..0e7a4c36700f002da54a9e181eabbd47af2a95bc --- /dev/null +++ b/common/utils/smplx/smplx/vertex_ids.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
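> The `ModelOutput` dataclasses above deliberately mimic a read-only mapping, so downstream code can treat a forward-pass result either as an object or as a dict. A short illustration, assuming `SMPLOutput` is imported from the vendored `smplx.utils`:

```python
import torch
from smplx.utils import SMPLOutput  # assumed import path

out = SMPLOutput(joints=torch.zeros(1, 45, 3), betas=torch.zeros(1, 10))

print(out.joints.shape)        # attribute access
print(out['betas'].shape)      # dict-style access via __getitem__
print(out.get('vertices'))     # None: the field was never populated
print(list(out.keys()))        # every declared field name, populated or not
```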
+# +# Contact: ps-license@tuebingen.mpg.de + +from __future__ import print_function +from __future__ import absolute_import +from __future__ import division + +# Joint name to vertex mapping. SMPL/SMPL-H/SMPL-X vertices that correspond to +# MSCOCO and OpenPose joints +vertex_ids = { + 'smplh': { + 'nose': 332, + 'reye': 6260, + 'leye': 2800, + 'rear': 4071, + 'lear': 583, + 'rthumb': 6191, + 'rindex': 5782, + 'rmiddle': 5905, + 'rring': 6016, + 'rpinky': 6133, + 'lthumb': 2746, + 'lindex': 2319, + 'lmiddle': 2445, + 'lring': 2556, + 'lpinky': 2673, + 'LBigToe': 3216, + 'LSmallToe': 3226, + 'LHeel': 3387, + 'RBigToe': 6617, + 'RSmallToe': 6624, + 'RHeel': 6787 + }, + 'smplx': { + 'nose': 9120, + 'reye': 9929, + 'leye': 9448, + 'rear': 616, + 'lear': 6, + 'rthumb': 8079, + 'rindex': 7669, + 'rmiddle': 7794, + 'rring': 7905, + 'rpinky': 8022, + 'lthumb': 5361, + 'lindex': 4933, + 'lmiddle': 5058, + 'lring': 5169, + 'lpinky': 5286, + 'LBigToe': 5770, + 'LSmallToe': 5780, + 'LHeel': 8846, + 'RBigToe': 8463, + 'RSmallToe': 8474, + 'RHeel': 8635 + }, + 'mano': { + 'thumb': 744, + 'index': 320, + 'middle': 443, + 'ring': 554, + 'pinky': 671, + } +} diff --git a/common/utils/smplx/smplx/vertex_joint_selector.py b/common/utils/smplx/smplx/vertex_joint_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..4b8298bd5e087731f86c1c699703b5219e046c5c --- /dev/null +++ b/common/utils/smplx/smplx/vertex_joint_selector.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
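> The table above maps keypoint names to mesh vertex indices; the `VertexJointSelector` module below simply gathers those vertices and appends them to the regressed joints. A hand-rolled sketch of the same gather, using a dummy vertex tensor whose size (10475, the SMPL-X vertex count) is an assumption of this example:

```python
import torch
from smplx.vertex_ids import vertex_ids  # assumed import path

smplx_ids = vertex_ids['smplx']
feet_idxs = torch.tensor(
    [smplx_ids[k] for k in ('LBigToe', 'LSmallToe', 'LHeel',
                            'RBigToe', 'RSmallToe', 'RHeel')],
    dtype=torch.long)

vertices = torch.zeros(2, 10475, 3)                          # dummy BxVx3 mesh
feet_keypoints = torch.index_select(vertices, 1, feet_idxs)  # Bx6x3
print(feet_keypoints.shape)
```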
+# +# Contact: ps-license@tuebingen.mpg.de + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import numpy as np + +import torch +import torch.nn as nn + +from .utils import to_tensor + + +class VertexJointSelector(nn.Module): + + def __init__(self, vertex_ids=None, + use_hands=True, + use_feet_keypoints=True, **kwargs): + super(VertexJointSelector, self).__init__() + + extra_joints_idxs = [] + + face_keyp_idxs = np.array([ + vertex_ids['nose'], + vertex_ids['reye'], + vertex_ids['leye'], + vertex_ids['rear'], + vertex_ids['lear']], dtype=np.int64) + + extra_joints_idxs = np.concatenate([extra_joints_idxs, + face_keyp_idxs]) + + if use_feet_keypoints: + feet_keyp_idxs = np.array([vertex_ids['LBigToe'], + vertex_ids['LSmallToe'], + vertex_ids['LHeel'], + vertex_ids['RBigToe'], + vertex_ids['RSmallToe'], + vertex_ids['RHeel']], dtype=np.int32) + + extra_joints_idxs = np.concatenate( + [extra_joints_idxs, feet_keyp_idxs]) + + if use_hands: + self.tip_names = ['thumb', 'index', 'middle', 'ring', 'pinky'] + + tips_idxs = [] + for hand_id in ['l', 'r']: + for tip_name in self.tip_names: + tips_idxs.append(vertex_ids[hand_id + tip_name]) + + extra_joints_idxs = np.concatenate( + [extra_joints_idxs, tips_idxs]) + + self.register_buffer('extra_joints_idxs', + to_tensor(extra_joints_idxs, dtype=torch.long)) + + def forward(self, vertices, joints): + extra_joints = torch.index_select(vertices, 1, self.extra_joints_idxs) + joints = torch.cat([joints, extra_joints], dim=1) + + return joints diff --git a/common/utils/smplx/tools/README.md b/common/utils/smplx/tools/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1e69d971bc22f8d65f2e751c24c6125d363a3e76 --- /dev/null +++ b/common/utils/smplx/tools/README.md @@ -0,0 +1,20 @@ +## Removing Chumpy objects + +In a Python 2 virtual environment with [Chumpy](https://github.com/mattloper/chumpy) installed run the following to remove any Chumpy objects from the model data: + +```bash +python tools/clean_ch.py --input-models path-to-models/*.pkl --output-folder output-folder +``` + +## Merging SMPL-H and MANO parameters + +In order to use the given PyTorch SMPL-H module we first need to merge the SMPL-H and MANO parameters in a single file. After agreeing to the license and downloading the models, run the following command: + +```bash +python tools/merge_smplh_mano.py --smplh-fn SMPLH_FOLDER/SMPLH_GENDER.pkl \ + --mano-left-fn MANO_FOLDER/MANO_LEFT.pkl \ + --mano-right-fn MANO_FOLDER/MANO_RIGHT.pkl \ + --output-folder OUTPUT_FOLDER +``` + +where SMPLH_FOLDER is the folder with the SMPL-H files and MANO_FOLDER the one for the MANO files. diff --git a/common/utils/smplx/tools/__init__.py b/common/utils/smplx/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..098b529b7f169758710ab788be94fe5d83e51256 --- /dev/null +++ b/common/utils/smplx/tools/__init__.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). 
acting on behalf of its Max Planck Institute +# for Intelligent Systems and the Max Planck Institute for Biological +# Cybernetics. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +import clean_ch +import merge_smplh_mano diff --git a/common/utils/smplx/tools/clean_ch.py b/common/utils/smplx/tools/clean_ch.py new file mode 100644 index 0000000000000000000000000000000000000000..56874b374c5d25aeb4ace0aefb3570bd7b891c22 --- /dev/null +++ b/common/utils/smplx/tools/clean_ch.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems and the Max Planck Institute for Biological +# Cybernetics. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +from __future__ import print_function +from __future__ import absolute_import +from __future__ import division + +import argparse +import os +import os.path as osp + +import pickle + +from tqdm import tqdm +import numpy as np + + +def clean_fn(fn, output_folder='output'): + with open(fn, 'rb') as body_file: + body_data = pickle.load(body_file) + + output_dict = {} + for key, data in body_data.iteritems(): + if 'chumpy' in str(type(data)): + output_dict[key] = np.array(data) + else: + output_dict[key] = data + + out_fn = osp.split(fn)[1] + + out_path = osp.join(output_folder, out_fn) + with open(out_path, 'wb') as out_file: + pickle.dump(output_dict, out_file) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--input-models', dest='input_models', nargs='+', + required=True, type=str, + help='The path to the model that will be processed') + parser.add_argument('--output-folder', dest='output_folder', + required=True, type=str, + help='The path to the output folder') + + args = parser.parse_args() + + input_models = args.input_models + output_folder = args.output_folder + if not osp.exists(output_folder): + print('Creating directory: {}'.format(output_folder)) + os.makedirs(output_folder) + + for input_model in input_models: + clean_fn(input_model, output_folder=output_folder) diff --git a/common/utils/smplx/tools/merge_smplh_mano.py b/common/utils/smplx/tools/merge_smplh_mano.py new file mode 100644 index 0000000000000000000000000000000000000000..eab9d1ea60c224cf3785bd90dc542569ad81cd78 --- /dev/null +++ b/common/utils/smplx/tools/merge_smplh_mano.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). 
acting on behalf of its Max Planck Institute +# for Intelligent Systems and the Max Planck Institute for Biological +# Cybernetics. All rights reserved. +# +# Contact: ps-license@tuebingen.mpg.de + +from __future__ import print_function + +import os +import os.path as osp +import pickle + +import argparse + +import numpy as np + + +def merge_models(smplh_fn, mano_left_fn, mano_right_fn, + output_folder='output'): + + with open(smplh_fn, 'rb') as body_file: + body_data = pickle.load(body_file) + + with open(mano_left_fn, 'rb') as lhand_file: + lhand_data = pickle.load(lhand_file) + + with open(mano_right_fn, 'rb') as rhand_file: + rhand_data = pickle.load(rhand_file) + + out_fn = osp.split(smplh_fn)[1] + + output_data = body_data.copy() + output_data['hands_componentsl'] = lhand_data['hands_components'] + output_data['hands_componentsr'] = rhand_data['hands_components'] + + output_data['hands_coeffsl'] = lhand_data['hands_coeffs'] + output_data['hands_coeffsr'] = rhand_data['hands_coeffs'] + + output_data['hands_meanl'] = lhand_data['hands_mean'] + output_data['hands_meanr'] = rhand_data['hands_mean'] + + for key, data in output_data.iteritems(): + if 'chumpy' in str(type(data)): + output_data[key] = np.array(data) + else: + output_data[key] = data + + out_path = osp.join(output_folder, out_fn) + print(out_path) + print('Saving to {}'.format(out_path)) + with open(out_path, 'wb') as output_file: + pickle.dump(output_data, output_file) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--smplh-fn', dest='smplh_fn', required=True, + type=str, help='The path to the SMPLH model') + parser.add_argument('--mano-left-fn', dest='mano_left_fn', required=True, + type=str, help='The path to the left hand MANO model') + parser.add_argument('--mano-right-fn', dest='mano_right_fn', required=True, + type=str, help='The path to the right hand MANO model') + parser.add_argument('--output-folder', dest='output_folder', + required=True, type=str, + help='The path to the output folder') + + args = parser.parse_args() + + smplh_fn = args.smplh_fn + mano_left_fn = args.mano_left_fn + mano_right_fn = args.mano_right_fn + output_folder = args.output_folder + + if not osp.exists(output_folder): + print('Creating directory: {}'.format(output_folder)) + os.makedirs(output_folder) + + merge_models(smplh_fn, mano_left_fn, mano_right_fn, output_folder) diff --git a/common/utils/transforms.py b/common/utils/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..6b5cfcd0d57ad829c4ee4a6044744c3992663600 --- /dev/null +++ b/common/utils/transforms.py @@ -0,0 +1,172 @@ +import torch +import numpy as np +import scipy +from config import cfg +from torch.nn import functional as F +import torchgeometry as tgm + + +def cam2pixel(cam_coord, f, c): + x = cam_coord[:, 0] / cam_coord[:, 2] * f[0] + c[0] + y = cam_coord[:, 1] / cam_coord[:, 2] * f[1] + c[1] + z = cam_coord[:, 2] + return np.stack((x, y, z), 1) + + +def pixel2cam(pixel_coord, f, c): + x = (pixel_coord[:, 0] - c[0]) / f[0] * pixel_coord[:, 2] + y = (pixel_coord[:, 1] - c[1]) / f[1] * pixel_coord[:, 2] + z = pixel_coord[:, 2] + return np.stack((x, y, z), 1) + + +def world2cam(world_coord, R, t): + cam_coord = np.dot(R, world_coord.transpose(1, 0)).transpose(1, 0) + t.reshape(1, 3) + return cam_coord + + +def cam2world(cam_coord, R, t): + world_coord = np.dot(np.linalg.inv(R), (cam_coord - t.reshape(1, 3)).transpose(1, 0)).transpose(1, 0) + return world_coord + + +def rigid_transform_3D(A, B): + n, dim = 
A.shape + centroid_A = np.mean(A, axis=0) + centroid_B = np.mean(B, axis=0) + H = np.dot(np.transpose(A - centroid_A), B - centroid_B) / n + U, s, V = np.linalg.svd(H) + R = np.dot(np.transpose(V), np.transpose(U)) + if np.linalg.det(R) < 0: + s[-1] = -s[-1] + V[2] = -V[2] + R = np.dot(np.transpose(V), np.transpose(U)) + + varP = np.var(A, axis=0).sum() + c = 1 / varP * np.sum(s) + + t = -np.dot(c * R, np.transpose(centroid_A)) + np.transpose(centroid_B) + return c, R, t + + +def rigid_align(A, B): + c, R, t = rigid_transform_3D(A, B) + A2 = np.transpose(np.dot(c * R, np.transpose(A))) + t + return A2 + + +def transform_joint_to_other_db(src_joint, src_name, dst_name): + src_joint_num = len(src_name) + dst_joint_num = len(dst_name) + + new_joint = np.zeros(((dst_joint_num,) + src_joint.shape[1:]), dtype=np.float32) + for src_idx in range(len(src_name)): + name = src_name[src_idx] + if name in dst_name: + dst_idx = dst_name.index(name) + new_joint[dst_idx] = src_joint[src_idx] + + return new_joint + + +def rot6d_to_axis_angle(x): + batch_size = x.shape[0] + + x = x.view(-1, 3, 2) + a1 = x[:, :, 0] + a2 = x[:, :, 1] + b1 = F.normalize(a1) + b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1) + b3 = torch.cross(b1, b2) + rot_mat = torch.stack((b1, b2, b3), dim=-1) # 3x3 rotation matrix + + rot_mat = torch.cat([rot_mat, torch.zeros((batch_size, 3, 1)).to(cfg.device).float()], 2) # 3x4 rotation matrix + axis_angle = tgm.rotation_matrix_to_angle_axis(rot_mat).reshape(-1, 3) # axis-angle + axis_angle[torch.isnan(axis_angle)] = 0.0 + return axis_angle + + +def sample_joint_features(img_feat, joint_xy): + height, width = img_feat.shape[2:] + x = joint_xy[:, :, 0] / (width - 1) * 2 - 1 + y = joint_xy[:, :, 1] / (height - 1) * 2 - 1 + grid = torch.stack((x, y), 2)[:, :, None, :] + img_feat = F.grid_sample(img_feat, grid, align_corners=True)[:, :, :, 0] # batch_size, channel_dim, joint_num + img_feat = img_feat.permute(0, 2, 1).contiguous() # batch_size, joint_num, channel_dim + return img_feat + + +def soft_argmax_2d(heatmap2d): + batch_size = heatmap2d.shape[0] + height, width = heatmap2d.shape[2:] + heatmap2d = heatmap2d.reshape((batch_size, -1, height * width)) + heatmap2d = F.softmax(heatmap2d, 2) + heatmap2d = heatmap2d.reshape((batch_size, -1, height, width)) + + accu_x = heatmap2d.sum(dim=(2)) + accu_y = heatmap2d.sum(dim=(3)) + + accu_x = accu_x * torch.arange(width).float().to(cfg.device)[None, None, :] + accu_y = accu_y * torch.arange(height).float().to(cfg.device)[None, None, :] + + accu_x = accu_x.sum(dim=2, keepdim=True) + accu_y = accu_y.sum(dim=2, keepdim=True) + + coord_out = torch.cat((accu_x, accu_y), dim=2) + return coord_out + + +def soft_argmax_3d(heatmap3d): + batch_size = heatmap3d.shape[0] + depth, height, width = heatmap3d.shape[2:] + heatmap3d = heatmap3d.reshape((batch_size, -1, depth * height * width)) + heatmap3d = F.softmax(heatmap3d, 2) + heatmap3d = heatmap3d.reshape((batch_size, -1, depth, height, width)) + + accu_x = heatmap3d.sum(dim=(2, 3)) + accu_y = heatmap3d.sum(dim=(2, 4)) + accu_z = heatmap3d.sum(dim=(3, 4)) + + accu_x = accu_x * torch.arange(width).float().to(cfg.device)[None, None, :] + accu_y = accu_y * torch.arange(height).float().to(cfg.device)[None, None, :] + accu_z = accu_z * torch.arange(depth).float().to(cfg.device)[None, None, :] + + accu_x = accu_x.sum(dim=2, keepdim=True) + accu_y = accu_y.sum(dim=2, keepdim=True) + accu_z = accu_z.sum(dim=2, keepdim=True) + + coord_out = torch.cat((accu_x, accu_y, accu_z), dim=2) + return 
coord_out + + +def restore_bbox(bbox_center, bbox_size, aspect_ratio, extension_ratio): + bbox = bbox_center.view(-1, 1, 2) + torch.cat((-bbox_size.view(-1, 1, 2) / 2., bbox_size.view(-1, 1, 2) / 2.), + 1) # xyxy in (cfg.output_hm_shape[2], cfg.output_hm_shape[1]) space + bbox[:, :, 0] = bbox[:, :, 0] / cfg.output_hm_shape[2] * cfg.input_body_shape[1] + bbox[:, :, 1] = bbox[:, :, 1] / cfg.output_hm_shape[1] * cfg.input_body_shape[0] + bbox = bbox.view(-1, 4) + + # xyxy -> xywh + bbox[:, 2] = bbox[:, 2] - bbox[:, 0] + bbox[:, 3] = bbox[:, 3] - bbox[:, 1] + + # aspect ratio preserving bbox + w = bbox[:, 2] + h = bbox[:, 3] + c_x = bbox[:, 0] + w / 2. + c_y = bbox[:, 1] + h / 2. + + mask1 = w > (aspect_ratio * h) + mask2 = w < (aspect_ratio * h) + h[mask1] = w[mask1] / aspect_ratio + w[mask2] = h[mask2] * aspect_ratio + + bbox[:, 2] = w * extension_ratio + bbox[:, 3] = h * extension_ratio + bbox[:, 0] = c_x - bbox[:, 2] / 2. + bbox[:, 1] = c_y - bbox[:, 3] / 2. + + # xywh -> xyxy + bbox[:, 2] = bbox[:, 2] + bbox[:, 0] + bbox[:, 3] = bbox[:, 3] + bbox[:, 1] + return bbox diff --git a/common/utils/vis.py b/common/utils/vis.py new file mode 100644 index 0000000000000000000000000000000000000000..f5b7dd3b6775e16bff638c8383ed04ab916978c1 --- /dev/null +++ b/common/utils/vis.py @@ -0,0 +1,183 @@ +import os +import cv2 +import numpy as np +from mpl_toolkits.mplot3d import Axes3D +import matplotlib.pyplot as plt +import matplotlib as mpl +import os +os.environ["PYOPENGL_PLATFORM"] = "egl" +import pyrender +import trimesh +from config import cfg + +def vis_keypoints_with_skeleton(img, kps, kps_lines, kp_thresh=0.4, alpha=1): + # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. + cmap = plt.get_cmap('rainbow') + colors = [cmap(i) for i in np.linspace(0, 1, len(kps_lines) + 2)] + colors = [(c[2] * 255, c[1] * 255, c[0] * 255) for c in colors] + + # Perform the drawing on a copy of the image, to allow for blending. + kp_mask = np.copy(img) + + # Draw the keypoints. + for l in range(len(kps_lines)): + i1 = kps_lines[l][0] + i2 = kps_lines[l][1] + p1 = kps[0, i1].astype(np.int32), kps[1, i1].astype(np.int32) + p2 = kps[0, i2].astype(np.int32), kps[1, i2].astype(np.int32) + if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh: + cv2.line( + kp_mask, p1, p2, + color=colors[l], thickness=2, lineType=cv2.LINE_AA) + if kps[2, i1] > kp_thresh: + cv2.circle( + kp_mask, p1, + radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) + if kps[2, i2] > kp_thresh: + cv2.circle( + kp_mask, p2, + radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) + + # Blend the keypoints. + return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0) + +def vis_keypoints(img, kps, alpha=1, radius=3, color=None): + # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. + cmap = plt.get_cmap('rainbow') + if color is None: + colors = [cmap(i) for i in np.linspace(0, 1, len(kps) + 2)] + colors = [(c[2] * 255, c[1] * 255, c[0] * 255) for c in colors] + + # Perform the drawing on a copy of the image, to allow for blending. + kp_mask = np.copy(img) + + # Draw the keypoints. + for i in range(len(kps)): + p = kps[i][0].astype(np.int32), kps[i][1].astype(np.int32) + if color is None: + cv2.circle(kp_mask, p, radius=radius, color=colors[i], thickness=-1, lineType=cv2.LINE_AA) + else: + cv2.circle(kp_mask, p, radius=radius, color=color, thickness=-1, lineType=cv2.LINE_AA) + + # Blend the keypoints. 
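+    # cv2.addWeighted computes img * (1 - alpha) + kp_mask * alpha, i.e. it
+    # alpha-blends the annotated copy back onto the original frame; with the
+    # default alpha=1 the fully annotated copy is returned unchanged.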
+ return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0) + +def vis_mesh(img, mesh_vertex, alpha=0.5): + # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. + cmap = plt.get_cmap('rainbow') + colors = [cmap(i) for i in np.linspace(0, 1, len(mesh_vertex))] + colors = [(c[2] * 255, c[1] * 255, c[0] * 255) for c in colors] + + # Perform the drawing on a copy of the image, to allow for blending. + mask = np.copy(img) + + # Draw the mesh + for i in range(len(mesh_vertex)): + p = mesh_vertex[i][0].astype(np.int32), mesh_vertex[i][1].astype(np.int32) + cv2.circle(mask, p, radius=1, color=colors[i], thickness=-1, lineType=cv2.LINE_AA) + + # Blend the keypoints. + return cv2.addWeighted(img, 1.0 - alpha, mask, alpha, 0) + +def vis_3d_skeleton(kpt_3d, kpt_3d_vis, kps_lines, filename=None): + + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + + # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. + cmap = plt.get_cmap('rainbow') + colors = [cmap(i) for i in np.linspace(0, 1, len(kps_lines) + 2)] + colors = [np.array((c[2], c[1], c[0])) for c in colors] + + for l in range(len(kps_lines)): + i1 = kps_lines[l][0] + i2 = kps_lines[l][1] + x = np.array([kpt_3d[i1,0], kpt_3d[i2,0]]) + y = np.array([kpt_3d[i1,1], kpt_3d[i2,1]]) + z = np.array([kpt_3d[i1,2], kpt_3d[i2,2]]) + + if kpt_3d_vis[i1,0] > 0 and kpt_3d_vis[i2,0] > 0: + ax.plot(x, z, -y, c=colors[l], linewidth=2) + if kpt_3d_vis[i1,0] > 0: + ax.scatter(kpt_3d[i1,0], kpt_3d[i1,2], -kpt_3d[i1,1], c=colors[l], marker='o') + if kpt_3d_vis[i2,0] > 0: + ax.scatter(kpt_3d[i2,0], kpt_3d[i2,2], -kpt_3d[i2,1], c=colors[l], marker='o') + + x_r = np.array([0, cfg.input_shape[1]], dtype=np.float32) + y_r = np.array([0, cfg.input_shape[0]], dtype=np.float32) + z_r = np.array([0, 1], dtype=np.float32) + + if filename is None: + ax.set_title('3D vis') + else: + ax.set_title(filename) + + ax.set_xlabel('X Label') + ax.set_ylabel('Z Label') + ax.set_zlabel('Y Label') + ax.legend() + + plt.show() + cv2.waitKey(0) + +def save_obj(v, f, file_name='output.obj'): + obj_file = open(file_name, 'w') + for i in range(len(v)): + obj_file.write('v ' + str(v[i][0]) + ' ' + str(v[i][1]) + ' ' + str(v[i][2]) + '\n') + for i in range(len(f)): + obj_file.write('f ' + str(f[i][0]+1) + '/' + str(f[i][0]+1) + ' ' + str(f[i][1]+1) + '/' + str(f[i][1]+1) + ' ' + str(f[i][2]+1) + '/' + str(f[i][2]+1) + '\n') + obj_file.close() + + +def perspective_projection(vertices, cam_param): + # vertices: [N, 3] + # cam_param: [3] + fx, fy= cam_param['focal'] + cx, cy = cam_param['princpt'] + vertices[:, 0] = vertices[:, 0] * fx / vertices[:, 2] + cx + vertices[:, 1] = vertices[:, 1] * fy / vertices[:, 2] + cy + return vertices + + +def render_mesh(img, mesh, face, cam_param, mesh_as_vertices=False): + if mesh_as_vertices: + # to run on cluster where headless pyrender is not supported for A100/V100 + vertices_2d = perspective_projection(mesh, cam_param) + img = vis_keypoints(img, vertices_2d, alpha=0.8, radius=2, color=(0, 0, 255)) + else: + # mesh + mesh = trimesh.Trimesh(mesh, face) + rot = trimesh.transformations.rotation_matrix( + np.radians(180), [1, 0, 0]) + mesh.apply_transform(rot) + material = pyrender.MetallicRoughnessMaterial(metallicFactor=0.0, alphaMode='OPAQUE', baseColorFactor=(1.0, 1.0, 0.9, 1.0)) + mesh = pyrender.Mesh.from_trimesh(mesh, material=material, smooth=False) + scene = pyrender.Scene(ambient_light=(0.3, 0.3, 0.3)) + scene.add(mesh, 'mesh') + + focal, princpt = cam_param['focal'], cam_param['princpt'] + camera = 
pyrender.IntrinsicsCamera(fx=focal[0], fy=focal[1], cx=princpt[0], cy=princpt[1]) + scene.add(camera) + + # renderer + renderer = pyrender.OffscreenRenderer(viewport_width=img.shape[1], viewport_height=img.shape[0], point_size=1.0) + + # light + light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=0.8) + light_pose = np.eye(4) + light_pose[:3, 3] = np.array([0, -1, 1]) + scene.add(light, pose=light_pose) + light_pose[:3, 3] = np.array([0, 1, 1]) + scene.add(light, pose=light_pose) + light_pose[:3, 3] = np.array([1, 1, 2]) + scene.add(light, pose=light_pose) + + # render + rgb, depth = renderer.render(scene, flags=pyrender.RenderFlags.RGBA) + rgb = rgb[:,:,:3].astype(np.float32) + valid_mask = (depth > 0)[:,:,None] + + # save to image + img = rgb * valid_mask + img * (1-valid_mask) + + return img diff --git a/main/SMPLer_X.py b/main/SMPLer_X.py new file mode 100644 index 0000000000000000000000000000000000000000..1ca9477babbb0eee26f296b47359f7b0911d0a31 --- /dev/null +++ b/main/SMPLer_X.py @@ -0,0 +1,468 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +from nets.smpler_x import PositionNet, HandRotationNet, FaceRegressor, BoxNet, HandRoI, BodyRotationNet +from nets.loss import CoordLoss, ParamLoss, CELoss +from utils.human_models import smpl_x +from utils.transforms import rot6d_to_axis_angle, restore_bbox +from config import cfg +import math +import copy +from mmpose.models import build_posenet +from mmcv import Config + +class Model(nn.Module): + def __init__(self, encoder, body_position_net, body_rotation_net, box_net, hand_position_net, hand_roi_net, + hand_rotation_net, face_regressor): + super(Model, self).__init__() + + # body + self.encoder = encoder + self.body_position_net = body_position_net + self.body_regressor = body_rotation_net + self.box_net = box_net + + # hand + self.hand_roi_net = hand_roi_net + self.hand_position_net = hand_position_net + self.hand_regressor = hand_rotation_net + + # face + self.face_regressor = face_regressor + + self.smplx_layer = copy.deepcopy(smpl_x.layer['neutral']).to(cfg.device) + self.coord_loss = CoordLoss() + self.param_loss = ParamLoss() + self.ce_loss = CELoss() + + self.body_num_joints = len(smpl_x.pos_joint_part['body']) + self.hand_joint_num = len(smpl_x.pos_joint_part['rhand']) + + self.neck = [self.box_net, self.hand_roi_net] + + self.head = [self.body_position_net, self.body_regressor, + self.hand_position_net, self.hand_regressor, + self.face_regressor] + + self.trainable_modules = [self.encoder, self.body_position_net, self.body_regressor, + self.box_net, self.hand_position_net, + self.hand_roi_net, self.hand_regressor, self.face_regressor] + self.special_trainable_modules = [] + + # backbone: + param_bb = sum(p.numel() for p in self.encoder.parameters() if p.requires_grad) + # neck + param_neck = 0 + for module in self.neck: + param_neck += sum(p.numel() for p in module.parameters() if p.requires_grad) + # head + param_head = 0 + for module in self.head: + param_head += sum(p.numel() for p in module.parameters() if p.requires_grad) + + param_net = param_bb + param_neck + param_head + + # print('#parameters:') + # print(f'{param_bb}, {param_neck}, {param_head}, {param_net}') + + def get_camera_trans(self, cam_param): + # camera translation + t_xy = cam_param[:, :2] + gamma = torch.sigmoid(cam_param[:, 2]) # apply sigmoid to make it positive + k_value = torch.FloatTensor([math.sqrt(cfg.focal[0] * cfg.focal[1] * cfg.camera_3d_size * cfg.camera_3d_size / ( + cfg.input_body_shape[0] * 
cfg.input_body_shape[1]))]).to(cfg.device).view(-1) + t_z = k_value * gamma + cam_trans = torch.cat((t_xy, t_z[:, None]), 1) + return cam_trans + + def get_coord(self, root_pose, body_pose, lhand_pose, rhand_pose, jaw_pose, shape, expr, cam_trans, mode): + batch_size = root_pose.shape[0] + zero_pose = torch.zeros((1, 3)).float().to(cfg.device).repeat(batch_size, 1) # eye poses + output = self.smplx_layer(betas=shape, body_pose=body_pose, global_orient=root_pose, right_hand_pose=rhand_pose, + left_hand_pose=lhand_pose, jaw_pose=jaw_pose, leye_pose=zero_pose, + reye_pose=zero_pose, expression=expr) + # camera-centered 3D coordinate + mesh_cam = output.vertices + if mode == 'test' and cfg.testset == 'AGORA': # use 144 joints for AGORA evaluation + joint_cam = output.joints + else: + joint_cam = output.joints[:, smpl_x.joint_idx, :] + + # project 3D coordinates to 2D space + if mode == 'train' and len(cfg.trainset_3d) == 1 and cfg.trainset_3d[0] == 'AGORA' and len( + cfg.trainset_2d) == 0: # prevent gradients from backpropagating to SMPLX paraemter regression module + x = (joint_cam[:, :, 0].detach() + cam_trans[:, None, 0]) / ( + joint_cam[:, :, 2].detach() + cam_trans[:, None, 2] + 1e-4) * cfg.focal[0] + cfg.princpt[0] + y = (joint_cam[:, :, 1].detach() + cam_trans[:, None, 1]) / ( + joint_cam[:, :, 2].detach() + cam_trans[:, None, 2] + 1e-4) * cfg.focal[1] + cfg.princpt[1] + else: + x = (joint_cam[:, :, 0] + cam_trans[:, None, 0]) / (joint_cam[:, :, 2] + cam_trans[:, None, 2] + 1e-4) * \ + cfg.focal[0] + cfg.princpt[0] + y = (joint_cam[:, :, 1] + cam_trans[:, None, 1]) / (joint_cam[:, :, 2] + cam_trans[:, None, 2] + 1e-4) * \ + cfg.focal[1] + cfg.princpt[1] + x = x / cfg.input_body_shape[1] * cfg.output_hm_shape[2] + y = y / cfg.input_body_shape[0] * cfg.output_hm_shape[1] + joint_proj = torch.stack((x, y), 2) + + # root-relative 3D coordinates + root_cam = joint_cam[:, smpl_x.root_joint_idx, None, :] + joint_cam = joint_cam - root_cam + mesh_cam = mesh_cam + cam_trans[:, None, :] # for rendering + joint_cam_wo_ra = joint_cam.clone() + + # left hand root (left wrist)-relative 3D coordinatese + lhand_idx = smpl_x.joint_part['lhand'] + lhand_cam = joint_cam[:, lhand_idx, :] + lwrist_cam = joint_cam[:, smpl_x.lwrist_idx, None, :] + lhand_cam = lhand_cam - lwrist_cam + joint_cam = torch.cat((joint_cam[:, :lhand_idx[0], :], lhand_cam, joint_cam[:, lhand_idx[-1] + 1:, :]), 1) + + # right hand root (right wrist)-relative 3D coordinatese + rhand_idx = smpl_x.joint_part['rhand'] + rhand_cam = joint_cam[:, rhand_idx, :] + rwrist_cam = joint_cam[:, smpl_x.rwrist_idx, None, :] + rhand_cam = rhand_cam - rwrist_cam + joint_cam = torch.cat((joint_cam[:, :rhand_idx[0], :], rhand_cam, joint_cam[:, rhand_idx[-1] + 1:, :]), 1) + + # face root (neck)-relative 3D coordinates + face_idx = smpl_x.joint_part['face'] + face_cam = joint_cam[:, face_idx, :] + neck_cam = joint_cam[:, smpl_x.neck_idx, None, :] + face_cam = face_cam - neck_cam + joint_cam = torch.cat((joint_cam[:, :face_idx[0], :], face_cam, joint_cam[:, face_idx[-1] + 1:, :]), 1) + + return joint_proj, joint_cam, joint_cam_wo_ra, mesh_cam + + def generate_mesh_gt(self, targets, mode): + if 'smplx_mesh_cam' in targets: + return targets['smplx_mesh_cam'] + nums = [3, 63, 45, 45, 3] + accu = [] + temp = 0 + for num in nums: + temp += num + accu.append(temp) + pose = targets['smplx_pose'] + root_pose, body_pose, lhand_pose, rhand_pose, jaw_pose = \ + pose[:, :accu[0]], pose[:, accu[0]:accu[1]], pose[:, accu[1]:accu[2]], pose[:, accu[2]:accu[3]], pose[:, + 
accu[3]: + accu[4]] + # print(lhand_pose) + shape = targets['smplx_shape'] + expr = targets['smplx_expr'] + cam_trans = targets['smplx_cam_trans'] + + # final output + joint_proj, joint_cam, joint_cam_wo_ra, mesh_cam = self.get_coord(root_pose, body_pose, lhand_pose, rhand_pose, jaw_pose, shape, + expr, cam_trans, mode) + + return mesh_cam + + def bbox_split(self, bbox): + # bbox:[bs, 3, 3] + lhand_bbox_center, rhand_bbox_center, face_bbox_center = \ + bbox[:, 0, :2], bbox[:, 1, :2], bbox[:, 2, :2] + return lhand_bbox_center, rhand_bbox_center, face_bbox_center + + def forward(self, inputs, targets, meta_info, mode): + + body_img = F.interpolate(inputs['img'], cfg.input_body_shape) + + # 1. Encoder + img_feat, task_tokens = self.encoder(body_img) # task_token:[bs, N, c] + shape_token, cam_token, expr_token, jaw_pose_token, hand_token, body_pose_token = \ + task_tokens[:, 0], task_tokens[:, 1], task_tokens[:, 2], task_tokens[:, 3], task_tokens[:, 4:6], task_tokens[:, 6:] + + # 2. Body Regressor + body_joint_hm, body_joint_img = self.body_position_net(img_feat) + root_pose, body_pose, shape, cam_param, = self.body_regressor(body_pose_token, shape_token, cam_token, body_joint_img.detach()) + root_pose = rot6d_to_axis_angle(root_pose) + body_pose = rot6d_to_axis_angle(body_pose.reshape(-1, 6)).reshape(body_pose.shape[0], -1) # (N, J_R*3) + cam_trans = self.get_camera_trans(cam_param) + + # 3. Hand and Face BBox Estimation + lhand_bbox_center, lhand_bbox_size, rhand_bbox_center, rhand_bbox_size, face_bbox_center, face_bbox_size = self.box_net(img_feat, body_joint_hm.detach()) + lhand_bbox = restore_bbox(lhand_bbox_center, lhand_bbox_size, cfg.input_hand_shape[1] / cfg.input_hand_shape[0], 2.0).detach() # xyxy in (cfg.input_body_shape[1], cfg.input_body_shape[0]) space + rhand_bbox = restore_bbox(rhand_bbox_center, rhand_bbox_size, cfg.input_hand_shape[1] / cfg.input_hand_shape[0], 2.0).detach() # xyxy in (cfg.input_body_shape[1], cfg.input_body_shape[0]) space + face_bbox = restore_bbox(face_bbox_center, face_bbox_size, cfg.input_face_shape[1] / cfg.input_face_shape[0], 1.5).detach() # xyxy in (cfg.input_body_shape[1], cfg.input_body_shape[0]) space + + # 4. Differentiable Feature-level Hand Crop-Upsample + # hand_feat: list, [bsx2, c, cfg.output_hm_shape[1]*scale, cfg.output_hm_shape[2]*scale] + hand_feat = self.hand_roi_net(img_feat, lhand_bbox, rhand_bbox) # hand_feat: flipped left hand + right hand + + # 5. 
Hand/Face Regressor + # hand regressor + _, hand_joint_img = self.hand_position_net(hand_feat) # (2N, J_P, 3) + hand_pose = self.hand_regressor(hand_feat, hand_joint_img.detach()) + hand_pose = rot6d_to_axis_angle(hand_pose.reshape(-1, 6)).reshape(hand_feat.shape[0], -1) # (2N, J_R*3) + # restore flipped left hand joint coordinates + batch_size = hand_joint_img.shape[0] // 2 + lhand_joint_img = hand_joint_img[:batch_size, :, :] + lhand_joint_img = torch.cat((cfg.output_hand_hm_shape[2] - 1 - lhand_joint_img[:, :, 0:1], lhand_joint_img[:, :, 1:]), 2) + rhand_joint_img = hand_joint_img[batch_size:, :, :] + # restore flipped left hand joint rotations + batch_size = hand_pose.shape[0] // 2 + lhand_pose = hand_pose[:batch_size, :].reshape(-1, len(smpl_x.orig_joint_part['lhand']), 3) + lhand_pose = torch.cat((lhand_pose[:, :, 0:1], -lhand_pose[:, :, 1:3]), 2).view(batch_size, -1) + rhand_pose = hand_pose[batch_size:, :] + + # hand regressor + expr, jaw_pose = self.face_regressor(expr_token, jaw_pose_token) + jaw_pose = rot6d_to_axis_angle(jaw_pose) + + # final output + joint_proj, joint_cam, joint_cam_wo_ra, mesh_cam = self.get_coord(root_pose, body_pose, lhand_pose, rhand_pose, jaw_pose, shape, expr, cam_trans, mode) + pose = torch.cat((root_pose, body_pose, lhand_pose, rhand_pose, jaw_pose), 1) + joint_img = torch.cat((body_joint_img, lhand_joint_img, rhand_joint_img), 1) + + if mode == 'test' and 'smplx_pose' in targets: + mesh_pseudo_gt = self.generate_mesh_gt(targets, mode) + + if mode == 'train': + # loss functions + loss = {} + + smplx_kps_3d_weight = getattr(cfg, 'smplx_kps_3d_weight', 1.0) + smplx_kps_3d_weight = getattr(cfg, 'smplx_kps_weight', smplx_kps_3d_weight) # old config + + smplx_kps_2d_weight = getattr(cfg, 'smplx_kps_2d_weight', 1.0) + net_kps_2d_weight = getattr(cfg, 'net_kps_2d_weight', 1.0) + + smplx_pose_weight = getattr(cfg, 'smplx_pose_weight', 1.0) + smplx_shape_weight = getattr(cfg, 'smplx_loss_weight', 1.0) + # smplx_orient_weight = getattr(cfg, 'smplx_orient_weight', smplx_pose_weight) # if not specified, use the same weight as pose + + + # do not supervise root pose if original agora json is used + if getattr(cfg, 'agora_fix_global_orient_transl', False): + # loss['smplx_pose'] = self.param_loss(pose, targets['smplx_pose'], meta_info['smplx_pose_valid'])[:, 3:] * smplx_pose_weight + if hasattr(cfg, 'smplx_orient_weight'): + smplx_orient_weight = getattr(cfg, 'smplx_orient_weight') + loss['smplx_orient'] = self.param_loss(pose, targets['smplx_pose'], meta_info['smplx_pose_valid'])[:, :3] * smplx_orient_weight + + loss['smplx_pose'] = self.param_loss(pose, targets['smplx_pose'], meta_info['smplx_pose_valid']) * smplx_pose_weight + + else: + loss['smplx_pose'] = self.param_loss(pose, targets['smplx_pose'], meta_info['smplx_pose_valid'])[:, 3:] * smplx_pose_weight + + loss['smplx_shape'] = self.param_loss(shape, targets['smplx_shape'], + meta_info['smplx_shape_valid'][:, None]) * smplx_shape_weight + loss['smplx_expr'] = self.param_loss(expr, targets['smplx_expr'], meta_info['smplx_expr_valid'][:, None]) + + # supervision for keypoints3d wo/ ra + loss['joint_cam'] = self.coord_loss(joint_cam_wo_ra, targets['joint_cam'], meta_info['joint_valid'] * meta_info['is_3D'][:, None, None]) * smplx_kps_3d_weight + # supervision for keypoints3d w/ ra + loss['smplx_joint_cam'] = self.coord_loss(joint_cam, targets['smplx_joint_cam'], meta_info['smplx_joint_valid']) * smplx_kps_3d_weight + + if not (meta_info['lhand_bbox_valid'] == 0).all(): + loss['lhand_bbox'] = 
(self.coord_loss(lhand_bbox_center, targets['lhand_bbox_center'], meta_info['lhand_bbox_valid'][:, None]) + + self.coord_loss(lhand_bbox_size, targets['lhand_bbox_size'], meta_info['lhand_bbox_valid'][:, None])) + if not (meta_info['rhand_bbox_valid'] == 0).all(): + loss['rhand_bbox'] = (self.coord_loss(rhand_bbox_center, targets['rhand_bbox_center'], meta_info['rhand_bbox_valid'][:, None]) + + self.coord_loss(rhand_bbox_size, targets['rhand_bbox_size'], meta_info['rhand_bbox_valid'][:, None])) + if not (meta_info['face_bbox_valid'] == 0).all(): + loss['face_bbox'] = (self.coord_loss(face_bbox_center, targets['face_bbox_center'], meta_info['face_bbox_valid'][:, None]) + + self.coord_loss(face_bbox_size, targets['face_bbox_size'], meta_info['face_bbox_valid'][:, None])) + + # if (meta_info['face_bbox_valid'] == 0).all(): + # out = {} + targets['original_joint_img'] = targets['joint_img'].clone() + targets['original_smplx_joint_img'] = targets['smplx_joint_img'].clone() + # out['original_joint_proj'] = joint_proj.clone() + if not (meta_info['lhand_bbox_valid'] + meta_info['rhand_bbox_valid'] == 0).all(): + + # change hand target joint_img and joint_trunc according to hand bbox (cfg.output_hm_shape -> downsampled hand bbox space) + for part_name, bbox in (('lhand', lhand_bbox), ('rhand', rhand_bbox)): + for coord_name, trunc_name in (('joint_img', 'joint_trunc'), ('smplx_joint_img', 'smplx_joint_trunc')): + x = targets[coord_name][:, smpl_x.joint_part[part_name], 0] + y = targets[coord_name][:, smpl_x.joint_part[part_name], 1] + z = targets[coord_name][:, smpl_x.joint_part[part_name], 2] + trunc = meta_info[trunc_name][:, smpl_x.joint_part[part_name], 0] + + x -= (bbox[:, None, 0] / cfg.input_body_shape[1] * cfg.output_hm_shape[2]) + x *= (cfg.output_hand_hm_shape[2] / ( + (bbox[:, None, 2] - bbox[:, None, 0]) / cfg.input_body_shape[1] * cfg.output_hm_shape[ + 2])) + y -= (bbox[:, None, 1] / cfg.input_body_shape[0] * cfg.output_hm_shape[1]) + y *= (cfg.output_hand_hm_shape[1] / ( + (bbox[:, None, 3] - bbox[:, None, 1]) / cfg.input_body_shape[0] * cfg.output_hm_shape[ + 1])) + z *= cfg.output_hand_hm_shape[0] / cfg.output_hm_shape[0] + trunc *= ((x >= 0) * (x < cfg.output_hand_hm_shape[2]) * (y >= 0) * ( + y < cfg.output_hand_hm_shape[1])) + + coord = torch.stack((x, y, z), 2) + trunc = trunc[:, :, None] + targets[coord_name] = torch.cat((targets[coord_name][:, :smpl_x.joint_part[part_name][0], :], coord, + targets[coord_name][:, smpl_x.joint_part[part_name][-1] + 1:, :]), + 1) + meta_info[trunc_name] = torch.cat((meta_info[trunc_name][:, :smpl_x.joint_part[part_name][0], :], + trunc, + meta_info[trunc_name][:, smpl_x.joint_part[part_name][-1] + 1:, + :]), 1) + + # change hand projected joint coordinates according to hand bbox (cfg.output_hm_shape -> hand bbox space) + for part_name, bbox in (('lhand', lhand_bbox), ('rhand', rhand_bbox)): + x = joint_proj[:, smpl_x.joint_part[part_name], 0] + y = joint_proj[:, smpl_x.joint_part[part_name], 1] + + x -= (bbox[:, None, 0] / cfg.input_body_shape[1] * cfg.output_hm_shape[2]) + x *= (cfg.output_hand_hm_shape[2] / ( + (bbox[:, None, 2] - bbox[:, None, 0]) / cfg.input_body_shape[1] * cfg.output_hm_shape[2])) + y -= (bbox[:, None, 1] / cfg.input_body_shape[0] * cfg.output_hm_shape[1]) + y *= (cfg.output_hand_hm_shape[1] / ( + (bbox[:, None, 3] - bbox[:, None, 1]) / cfg.input_body_shape[0] * cfg.output_hm_shape[1])) + + coord = torch.stack((x, y), 2) + trans = [] + for bid in range(coord.shape[0]): + mask = meta_info['joint_trunc'][bid, 
smpl_x.joint_part[part_name], 0] == 1 + if torch.sum(mask) == 0: + trans.append(torch.zeros((2)).float().to(cfg.device)) + else: + trans.append((-coord[bid, mask, :2] + targets['joint_img'][:, smpl_x.joint_part[part_name], :][ + bid, mask, :2]).mean(0)) + trans = torch.stack(trans)[:, None, :] + coord = coord + trans # global translation alignment + joint_proj = torch.cat((joint_proj[:, :smpl_x.joint_part[part_name][0], :], coord, + joint_proj[:, smpl_x.joint_part[part_name][-1] + 1:, :]), 1) + + if not (meta_info['face_bbox_valid'] == 0).all(): + # change face projected joint coordinates according to face bbox (cfg.output_hm_shape -> face bbox space) + coord = joint_proj[:, smpl_x.joint_part['face'], :] + trans = [] + for bid in range(coord.shape[0]): + mask = meta_info['joint_trunc'][bid, smpl_x.joint_part['face'], 0] == 1 + if torch.sum(mask) == 0: + trans.append(torch.zeros((2)).float().to(cfg.device)) + else: + trans.append((-coord[bid, mask, :2] + targets['joint_img'][:, smpl_x.joint_part['face'], :][bid, + mask, :2]).mean(0)) + trans = torch.stack(trans)[:, None, :] + coord = coord + trans # global translation alignment + joint_proj = torch.cat((joint_proj[:, :smpl_x.joint_part['face'][0], :], coord, + joint_proj[:, smpl_x.joint_part['face'][-1] + 1:, :]), 1) + + loss['joint_proj'] = self.coord_loss(joint_proj, targets['joint_img'][:, :, :2], meta_info['joint_trunc']) * smplx_kps_2d_weight + loss['joint_img'] = self.coord_loss(joint_img, smpl_x.reduce_joint_set(targets['joint_img']), + smpl_x.reduce_joint_set(meta_info['joint_trunc']), meta_info['is_3D']) * net_kps_2d_weight + + loss['smplx_joint_img'] = self.coord_loss(joint_img, smpl_x.reduce_joint_set(targets['smplx_joint_img']), + smpl_x.reduce_joint_set(meta_info['smplx_joint_trunc'])) * net_kps_2d_weight + + return loss + else: + # change hand output joint_img according to hand bbox + for part_name, bbox in (('lhand', lhand_bbox), ('rhand', rhand_bbox)): + joint_img[:, smpl_x.pos_joint_part[part_name], 0] *= ( + ((bbox[:, None, 2] - bbox[:, None, 0]) / cfg.input_body_shape[1] * cfg.output_hm_shape[2]) / + cfg.output_hand_hm_shape[2]) + joint_img[:, smpl_x.pos_joint_part[part_name], 0] += ( + bbox[:, None, 0] / cfg.input_body_shape[1] * cfg.output_hm_shape[2]) + joint_img[:, smpl_x.pos_joint_part[part_name], 1] *= ( + ((bbox[:, None, 3] - bbox[:, None, 1]) / cfg.input_body_shape[0] * cfg.output_hm_shape[1]) / + cfg.output_hand_hm_shape[1]) + joint_img[:, smpl_x.pos_joint_part[part_name], 1] += ( + bbox[:, None, 1] / cfg.input_body_shape[0] * cfg.output_hm_shape[1]) + + # change input_body_shape to input_img_shape + for bbox in (lhand_bbox, rhand_bbox, face_bbox): + bbox[:, 0] *= cfg.input_img_shape[1] / cfg.input_body_shape[1] + bbox[:, 1] *= cfg.input_img_shape[0] / cfg.input_body_shape[0] + bbox[:, 2] *= cfg.input_img_shape[1] / cfg.input_body_shape[1] + bbox[:, 3] *= cfg.input_img_shape[0] / cfg.input_body_shape[0] + + # test output + out = {} + out['img'] = inputs['img'] + out['joint_img'] = joint_img + out['smplx_joint_proj'] = joint_proj + out['smplx_mesh_cam'] = mesh_cam + out['smplx_root_pose'] = root_pose + out['smplx_body_pose'] = body_pose + out['smplx_lhand_pose'] = lhand_pose + out['smplx_rhand_pose'] = rhand_pose + out['smplx_jaw_pose'] = jaw_pose + out['smplx_shape'] = shape + out['smplx_expr'] = expr + out['cam_trans'] = cam_trans + out['lhand_bbox'] = lhand_bbox + out['rhand_bbox'] = rhand_bbox + out['face_bbox'] = face_bbox + if 'smplx_shape' in targets: + out['smplx_shape_target'] = targets['smplx_shape'] + 
if 'img_path' in meta_info: + out['img_path'] = meta_info['img_path'] + if 'smplx_pose' in targets: + out['smplx_mesh_cam_pseudo_gt'] = mesh_pseudo_gt + if 'smplx_mesh_cam' in targets: + out['smplx_mesh_cam_target'] = targets['smplx_mesh_cam'] + if 'smpl_mesh_cam' in targets: + out['smpl_mesh_cam_target'] = targets['smpl_mesh_cam'] + if 'bb2img_trans' in meta_info: + out['bb2img_trans'] = meta_info['bb2img_trans'] + if 'gt_smplx_transl' in meta_info: + out['gt_smplx_transl'] = meta_info['gt_smplx_transl'] + + return out + +def init_weights(m): + try: + if type(m) == nn.ConvTranspose2d: + nn.init.normal_(m.weight, std=0.001) + elif type(m) == nn.Conv2d: + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + elif type(m) == nn.BatchNorm2d: + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif type(m) == nn.Linear: + nn.init.normal_(m.weight, std=0.01) + nn.init.constant_(m.bias, 0) + except AttributeError: + pass + + +def get_model(mode): + + # body + vit_cfg = Config.fromfile(cfg.encoder_config_file) + vit = build_posenet(vit_cfg.model) + body_position_net = PositionNet('body', feat_dim=cfg.feat_dim) + body_rotation_net = BodyRotationNet(feat_dim=cfg.feat_dim) + box_net = BoxNet(feat_dim=cfg.feat_dim) + + # hand + hand_position_net = PositionNet('hand', feat_dim=cfg.feat_dim) + hand_roi_net = HandRoI(feat_dim=cfg.feat_dim, upscale=cfg.upscale) + hand_rotation_net = HandRotationNet('hand', feat_dim=cfg.feat_dim) + + # face + face_regressor = FaceRegressor(feat_dim=cfg.feat_dim) + + if mode == 'train': + # body + if not getattr(cfg, 'random_init', False): + encoder_pretrained_model = torch.load(cfg.encoder_pretrained_model_path)['state_dict'] + vit.load_state_dict(encoder_pretrained_model, strict=False) + print(f"Initialize encoder from {cfg.encoder_pretrained_model_path}") + else: + print('Random init!!!!!!!') + + body_position_net.apply(init_weights) + body_rotation_net.apply(init_weights) + box_net.apply(init_weights) + + # hand + hand_position_net.apply(init_weights) + hand_roi_net.apply(init_weights) + hand_rotation_net.apply(init_weights) + + # face + face_regressor.apply(init_weights) + + encoder = vit.backbone + + model = Model(encoder, body_position_net, body_rotation_net, box_net, hand_position_net, hand_roi_net, hand_rotation_net, + face_regressor) + return model \ No newline at end of file diff --git a/main/_base_/datasets/300w.py b/main/_base_/datasets/300w.py new file mode 100644 index 0000000000000000000000000000000000000000..10c343a2adf84947159f2651b3e918d1fc32ea90 --- /dev/null +++ b/main/_base_/datasets/300w.py @@ -0,0 +1,384 @@ +dataset_info = dict( + dataset_name='300w', + paper_info=dict( + author='Sagonas, Christos and Antonakos, Epameinondas ' + 'and Tzimiropoulos, Georgios and Zafeiriou, Stefanos ' + 'and Pantic, Maja', + title='300 faces in-the-wild challenge: ' + 'Database and results', + container='Image and vision computing', + year='2016', + homepage='https://ibug.doc.ic.ac.uk/resources/300-W/', + ), + keypoint_info={ + 0: + dict( + name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-16'), + 1: + dict( + name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-15'), + 2: + dict( + name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-14'), + 3: + dict( + name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-13'), + 4: + dict( + name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-12'), + 5: + dict( + name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-11'), + 6: + dict( + 
name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-10'), + 7: + dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-9'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap=''), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-7'), + 10: + dict( + name='kpt-10', id=10, color=[255, 255, 255], type='', + swap='kpt-6'), + 11: + dict( + name='kpt-11', id=11, color=[255, 255, 255], type='', + swap='kpt-5'), + 12: + dict( + name='kpt-12', id=12, color=[255, 255, 255], type='', + swap='kpt-4'), + 13: + dict( + name='kpt-13', id=13, color=[255, 255, 255], type='', + swap='kpt-3'), + 14: + dict( + name='kpt-14', id=14, color=[255, 255, 255], type='', + swap='kpt-2'), + 15: + dict( + name='kpt-15', id=15, color=[255, 255, 255], type='', + swap='kpt-1'), + 16: + dict( + name='kpt-16', id=16, color=[255, 255, 255], type='', + swap='kpt-0'), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-26'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-25'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-24'), + 20: + dict( + name='kpt-20', + id=20, + color=[255, 255, 255], + type='', + swap='kpt-23'), + 21: + dict( + name='kpt-21', + id=21, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-21'), + 23: + dict( + name='kpt-23', + id=23, + color=[255, 255, 255], + type='', + swap='kpt-20'), + 24: + dict( + name='kpt-24', + id=24, + color=[255, 255, 255], + type='', + swap='kpt-19'), + 25: + dict( + name='kpt-25', + id=25, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 26: + dict( + name='kpt-26', + id=26, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 27: + dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap=''), + 29: + dict(name='kpt-29', id=29, color=[255, 255, 255], type='', swap=''), + 30: + dict(name='kpt-30', id=30, color=[255, 255, 255], type='', swap=''), + 31: + dict( + name='kpt-31', + id=31, + color=[255, 255, 255], + type='', + swap='kpt-35'), + 32: + dict( + name='kpt-32', + id=32, + color=[255, 255, 255], + type='', + swap='kpt-34'), + 33: + dict(name='kpt-33', id=33, color=[255, 255, 255], type='', swap=''), + 34: + dict( + name='kpt-34', + id=34, + color=[255, 255, 255], + type='', + swap='kpt-32'), + 35: + dict( + name='kpt-35', + id=35, + color=[255, 255, 255], + type='', + swap='kpt-31'), + 36: + dict( + name='kpt-36', + id=36, + color=[255, 255, 255], + type='', + swap='kpt-45'), + 37: + dict( + name='kpt-37', + id=37, + color=[255, 255, 255], + type='', + swap='kpt-44'), + 38: + dict( + name='kpt-38', + id=38, + color=[255, 255, 255], + type='', + swap='kpt-43'), + 39: + dict( + name='kpt-39', + id=39, + color=[255, 255, 255], + type='', + swap='kpt-42'), + 40: + dict( + name='kpt-40', + id=40, + color=[255, 255, 255], + type='', + swap='kpt-47'), + 41: + dict( + name='kpt-41', + id=41, + color=[255, 255, 255], + type='', + swap='kpt-46'), + 42: + dict( + name='kpt-42', + id=42, + color=[255, 255, 255], + type='', + swap='kpt-39'), + 43: + dict( + name='kpt-43', + id=43, + color=[255, 255, 255], + type='', + swap='kpt-38'), + 44: + dict( + name='kpt-44', + id=44, + color=[255, 255, 255], + type='', + swap='kpt-37'), + 45: + dict( + name='kpt-45', + id=45, + color=[255, 255, 255], + type='', + swap='kpt-36'), + 
46: + dict( + name='kpt-46', + id=46, + color=[255, 255, 255], + type='', + swap='kpt-41'), + 47: + dict( + name='kpt-47', + id=47, + color=[255, 255, 255], + type='', + swap='kpt-40'), + 48: + dict( + name='kpt-48', + id=48, + color=[255, 255, 255], + type='', + swap='kpt-54'), + 49: + dict( + name='kpt-49', + id=49, + color=[255, 255, 255], + type='', + swap='kpt-53'), + 50: + dict( + name='kpt-50', + id=50, + color=[255, 255, 255], + type='', + swap='kpt-52'), + 51: + dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict( + name='kpt-52', + id=52, + color=[255, 255, 255], + type='', + swap='kpt-50'), + 53: + dict( + name='kpt-53', + id=53, + color=[255, 255, 255], + type='', + swap='kpt-49'), + 54: + dict( + name='kpt-54', + id=54, + color=[255, 255, 255], + type='', + swap='kpt-48'), + 55: + dict( + name='kpt-55', + id=55, + color=[255, 255, 255], + type='', + swap='kpt-59'), + 56: + dict( + name='kpt-56', + id=56, + color=[255, 255, 255], + type='', + swap='kpt-58'), + 57: + dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='kpt-58', + id=58, + color=[255, 255, 255], + type='', + swap='kpt-56'), + 59: + dict( + name='kpt-59', + id=59, + color=[255, 255, 255], + type='', + swap='kpt-55'), + 60: + dict( + name='kpt-60', + id=60, + color=[255, 255, 255], + type='', + swap='kpt-64'), + 61: + dict( + name='kpt-61', + id=61, + color=[255, 255, 255], + type='', + swap='kpt-63'), + 62: + dict(name='kpt-62', id=62, color=[255, 255, 255], type='', swap=''), + 63: + dict( + name='kpt-63', + id=63, + color=[255, 255, 255], + type='', + swap='kpt-61'), + 64: + dict( + name='kpt-64', + id=64, + color=[255, 255, 255], + type='', + swap='kpt-60'), + 65: + dict( + name='kpt-65', + id=65, + color=[255, 255, 255], + type='', + swap='kpt-67'), + 66: + dict(name='kpt-66', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='kpt-67', + id=67, + color=[255, 255, 255], + type='', + swap='kpt-65'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 68, + sigmas=[]) diff --git a/main/_base_/datasets/aflw.py b/main/_base_/datasets/aflw.py new file mode 100644 index 0000000000000000000000000000000000000000..bf534cbb756e8c514c2f5e2a7fceedd55afb637e --- /dev/null +++ b/main/_base_/datasets/aflw.py @@ -0,0 +1,83 @@ +dataset_info = dict( + dataset_name='aflw', + paper_info=dict( + author='Koestinger, Martin and Wohlhart, Paul and ' + 'Roth, Peter M and Bischof, Horst', + title='Annotated facial landmarks in the wild: ' + 'A large-scale, real-world database for facial ' + 'landmark localization', + container='2011 IEEE international conference on computer ' + 'vision workshops (ICCV workshops)', + year='2011', + homepage='https://www.tugraz.at/institute/icg/research/' + 'team-bischof/lrs/downloads/aflw/', + ), + keypoint_info={ + 0: + dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-5'), + 1: + dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-4'), + 2: + dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'), + 3: + dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'), + 4: + dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-1'), + 5: + dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-0'), + 6: + dict( + name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-11'), + 7: + dict( + name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-10'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'), + 10: + dict( + name='kpt-10', id=10, color=[255, 255, 255], type='', + swap='kpt-7'), + 11: + dict( + name='kpt-11', id=11, color=[255, 255, 255], type='', + swap='kpt-6'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 13: + dict(name='kpt-13', id=13, color=[255, 255, 255], type='', swap=''), + 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 255], + type='', + swap='kpt-12'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 16: + dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 18: + dict(name='kpt-18', id=18, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={}, + joint_weights=[1.] 
* 19, + sigmas=[]) diff --git a/main/_base_/datasets/aic.py b/main/_base_/datasets/aic.py new file mode 100644 index 0000000000000000000000000000000000000000..9ecdbe3f0afeb19dbb7aed42653ce5efd85cfda3 --- /dev/null +++ b/main/_base_/datasets/aic.py @@ -0,0 +1,140 @@ +dataset_info = dict( + dataset_name='aic', + paper_info=dict( + author='Wu, Jiahong and Zheng, He and Zhao, Bo and ' + 'Li, Yixin and Yan, Baoming and Liang, Rui and ' + 'Wang, Wenjia and Zhou, Shipei and Lin, Guosen and ' + 'Fu, Yanwei and others', + title='Ai challenger: A large-scale dataset for going ' + 'deeper in image understanding', + container='arXiv', + year='2017', + homepage='https://github.com/AIChallenger/AI_Challenger_2017', + ), + keypoint_info={ + 0: + dict( + name='right_shoulder', + id=0, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 1: + dict( + name='right_elbow', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 2: + dict( + name='right_wrist', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='right_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 7: + dict( + name='right_knee', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 8: + dict( + name='right_ankle', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 9: + dict( + name='left_hip', + id=9, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 10: + dict( + name='left_knee', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='left_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 12: + dict( + name='head_top', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_elbow'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_elbow', 'right_shoulder'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_shoulder', 'neck'), id=2, color=[51, 153, 255]), + 3: dict(link=('neck', 'left_shoulder'), id=3, color=[51, 153, 255]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('right_ankle', 'right_knee'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_knee', 'right_hip'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_hip', 'left_hip'), id=8, color=[51, 153, 255]), + 9: dict(link=('left_hip', 'left_knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_knee', 'left_ankle'), id=10, color=[0, 255, 0]), + 11: dict(link=('head_top', 'neck'), id=11, color=[51, 153, 255]), + 12: dict( + link=('right_shoulder', 'right_hip'), id=12, color=[51, 153, 255]), + 13: + dict(link=('left_shoulder', 'left_hip'), id=13, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1. 
+ ], + + # 'https://github.com/AIChallenger/AI_Challenger_2017/blob/master/' + # 'Evaluation/keypoint_eval/keypoint_eval.py#L50' + # delta = 2 x sigma + sigmas=[ + 0.01388152, 0.01515228, 0.01057665, 0.01417709, 0.01497891, 0.01402144, + 0.03909642, 0.03686941, 0.01981803, 0.03843971, 0.03412318, 0.02415081, + 0.01291456, 0.01236173 + ]) diff --git a/main/_base_/datasets/animalpose.py b/main/_base_/datasets/animalpose.py new file mode 100644 index 0000000000000000000000000000000000000000..d5bb62d951b71da25e679bd755fe566216dc3f6f --- /dev/null +++ b/main/_base_/datasets/animalpose.py @@ -0,0 +1,166 @@ +dataset_info = dict( + dataset_name='animalpose', + paper_info=dict( + author='Cao, Jinkun and Tang, Hongyang and Fang, Hao-Shu and ' + 'Shen, Xiaoyong and Lu, Cewu and Tai, Yu-Wing', + title='Cross-Domain Adaptation for Animal Pose Estimation', + container='The IEEE International Conference on ' + 'Computer Vision (ICCV)', + year='2019', + homepage='https://sites.google.com/view/animal-pose/', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict( + name='L_EarBase', + id=2, + color=[0, 255, 0], + type='upper', + swap='R_EarBase'), + 3: + dict( + name='R_EarBase', + id=3, + color=[255, 128, 0], + type='upper', + swap='L_EarBase'), + 4: + dict(name='Nose', id=4, color=[51, 153, 255], type='upper', swap=''), + 5: + dict(name='Throat', id=5, color=[51, 153, 255], type='upper', swap=''), + 6: + dict( + name='TailBase', id=6, color=[51, 153, 255], type='lower', + swap=''), + 7: + dict( + name='Withers', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='L_F_Elbow', + id=8, + color=[0, 255, 0], + type='upper', + swap='R_F_Elbow'), + 9: + dict( + name='R_F_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_F_Elbow'), + 10: + dict( + name='L_B_Elbow', + id=10, + color=[0, 255, 0], + type='lower', + swap='R_B_Elbow'), + 11: + dict( + name='R_B_Elbow', + id=11, + color=[255, 128, 0], + type='lower', + swap='L_B_Elbow'), + 12: + dict( + name='L_F_Knee', + id=12, + color=[0, 255, 0], + type='upper', + swap='R_F_Knee'), + 13: + dict( + name='R_F_Knee', + id=13, + color=[255, 128, 0], + type='upper', + swap='L_F_Knee'), + 14: + dict( + name='L_B_Knee', + id=14, + color=[0, 255, 0], + type='lower', + swap='R_B_Knee'), + 15: + dict( + name='R_B_Knee', + id=15, + color=[255, 128, 0], + type='lower', + swap='L_B_Knee'), + 16: + dict( + name='L_F_Paw', + id=16, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 17: + dict( + name='R_F_Paw', + id=17, + color=[255, 128, 0], + type='upper', + swap='L_F_Paw'), + 18: + dict( + name='L_B_Paw', + id=18, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 19: + dict( + name='R_B_Paw', + id=19, + color=[255, 128, 0], + type='lower', + swap='L_B_Paw') + }, + skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[51, 153, 255]), + 1: dict(link=('L_Eye', 'L_EarBase'), id=1, color=[0, 255, 0]), + 2: dict(link=('R_Eye', 'R_EarBase'), id=2, color=[255, 128, 0]), + 3: dict(link=('L_Eye', 'Nose'), id=3, color=[0, 255, 0]), + 4: dict(link=('R_Eye', 'Nose'), id=4, color=[255, 128, 0]), + 5: dict(link=('Nose', 'Throat'), id=5, color=[51, 153, 255]), + 6: dict(link=('Throat', 'Withers'), id=6, color=[51, 153, 255]), + 7: dict(link=('TailBase', 'Withers'), id=7, color=[51, 153, 255]), + 8: dict(link=('Throat', 'L_F_Elbow'), id=8, color=[0, 255, 0]), + 9: 
dict(link=('L_F_Elbow', 'L_F_Knee'), id=9, color=[0, 255, 0]), + 10: dict(link=('L_F_Knee', 'L_F_Paw'), id=10, color=[0, 255, 0]), + 11: dict(link=('Throat', 'R_F_Elbow'), id=11, color=[255, 128, 0]), + 12: dict(link=('R_F_Elbow', 'R_F_Knee'), id=12, color=[255, 128, 0]), + 13: dict(link=('R_F_Knee', 'R_F_Paw'), id=13, color=[255, 128, 0]), + 14: dict(link=('TailBase', 'L_B_Elbow'), id=14, color=[0, 255, 0]), + 15: dict(link=('L_B_Elbow', 'L_B_Knee'), id=15, color=[0, 255, 0]), + 16: dict(link=('L_B_Knee', 'L_B_Paw'), id=16, color=[0, 255, 0]), + 17: dict(link=('TailBase', 'R_B_Elbow'), id=17, color=[255, 128, 0]), + 18: dict(link=('R_B_Elbow', 'R_B_Knee'), id=18, color=[255, 128, 0]), + 19: dict(link=('R_B_Knee', 'R_B_Paw'), id=19, color=[255, 128, 0]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, + 1.5, 1.5, 1.5, 1.5 + ], + + # Note: The original paper did not provide enough information about + # the sigmas. We modified from 'https://github.com/cocodataset/' + # 'cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py#L523' + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.10, 0.10, 0.10, 0.107, 0.107, + 0.107, 0.107, 0.087, 0.087, 0.087, 0.087, 0.089, 0.089, 0.089, 0.089 + ]) diff --git a/main/_base_/datasets/ap10k.py b/main/_base_/datasets/ap10k.py new file mode 100644 index 0000000000000000000000000000000000000000..c0df579acbb8cf0de1ef62412ba865ee8710f0aa --- /dev/null +++ b/main/_base_/datasets/ap10k.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='ap10k', + paper_info=dict( + author='Yu, Hang and Xu, Yufei and Zhang, Jing and ' + 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng', + title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild', + container='35th Conference on Neural Information Processing Systems ' + '(NeurIPS 2021) Track on Datasets and Bench-marks.', + year='2021', + homepage='https://github.com/AlexTheBad/AP-10K', + ), + keypoint_info={ + 0: + dict( + name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'), + 1: + dict( + name='R_Eye', + id=1, + color=[255, 128, 0], + type='upper', + swap='L_Eye'), + 2: + dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''), + 4: + dict( + name='Root of tail', + id=4, + color=[51, 153, 255], + type='lower', + swap=''), + 5: + dict( + name='L_Shoulder', + id=5, + color=[51, 153, 255], + type='upper', + swap='R_Shoulder'), + 6: + dict( + name='L_Elbow', + id=6, + color=[51, 153, 255], + type='upper', + swap='R_Elbow'), + 7: + dict( + name='L_F_Paw', + id=7, + color=[0, 255, 0], + type='upper', + swap='R_F_Paw'), + 8: + dict( + name='R_Shoulder', + id=8, + color=[0, 255, 0], + type='upper', + swap='L_Shoulder'), + 9: + dict( + name='R_Elbow', + id=9, + color=[255, 128, 0], + type='upper', + swap='L_Elbow'), + 10: + dict( + name='R_F_Paw', + id=10, + color=[0, 255, 0], + type='lower', + swap='L_F_Paw'), + 11: + dict( + name='L_Hip', + id=11, + color=[255, 128, 0], + type='lower', + swap='R_Hip'), + 12: + dict( + name='L_Knee', + id=12, + color=[255, 128, 0], + type='lower', + swap='R_Knee'), + 13: + dict( + name='L_B_Paw', + id=13, + color=[0, 255, 0], + type='lower', + swap='R_B_Paw'), + 14: + dict( + name='R_Hip', id=14, color=[0, 255, 0], type='lower', + swap='L_Hip'), + 15: + dict( + name='R_Knee', + id=15, + color=[0, 255, 0], + type='lower', + swap='L_Knee'), + 16: + dict( + name='R_B_Paw', + id=16, + color=[0, 255, 0], + type='lower', + swap='L_B_Paw'), + }, + 
skeleton_info={ + 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]), + 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]), + 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]), + 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]), + 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]), + 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]), + 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]), + 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]), + 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]), + 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]), + 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]), + 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]), + 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]), + 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]), + 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]), + 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]), + 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]), + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072, + 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089 + ]) diff --git a/main/_base_/datasets/atrw.py b/main/_base_/datasets/atrw.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec71c8c508a0340139371a651ca2dd56eeae3cf --- /dev/null +++ b/main/_base_/datasets/atrw.py @@ -0,0 +1,144 @@ +dataset_info = dict( + dataset_name='atrw', + paper_info=dict( + author='Li, Shuyuan and Li, Jianguo and Tang, Hanlin ' + 'and Qian, Rui and Lin, Weiyao', + title='ATRW: A Benchmark for Amur Tiger ' + 'Re-identification in the Wild', + container='Proceedings of the 28th ACM ' + 'International Conference on Multimedia', + year='2020', + homepage='https://cvwc2019.github.io/challenge.html', + ), + keypoint_info={ + 0: + dict( + name='left_ear', + id=0, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 1: + dict( + name='right_ear', + id=1, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 2: + dict(name='nose', id=2, color=[51, 153, 255], type='upper', swap=''), + 3: + dict( + name='right_shoulder', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 4: + dict( + name='right_front_paw', + id=4, + color=[255, 128, 0], + type='upper', + swap='left_front_paw'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_front_paw', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_front_paw'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='right_knee', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 9: + dict( + name='right_back_paw', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_back_paw'), + 10: + dict( + name='left_hip', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 11: + dict( + name='left_knee', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 12: + dict( + name='left_back_paw', + id=12, + color=[0, 255, 0], + type='lower', + swap='right_back_paw'), + 13: + dict(name='tail', id=13, color=[51, 153, 255], type='lower', swap=''), + 14: + dict( + name='center', id=14, color=[51, 153, 255], type='lower', 
swap=''), + }, + skeleton_info={ + 0: + dict(link=('left_ear', 'nose'), id=0, color=[51, 153, 255]), + 1: + dict(link=('right_ear', 'nose'), id=1, color=[51, 153, 255]), + 2: + dict(link=('nose', 'center'), id=2, color=[51, 153, 255]), + 3: + dict( + link=('left_shoulder', 'left_front_paw'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_shoulder', 'center'), id=4, color=[0, 255, 0]), + 5: + dict( + link=('right_shoulder', 'right_front_paw'), + id=5, + color=[255, 128, 0]), + 6: + dict(link=('right_shoulder', 'center'), id=6, color=[255, 128, 0]), + 7: + dict(link=('tail', 'center'), id=7, color=[51, 153, 255]), + 8: + dict(link=('right_back_paw', 'right_knee'), id=8, color=[255, 128, 0]), + 9: + dict(link=('right_knee', 'right_hip'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_hip', 'tail'), id=10, color=[255, 128, 0]), + 11: + dict(link=('left_back_paw', 'left_knee'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_knee', 'left_hip'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_hip', 'tail'), id=13, color=[0, 255, 0]), + }, + joint_weights=[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + sigmas=[ + 0.0277, 0.0823, 0.0831, 0.0202, 0.0716, 0.0263, 0.0646, 0.0302, 0.0440, + 0.0316, 0.0333, 0.0547, 0.0263, 0.0683, 0.0539 + ]) diff --git a/main/_base_/datasets/campus.py b/main/_base_/datasets/campus.py new file mode 100644 index 0000000000000000000000000000000000000000..334316e9c25282508767158d3fae30578ab3949d --- /dev/null +++ b/main/_base_/datasets/campus.py @@ -0,0 +1,151 @@ +dataset_info = dict( + dataset_name='campus', + paper_info=dict( + author='Belagiannis, Vasileios and Amin, Sikandar and Andriluka, ' + 'Mykhaylo and Schiele, Bernt and Navab, Nassir and Ilic, Slobodan', + title='3D Pictorial Structures for Multiple Human Pose Estimation', + container='IEEE Computer Society Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://campar.in.tum.de/Chair/MultiHumanPose', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict( + name='right_wrist', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 7: + dict( + name='right_elbow', + id=7, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 8: + dict( + name='right_shoulder', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 9: + dict( + name='left_shoulder', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 10: + dict( + name='left_elbow', + id=10, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 11: + dict( + name='left_wrist', + id=11, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 12: + dict( + name='bottom_head', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict( + name='top_head', + id=13, + color=[51, 153, 255], + type='upper', + swap=''), + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + 
dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('left_hip', 'left_knee'), id=2, color=[0, 255, 0]), + 3: + dict(link=('left_knee', 'left_ankle'), id=3, color=[0, 255, 0]), + 4: + dict(link=('right_hip', 'left_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('right_wrist', 'right_elbow'), id=5, color=[255, 128, 0]), + 6: + dict( + link=('right_elbow', 'right_shoulder'), id=6, color=[255, 128, 0]), + 7: + dict(link=('left_shoulder', 'left_elbow'), id=7, color=[0, 255, 0]), + 8: + dict(link=('left_elbow', 'left_wrist'), id=8, color=[0, 255, 0]), + 9: + dict(link=('right_hip', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_hip', 'left_shoulder'), id=10, color=[0, 255, 0]), + 11: + dict( + link=('right_shoulder', 'bottom_head'), id=11, color=[255, 128, + 0]), + 12: + dict(link=('left_shoulder', 'bottom_head'), id=12, color=[0, 255, 0]), + 13: + dict(link=('bottom_head', 'top_head'), id=13, color=[51, 153, 255]), + }, + joint_weights=[ + 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.0, 1.0 + ], + sigmas=[ + 0.089, 0.087, 0.107, 0.107, 0.087, 0.089, 0.062, 0.072, 0.079, 0.079, + 0.072, 0.062, 0.026, 0.026 + ]) diff --git a/main/_base_/datasets/coco.py b/main/_base_/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..865a95bc02fedd318f32d2e7aa8397147d78fdb5 --- /dev/null +++ b/main/_base_/datasets/coco.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + 
color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/main/_base_/datasets/coco_wholebody.py b/main/_base_/datasets/coco_wholebody.py new file mode 100644 index 0000000000000000000000000000000000000000..ef9b707017a24a1a133bb28566d212c618fee694 --- /dev/null +++ b/main/_base_/datasets/coco_wholebody.py @@ -0,0 +1,1154 @@ +dataset_info = dict( + dataset_name='coco_wholebody', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + 
id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict( + name='left_big_toe', + id=17, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 18: + dict( + name='left_small_toe', + id=18, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 19: + dict( + name='left_heel', + id=19, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 20: + dict( + name='right_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='left_big_toe'), + 21: + dict( + name='right_small_toe', + id=21, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 22: + dict( + name='right_heel', + id=22, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 23: + dict( + name='face-0', + id=23, + color=[255, 255, 255], + type='', + swap='face-16'), + 24: + dict( + name='face-1', + id=24, + color=[255, 255, 255], + type='', + swap='face-15'), + 25: + dict( + name='face-2', + id=25, + color=[255, 255, 255], + type='', + swap='face-14'), + 26: + dict( + name='face-3', + id=26, + color=[255, 255, 255], + type='', + swap='face-13'), + 27: + dict( + name='face-4', + id=27, + color=[255, 255, 255], + type='', + swap='face-12'), + 28: + dict( + name='face-5', + id=28, + color=[255, 255, 255], + type='', + swap='face-11'), + 29: + dict( + name='face-6', + id=29, + color=[255, 255, 255], + type='', + swap='face-10'), + 30: + dict( + name='face-7', + id=30, + color=[255, 255, 255], + type='', + swap='face-9'), + 31: + dict(name='face-8', id=31, color=[255, 255, 255], type='', swap=''), + 32: + dict( + name='face-9', + id=32, + color=[255, 255, 255], + type='', + swap='face-7'), + 33: + dict( + name='face-10', + id=33, + color=[255, 255, 255], + type='', + swap='face-6'), + 34: + dict( + name='face-11', + id=34, + color=[255, 255, 255], + type='', + swap='face-5'), + 35: + dict( + name='face-12', + id=35, + color=[255, 255, 255], + type='', + swap='face-4'), + 36: + dict( + name='face-13', + id=36, + color=[255, 255, 255], + type='', + swap='face-3'), + 37: + dict( + name='face-14', + id=37, + color=[255, 255, 255], + type='', + swap='face-2'), + 38: + dict( + name='face-15', + id=38, + color=[255, 255, 255], + type='', + swap='face-1'), + 39: + dict( + name='face-16', + id=39, + color=[255, 255, 255], + type='', + swap='face-0'), + 40: + dict( + name='face-17', + id=40, + color=[255, 255, 255], + type='', + swap='face-26'), + 41: + dict( + name='face-18', + id=41, + color=[255, 255, 255], + type='', + swap='face-25'), + 42: + dict( + name='face-19', + id=42, + color=[255, 255, 255], + type='', + swap='face-24'), + 43: + dict( + name='face-20', + id=43, + color=[255, 255, 255], + type='', + swap='face-23'), + 44: + dict( + name='face-21', + id=44, + color=[255, 255, 255], + type='', + swap='face-22'), + 45: + dict( + name='face-22', + id=45, + color=[255, 255, 255], + type='', + swap='face-21'), + 46: + dict( + name='face-23', + id=46, 
+ color=[255, 255, 255], + type='', + swap='face-20'), + 47: + dict( + name='face-24', + id=47, + color=[255, 255, 255], + type='', + swap='face-19'), + 48: + dict( + name='face-25', + id=48, + color=[255, 255, 255], + type='', + swap='face-18'), + 49: + dict( + name='face-26', + id=49, + color=[255, 255, 255], + type='', + swap='face-17'), + 50: + dict(name='face-27', id=50, color=[255, 255, 255], type='', swap=''), + 51: + dict(name='face-28', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='face-29', id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='face-30', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict( + name='face-31', + id=54, + color=[255, 255, 255], + type='', + swap='face-35'), + 55: + dict( + name='face-32', + id=55, + color=[255, 255, 255], + type='', + swap='face-34'), + 56: + dict(name='face-33', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-34', + id=57, + color=[255, 255, 255], + type='', + swap='face-32'), + 58: + dict( + name='face-35', + id=58, + color=[255, 255, 255], + type='', + swap='face-31'), + 59: + dict( + name='face-36', + id=59, + color=[255, 255, 255], + type='', + swap='face-45'), + 60: + dict( + name='face-37', + id=60, + color=[255, 255, 255], + type='', + swap='face-44'), + 61: + dict( + name='face-38', + id=61, + color=[255, 255, 255], + type='', + swap='face-43'), + 62: + dict( + name='face-39', + id=62, + color=[255, 255, 255], + type='', + swap='face-42'), + 63: + dict( + name='face-40', + id=63, + color=[255, 255, 255], + type='', + swap='face-47'), + 64: + dict( + name='face-41', + id=64, + color=[255, 255, 255], + type='', + swap='face-46'), + 65: + dict( + name='face-42', + id=65, + color=[255, 255, 255], + type='', + swap='face-39'), + 66: + dict( + name='face-43', + id=66, + color=[255, 255, 255], + type='', + swap='face-38'), + 67: + dict( + name='face-44', + id=67, + color=[255, 255, 255], + type='', + swap='face-37'), + 68: + dict( + name='face-45', + id=68, + color=[255, 255, 255], + type='', + swap='face-36'), + 69: + dict( + name='face-46', + id=69, + color=[255, 255, 255], + type='', + swap='face-41'), + 70: + dict( + name='face-47', + id=70, + color=[255, 255, 255], + type='', + swap='face-40'), + 71: + dict( + name='face-48', + id=71, + color=[255, 255, 255], + type='', + swap='face-54'), + 72: + dict( + name='face-49', + id=72, + color=[255, 255, 255], + type='', + swap='face-53'), + 73: + dict( + name='face-50', + id=73, + color=[255, 255, 255], + type='', + swap='face-52'), + 74: + dict(name='face-51', id=74, color=[255, 255, 255], type='', swap=''), + 75: + dict( + name='face-52', + id=75, + color=[255, 255, 255], + type='', + swap='face-50'), + 76: + dict( + name='face-53', + id=76, + color=[255, 255, 255], + type='', + swap='face-49'), + 77: + dict( + name='face-54', + id=77, + color=[255, 255, 255], + type='', + swap='face-48'), + 78: + dict( + name='face-55', + id=78, + color=[255, 255, 255], + type='', + swap='face-59'), + 79: + dict( + name='face-56', + id=79, + color=[255, 255, 255], + type='', + swap='face-58'), + 80: + dict(name='face-57', id=80, color=[255, 255, 255], type='', swap=''), + 81: + dict( + name='face-58', + id=81, + color=[255, 255, 255], + type='', + swap='face-56'), + 82: + dict( + name='face-59', + id=82, + color=[255, 255, 255], + type='', + swap='face-55'), + 83: + dict( + name='face-60', + id=83, + color=[255, 255, 255], + type='', + swap='face-64'), + 84: + dict( + name='face-61', + id=84, + color=[255, 255, 255], + 
type='', + swap='face-63'), + 85: + dict(name='face-62', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='face-63', + id=86, + color=[255, 255, 255], + type='', + swap='face-61'), + 87: + dict( + name='face-64', + id=87, + color=[255, 255, 255], + type='', + swap='face-60'), + 88: + dict( + name='face-65', + id=88, + color=[255, 255, 255], + type='', + swap='face-67'), + 89: + dict(name='face-66', id=89, color=[255, 255, 255], type='', swap=''), + 90: + dict( + name='face-67', + id=90, + color=[255, 255, 255], + type='', + swap='face-65'), + 91: + dict( + name='left_hand_root', + id=91, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 92: + dict( + name='left_thumb1', + id=92, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 93: + dict( + name='left_thumb2', + id=93, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 94: + dict( + name='left_thumb3', + id=94, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 95: + dict( + name='left_thumb4', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 96: + dict( + name='left_forefinger1', + id=96, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 97: + dict( + name='left_forefinger2', + id=97, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 98: + dict( + name='left_forefinger3', + id=98, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 99: + dict( + name='left_forefinger4', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 100: + dict( + name='left_middle_finger1', + id=100, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 101: + dict( + name='left_middle_finger2', + id=101, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 102: + dict( + name='left_middle_finger3', + id=102, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 103: + dict( + name='left_middle_finger4', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 104: + dict( + name='left_ring_finger1', + id=104, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 105: + dict( + name='left_ring_finger2', + id=105, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 106: + dict( + name='left_ring_finger3', + id=106, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 107: + dict( + name='left_ring_finger4', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 108: + dict( + name='left_pinky_finger1', + id=108, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 109: + dict( + name='left_pinky_finger2', + id=109, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 110: + dict( + name='left_pinky_finger3', + id=110, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 111: + dict( + name='left_pinky_finger4', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 112: + dict( + name='right_hand_root', + id=112, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 113: + dict( + name='right_thumb1', + id=113, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 114: + dict( + name='right_thumb2', + id=114, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 115: + dict( + name='right_thumb3', + id=115, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 116: + dict( + name='right_thumb4', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 117: + dict( + name='right_forefinger1', 
+ id=117, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 118: + dict( + name='right_forefinger2', + id=118, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 119: + dict( + name='right_forefinger3', + id=119, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 120: + dict( + name='right_forefinger4', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 121: + dict( + name='right_middle_finger1', + id=121, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 122: + dict( + name='right_middle_finger2', + id=122, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 123: + dict( + name='right_middle_finger3', + id=123, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 124: + dict( + name='right_middle_finger4', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 125: + dict( + name='right_ring_finger1', + id=125, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 126: + dict( + name='right_ring_finger2', + id=126, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 127: + dict( + name='right_ring_finger3', + id=127, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 128: + dict( + name='right_ring_finger4', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 129: + dict( + name='right_pinky_finger1', + id=129, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 130: + dict( + name='right_pinky_finger2', + id=130, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 131: + dict( + name='right_pinky_finger3', + id=131, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 132: + dict( + name='right_pinky_finger4', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ankle', 'left_big_toe'), id=19, color=[0, 255, 0]), + 20: + dict(link=('left_ankle', 'left_small_toe'), id=20, color=[0, 255, 0]), + 21: + dict(link=('left_ankle', 'left_heel'), id=21, color=[0, 255, 
0]), + 22: + dict( + link=('right_ankle', 'right_big_toe'), id=22, color=[255, 128, 0]), + 23: + dict( + link=('right_ankle', 'right_small_toe'), + id=23, + color=[255, 128, 0]), + 24: + dict(link=('right_ankle', 'right_heel'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('left_hand_root', 'left_thumb1'), id=25, color=[255, 128, + 0]), + 26: + dict(link=('left_thumb1', 'left_thumb2'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_thumb2', 'left_thumb3'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb3', 'left_thumb4'), id=28, color=[255, 128, 0]), + 29: + dict( + link=('left_hand_root', 'left_forefinger1'), + id=29, + color=[255, 153, 255]), + 30: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=30, + color=[255, 153, 255]), + 31: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=32, + color=[255, 153, 255]), + 33: + dict( + link=('left_hand_root', 'left_middle_finger1'), + id=33, + color=[102, 178, 255]), + 34: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=34, + color=[102, 178, 255]), + 35: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_hand_root', 'left_ring_finger1'), + id=37, + color=[255, 51, 51]), + 38: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=38, + color=[255, 51, 51]), + 39: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_hand_root', 'left_pinky_finger1'), + id=41, + color=[0, 255, 0]), + 42: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=42, + color=[0, 255, 0]), + 43: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('right_hand_root', 'right_thumb1'), + id=45, + color=[255, 128, 0]), + 46: + dict( + link=('right_thumb1', 'right_thumb2'), id=46, color=[255, 128, 0]), + 47: + dict( + link=('right_thumb2', 'right_thumb3'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb3', 'right_thumb4'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_hand_root', 'right_forefinger1'), + id=49, + color=[255, 153, 255]), + 50: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=50, + color=[255, 153, 255]), + 51: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_hand_root', 'right_middle_finger1'), + id=53, + color=[102, 178, 255]), + 54: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=54, + color=[102, 178, 255]), + 55: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_hand_root', 'right_ring_finger1'), + id=57, + color=[255, 51, 51]), + 58: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=58, + color=[255, 51, 51]), + 59: + dict( + 
link=('right_ring_finger2', 'right_ring_finger3'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_hand_root', 'right_pinky_finger1'), + id=61, + color=[0, 255, 0]), + 62: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=62, + color=[0, 255, 0]), + 63: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=64, + color=[0, 255, 0]) + }, + joint_weights=[1.] * 133, + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L175' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.068, 0.066, 0.066, + 0.092, 0.094, 0.094, 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, + 0.025, 0.020, 0.023, 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, + 0.013, 0.012, 0.011, 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, + 0.009, 0.007, 0.007, 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, + 0.011, 0.009, 0.011, 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, + 0.034, 0.008, 0.008, 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, + 0.009, 0.009, 0.007, 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, + 0.008, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, + 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, + 0.022, 0.031, 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, + 0.035, 0.018, 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, + 0.019, 0.022, 0.031 + ]) diff --git a/main/_base_/datasets/coco_wholebody_face.py b/main/_base_/datasets/coco_wholebody_face.py new file mode 100644 index 0000000000000000000000000000000000000000..7c9ee3350e3bd67ab1825344849487834c71c82b --- /dev/null +++ b/main/_base_/datasets/coco_wholebody_face.py @@ -0,0 +1,448 @@ +dataset_info = dict( + dataset_name='coco_wholebody_face', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict( + name='face-0', + id=0, + color=[255, 255, 255], + type='', + swap='face-16'), + 1: + dict( + name='face-1', + id=1, + color=[255, 255, 255], + type='', + swap='face-15'), + 2: + dict( + name='face-2', + id=2, + color=[255, 255, 255], + type='', + swap='face-14'), + 3: + dict( + name='face-3', + id=3, + color=[255, 255, 255], + type='', + swap='face-13'), + 4: + dict( + name='face-4', + id=4, + color=[255, 255, 255], + type='', + swap='face-12'), + 5: + dict( + name='face-5', + id=5, + color=[255, 255, 255], + type='', + swap='face-11'), + 6: + dict( + name='face-6', + id=6, + color=[255, 255, 255], + type='', + swap='face-10'), + 7: + dict( + name='face-7', id=7, color=[255, 255, 255], type='', + swap='face-9'), + 8: + dict(name='face-8', id=8, color=[255, 255, 255], type='', swap=''), + 9: + dict( + name='face-9', id=9, color=[255, 255, 255], type='', + swap='face-7'), + 10: + dict( + name='face-10', + id=10, + color=[255, 255, 255], + type='', + swap='face-6'), + 11: + dict( + name='face-11', + id=11, + color=[255, 255, 255], + type='', + swap='face-5'), + 12: + dict( + name='face-12', + 
id=12, + color=[255, 255, 255], + type='', + swap='face-4'), + 13: + dict( + name='face-13', + id=13, + color=[255, 255, 255], + type='', + swap='face-3'), + 14: + dict( + name='face-14', + id=14, + color=[255, 255, 255], + type='', + swap='face-2'), + 15: + dict( + name='face-15', + id=15, + color=[255, 255, 255], + type='', + swap='face-1'), + 16: + dict( + name='face-16', + id=16, + color=[255, 255, 255], + type='', + swap='face-0'), + 17: + dict( + name='face-17', + id=17, + color=[255, 255, 255], + type='', + swap='face-26'), + 18: + dict( + name='face-18', + id=18, + color=[255, 255, 255], + type='', + swap='face-25'), + 19: + dict( + name='face-19', + id=19, + color=[255, 255, 255], + type='', + swap='face-24'), + 20: + dict( + name='face-20', + id=20, + color=[255, 255, 255], + type='', + swap='face-23'), + 21: + dict( + name='face-21', + id=21, + color=[255, 255, 255], + type='', + swap='face-22'), + 22: + dict( + name='face-22', + id=22, + color=[255, 255, 255], + type='', + swap='face-21'), + 23: + dict( + name='face-23', + id=23, + color=[255, 255, 255], + type='', + swap='face-20'), + 24: + dict( + name='face-24', + id=24, + color=[255, 255, 255], + type='', + swap='face-19'), + 25: + dict( + name='face-25', + id=25, + color=[255, 255, 255], + type='', + swap='face-18'), + 26: + dict( + name='face-26', + id=26, + color=[255, 255, 255], + type='', + swap='face-17'), + 27: + dict(name='face-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='face-28', id=28, color=[255, 255, 255], type='', swap=''), + 29: + dict(name='face-29', id=29, color=[255, 255, 255], type='', swap=''), + 30: + dict(name='face-30', id=30, color=[255, 255, 255], type='', swap=''), + 31: + dict( + name='face-31', + id=31, + color=[255, 255, 255], + type='', + swap='face-35'), + 32: + dict( + name='face-32', + id=32, + color=[255, 255, 255], + type='', + swap='face-34'), + 33: + dict(name='face-33', id=33, color=[255, 255, 255], type='', swap=''), + 34: + dict( + name='face-34', + id=34, + color=[255, 255, 255], + type='', + swap='face-32'), + 35: + dict( + name='face-35', + id=35, + color=[255, 255, 255], + type='', + swap='face-31'), + 36: + dict( + name='face-36', + id=36, + color=[255, 255, 255], + type='', + swap='face-45'), + 37: + dict( + name='face-37', + id=37, + color=[255, 255, 255], + type='', + swap='face-44'), + 38: + dict( + name='face-38', + id=38, + color=[255, 255, 255], + type='', + swap='face-43'), + 39: + dict( + name='face-39', + id=39, + color=[255, 255, 255], + type='', + swap='face-42'), + 40: + dict( + name='face-40', + id=40, + color=[255, 255, 255], + type='', + swap='face-47'), + 41: + dict( + name='face-41', + id=41, + color=[255, 255, 255], + type='', + swap='face-46'), + 42: + dict( + name='face-42', + id=42, + color=[255, 255, 255], + type='', + swap='face-39'), + 43: + dict( + name='face-43', + id=43, + color=[255, 255, 255], + type='', + swap='face-38'), + 44: + dict( + name='face-44', + id=44, + color=[255, 255, 255], + type='', + swap='face-37'), + 45: + dict( + name='face-45', + id=45, + color=[255, 255, 255], + type='', + swap='face-36'), + 46: + dict( + name='face-46', + id=46, + color=[255, 255, 255], + type='', + swap='face-41'), + 47: + dict( + name='face-47', + id=47, + color=[255, 255, 255], + type='', + swap='face-40'), + 48: + dict( + name='face-48', + id=48, + color=[255, 255, 255], + type='', + swap='face-54'), + 49: + dict( + name='face-49', + id=49, + color=[255, 255, 255], + type='', + swap='face-53'), + 50: + dict( + name='face-50', + 
id=50, + color=[255, 255, 255], + type='', + swap='face-52'), + 51: + dict(name='face-51', id=52, color=[255, 255, 255], type='', swap=''), + 52: + dict( + name='face-52', + id=52, + color=[255, 255, 255], + type='', + swap='face-50'), + 53: + dict( + name='face-53', + id=53, + color=[255, 255, 255], + type='', + swap='face-49'), + 54: + dict( + name='face-54', + id=54, + color=[255, 255, 255], + type='', + swap='face-48'), + 55: + dict( + name='face-55', + id=55, + color=[255, 255, 255], + type='', + swap='face-59'), + 56: + dict( + name='face-56', + id=56, + color=[255, 255, 255], + type='', + swap='face-58'), + 57: + dict(name='face-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='face-58', + id=58, + color=[255, 255, 255], + type='', + swap='face-56'), + 59: + dict( + name='face-59', + id=59, + color=[255, 255, 255], + type='', + swap='face-55'), + 60: + dict( + name='face-60', + id=60, + color=[255, 255, 255], + type='', + swap='face-64'), + 61: + dict( + name='face-61', + id=61, + color=[255, 255, 255], + type='', + swap='face-63'), + 62: + dict(name='face-62', id=62, color=[255, 255, 255], type='', swap=''), + 63: + dict( + name='face-63', + id=63, + color=[255, 255, 255], + type='', + swap='face-61'), + 64: + dict( + name='face-64', + id=64, + color=[255, 255, 255], + type='', + swap='face-60'), + 65: + dict( + name='face-65', + id=65, + color=[255, 255, 255], + type='', + swap='face-67'), + 66: + dict(name='face-66', id=66, color=[255, 255, 255], type='', swap=''), + 67: + dict( + name='face-67', + id=67, + color=[255, 255, 255], + type='', + swap='face-65') + }, + skeleton_info={}, + joint_weights=[1.] * 68, + + # 'https://github.com/jin-s13/COCO-WholeBody/blob/master/' + # 'evaluation/myeval_wholebody.py#L177' + sigmas=[ + 0.042, 0.043, 0.044, 0.043, 0.040, 0.035, 0.031, 0.025, 0.020, 0.023, + 0.029, 0.032, 0.037, 0.038, 0.043, 0.041, 0.045, 0.013, 0.012, 0.011, + 0.011, 0.012, 0.012, 0.011, 0.011, 0.013, 0.015, 0.009, 0.007, 0.007, + 0.007, 0.012, 0.009, 0.008, 0.016, 0.010, 0.017, 0.011, 0.009, 0.011, + 0.009, 0.007, 0.013, 0.008, 0.011, 0.012, 0.010, 0.034, 0.008, 0.008, + 0.009, 0.008, 0.008, 0.007, 0.010, 0.008, 0.009, 0.009, 0.009, 0.007, + 0.007, 0.008, 0.011, 0.008, 0.008, 0.008, 0.01, 0.008 + ]) diff --git a/main/_base_/datasets/coco_wholebody_hand.py b/main/_base_/datasets/coco_wholebody_hand.py new file mode 100644 index 0000000000000000000000000000000000000000..1910b2ced5a8b31cd6f83911e41cae9f1a580222 --- /dev/null +++ b/main/_base_/datasets/coco_wholebody_hand.py @@ -0,0 +1,147 @@ +dataset_info = dict( + dataset_name='coco_wholebody_hand', + paper_info=dict( + author='Jin, Sheng and Xu, Lumin and Xu, Jin and ' + 'Wang, Can and Liu, Wentao and ' + 'Qian, Chen and Ouyang, Wanli and Luo, Ping', + title='Whole-Body Human Pose Estimation in the Wild', + container='Proceedings of the European ' + 'Conference on Computer Vision (ECCV)', + year='2020', + homepage='https://github.com/jin-s13/COCO-WholeBody/', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', 
swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[ + 0.029, 0.022, 0.035, 0.037, 0.047, 0.026, 0.025, 0.024, 0.035, 0.018, + 0.024, 0.022, 0.026, 0.017, 0.021, 0.021, 0.032, 0.02, 0.019, 0.022, + 0.031 + ]) diff --git a/main/_base_/datasets/cofw.py b/main/_base_/datasets/cofw.py new file mode 100644 index 0000000000000000000000000000000000000000..2fb7ad2f8d1fdbe868b3691858a370e26b59a105 --- /dev/null +++ b/main/_base_/datasets/cofw.py @@ -0,0 +1,134 @@ +dataset_info = dict( + dataset_name='cofw', + paper_info=dict( + author='Burgos-Artizzu, Xavier P and Perona, ' + r'Pietro and Doll{\'a}r, Piotr', + title='Robust face landmark estimation under occlusion', + container='Proceedings of the IEEE international ' + 'conference on computer vision', + year='2013', + homepage='http://www.vision.caltech.edu/xpburgos/ICCV13/', + ), + keypoint_info={ + 0: + dict(name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-1'), + 1: + dict(name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-0'), + 2: + dict(name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-3'), + 3: + dict(name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-2'), + 4: + dict(name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-6'), + 5: + dict(name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-7'), + 6: + dict(name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-4'), + 7: + dict(name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-5'), + 8: + dict(name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-9'), + 9: + dict(name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-8'), + 10: + dict( + name='kpt-10', + id=10, + color=[255, 255, 255], + type='', + swap='kpt-11'), + 11: + dict( + name='kpt-11', + id=11, + color=[255, 255, 255], + type='', + swap='kpt-10'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 13: + dict( + name='kpt-13', + id=13, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 255], + type='', + swap='kpt-12'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-13'), + 16: + dict( + name='kpt-16', + id=16, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-16'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-19'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 20: + dict(name='kpt-20', id=20, color=[255, 255, 255], type='', swap=''), + 21: + dict(name='kpt-21', id=21, color=[255, 255, 255], type='', swap=''), + 22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-23'), + 23: + dict( + name='kpt-23', + id=23, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 24: + dict(name='kpt-24', id=24, color=[255, 255, 255], type='', swap=''), + 25: + dict(name='kpt-25', id=25, color=[255, 255, 255], type='', swap=''), + 26: + dict(name='kpt-26', id=26, color=[255, 255, 255], type='', swap=''), + 27: + dict(name='kpt-27', id=27, color=[255, 255, 255], type='', swap=''), + 28: + dict(name='kpt-28', id=28, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={}, + joint_weights=[1.] 
* 29, + sigmas=[]) diff --git a/main/_base_/datasets/crowdpose.py b/main/_base_/datasets/crowdpose.py new file mode 100644 index 0000000000000000000000000000000000000000..45086531a601870716eed15a32c5413c0e24b7ae --- /dev/null +++ b/main/_base_/datasets/crowdpose.py @@ -0,0 +1,147 @@ +dataset_info = dict( + dataset_name='crowdpose', + paper_info=dict( + author='Li, Jiefeng and Wang, Can and Zhu, Hao and ' + 'Mao, Yihuan and Fang, Hao-Shu and Lu, Cewu', + title='CrowdPose: Efficient Crowded Scenes Pose Estimation ' + 'and A New Benchmark', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2019', + homepage='https://github.com/Jeff-sjtu/CrowdPose', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[51, 153, 255], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[51, 153, 255], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[51, 153, 255], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[51, 153, 255], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[51, 153, 255], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[255, 128, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[0, 255, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[255, 128, 0], + type='lower', + swap='right_knee'), + 9: + dict( + name='right_knee', + id=9, + color=[0, 255, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[255, 128, 0], + type='lower', + swap='right_ankle'), + 11: + dict( + name='right_ankle', + id=11, + color=[0, 255, 0], + type='lower', + swap='left_ankle'), + 12: + dict( + name='top_head', id=12, color=[255, 128, 0], type='upper', + swap=''), + 13: + dict(name='neck', id=13, color=[0, 255, 0], type='upper', swap='') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('top_head', 'neck'), id=12, color=[51, 153, 255]), + 13: + dict(link=('right_shoulder', 'neck'), id=13, color=[51, 153, 255]), + 14: + dict(link=('left_shoulder', 'neck'), id=14, color=[51, 153, 255]) + }, + joint_weights=[ + 0.2, 0.2, 0.2, 1.3, 1.5, 0.2, 1.3, 1.5, 0.2, 0.2, 0.5, 0.2, 0.2, 0.5 + ], + sigmas=[ + 0.079, 0.079, 0.072, 0.072, 0.062, 0.062, 0.107, 0.107, 0.087, 0.087, + 0.089, 0.089, 0.079, 0.079 + ]) diff --git 
a/main/_base_/datasets/deepfashion_full.py b/main/_base_/datasets/deepfashion_full.py new file mode 100644 index 0000000000000000000000000000000000000000..4d989069ee7253d3a5b5f01c81135b1a472cd4b2 --- /dev/null +++ b/main/_base_/datasets/deepfashion_full.py @@ -0,0 +1,74 @@ +dataset_info = dict( + dataset_name='deepfashion_full', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left collar', + id=0, + color=[255, 255, 255], + type='', + swap='right collar'), + 1: + dict( + name='right collar', + id=1, + color=[255, 255, 255], + type='', + swap='left collar'), + 2: + dict( + name='left sleeve', + id=2, + color=[255, 255, 255], + type='', + swap='right sleeve'), + 3: + dict( + name='right sleeve', + id=3, + color=[255, 255, 255], + type='', + swap='left sleeve'), + 4: + dict( + name='left waistline', + id=0, + color=[255, 255, 255], + type='', + swap='right waistline'), + 5: + dict( + name='right waistline', + id=1, + color=[255, 255, 255], + type='', + swap='left waistline'), + 6: + dict( + name='left hem', + id=2, + color=[255, 255, 255], + type='', + swap='right hem'), + 7: + dict( + name='right hem', + id=3, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] * 8, + sigmas=[]) diff --git a/main/_base_/datasets/deepfashion_lower.py b/main/_base_/datasets/deepfashion_lower.py new file mode 100644 index 0000000000000000000000000000000000000000..db014a1747ca618f93a7d092d29027015b48ae3c --- /dev/null +++ b/main/_base_/datasets/deepfashion_lower.py @@ -0,0 +1,46 @@ +dataset_info = dict( + dataset_name='deepfashion_lower', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left waistline', + id=0, + color=[255, 255, 255], + type='', + swap='right waistline'), + 1: + dict( + name='right waistline', + id=1, + color=[255, 255, 255], + type='', + swap='left waistline'), + 2: + dict( + name='left hem', + id=2, + color=[255, 255, 255], + type='', + swap='right hem'), + 3: + dict( + name='right hem', + id=3, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] 
* 4, + sigmas=[]) diff --git a/main/_base_/datasets/deepfashion_upper.py b/main/_base_/datasets/deepfashion_upper.py new file mode 100644 index 0000000000000000000000000000000000000000..f0b012fd37bee1ba5ed956a7a5465a8623bf0894 --- /dev/null +++ b/main/_base_/datasets/deepfashion_upper.py @@ -0,0 +1,60 @@ +dataset_info = dict( + dataset_name='deepfashion_upper', + paper_info=dict( + author='Liu, Ziwei and Luo, Ping and Qiu, Shi ' + 'and Wang, Xiaogang and Tang, Xiaoou', + title='DeepFashion: Powering Robust Clothes Recognition ' + 'and Retrieval with Rich Annotations', + container='Proceedings of IEEE Conference on Computer ' + 'Vision and Pattern Recognition (CVPR)', + year='2016', + homepage='http://mmlab.ie.cuhk.edu.hk/projects/' + 'DeepFashion/LandmarkDetection.html', + ), + keypoint_info={ + 0: + dict( + name='left collar', + id=0, + color=[255, 255, 255], + type='', + swap='right collar'), + 1: + dict( + name='right collar', + id=1, + color=[255, 255, 255], + type='', + swap='left collar'), + 2: + dict( + name='left sleeve', + id=2, + color=[255, 255, 255], + type='', + swap='right sleeve'), + 3: + dict( + name='right sleeve', + id=3, + color=[255, 255, 255], + type='', + swap='left sleeve'), + 4: + dict( + name='left hem', + id=4, + color=[255, 255, 255], + type='', + swap='right hem'), + 5: + dict( + name='right hem', + id=5, + color=[255, 255, 255], + type='', + swap='left hem'), + }, + skeleton_info={}, + joint_weights=[1.] * 6, + sigmas=[]) diff --git a/main/_base_/datasets/fly.py b/main/_base_/datasets/fly.py new file mode 100644 index 0000000000000000000000000000000000000000..5f94ff57ca93d8f562b6a61b9a67198abdcde217 --- /dev/null +++ b/main/_base_/datasets/fly.py @@ -0,0 +1,237 @@ +dataset_info = dict( + dataset_name='fly', + paper_info=dict( + author='Pereira, Talmo D and Aldarondo, Diego E and ' + 'Willmore, Lindsay and Kislin, Mikhail and ' + 'Wang, Samuel S-H and Murthy, Mala and Shaevitz, Joshua W', + title='Fast animal pose estimation using deep neural networks', + container='Nature methods', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='head', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='eyeL', id=1, color=[255, 255, 255], type='', swap='eyeR'), + 2: + dict(name='eyeR', id=2, color=[255, 255, 255], type='', swap='eyeL'), + 3: + dict(name='neck', id=3, color=[255, 255, 255], type='', swap=''), + 4: + dict(name='thorax', id=4, color=[255, 255, 255], type='', swap=''), + 5: + dict(name='abdomen', id=5, color=[255, 255, 255], type='', swap=''), + 6: + dict( + name='forelegR1', + id=6, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 7: + dict( + name='forelegR2', + id=7, + color=[255, 255, 255], + type='', + swap='forelegL2'), + 8: + dict( + name='forelegR3', + id=8, + color=[255, 255, 255], + type='', + swap='forelegL3'), + 9: + dict( + name='forelegR4', + id=9, + color=[255, 255, 255], + type='', + swap='forelegL4'), + 10: + dict( + name='midlegR1', + id=10, + color=[255, 255, 255], + type='', + swap='midlegL1'), + 11: + dict( + name='midlegR2', + id=11, + color=[255, 255, 255], + type='', + swap='midlegL2'), + 12: + dict( + name='midlegR3', + id=12, + color=[255, 255, 255], + type='', + swap='midlegL3'), + 13: + dict( + name='midlegR4', + id=13, + color=[255, 255, 255], + type='', + swap='midlegL4'), + 14: + dict( + name='hindlegR1', + id=14, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 15: + dict( + name='hindlegR2', + id=15, + color=[255, 255, 255], + 
type='', + swap='hindlegL2'), + 16: + dict( + name='hindlegR3', + id=16, + color=[255, 255, 255], + type='', + swap='hindlegL3'), + 17: + dict( + name='hindlegR4', + id=17, + color=[255, 255, 255], + type='', + swap='hindlegL4'), + 18: + dict( + name='forelegL1', + id=18, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 19: + dict( + name='forelegL2', + id=19, + color=[255, 255, 255], + type='', + swap='forelegR2'), + 20: + dict( + name='forelegL3', + id=20, + color=[255, 255, 255], + type='', + swap='forelegR3'), + 21: + dict( + name='forelegL4', + id=21, + color=[255, 255, 255], + type='', + swap='forelegR4'), + 22: + dict( + name='midlegL1', + id=22, + color=[255, 255, 255], + type='', + swap='midlegR1'), + 23: + dict( + name='midlegL2', + id=23, + color=[255, 255, 255], + type='', + swap='midlegR2'), + 24: + dict( + name='midlegL3', + id=24, + color=[255, 255, 255], + type='', + swap='midlegR3'), + 25: + dict( + name='midlegL4', + id=25, + color=[255, 255, 255], + type='', + swap='midlegR4'), + 26: + dict( + name='hindlegL1', + id=26, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 27: + dict( + name='hindlegL2', + id=27, + color=[255, 255, 255], + type='', + swap='hindlegR2'), + 28: + dict( + name='hindlegL3', + id=28, + color=[255, 255, 255], + type='', + swap='hindlegR3'), + 29: + dict( + name='hindlegL4', + id=29, + color=[255, 255, 255], + type='', + swap='hindlegR4'), + 30: + dict( + name='wingL', id=30, color=[255, 255, 255], type='', swap='wingR'), + 31: + dict( + name='wingR', id=31, color=[255, 255, 255], type='', swap='wingL'), + }, + skeleton_info={ + 0: dict(link=('eyeL', 'head'), id=0, color=[255, 255, 255]), + 1: dict(link=('eyeR', 'head'), id=1, color=[255, 255, 255]), + 2: dict(link=('neck', 'head'), id=2, color=[255, 255, 255]), + 3: dict(link=('thorax', 'neck'), id=3, color=[255, 255, 255]), + 4: dict(link=('abdomen', 'thorax'), id=4, color=[255, 255, 255]), + 5: dict(link=('forelegR2', 'forelegR1'), id=5, color=[255, 255, 255]), + 6: dict(link=('forelegR3', 'forelegR2'), id=6, color=[255, 255, 255]), + 7: dict(link=('forelegR4', 'forelegR3'), id=7, color=[255, 255, 255]), + 8: dict(link=('midlegR2', 'midlegR1'), id=8, color=[255, 255, 255]), + 9: dict(link=('midlegR3', 'midlegR2'), id=9, color=[255, 255, 255]), + 10: dict(link=('midlegR4', 'midlegR3'), id=10, color=[255, 255, 255]), + 11: + dict(link=('hindlegR2', 'hindlegR1'), id=11, color=[255, 255, 255]), + 12: + dict(link=('hindlegR3', 'hindlegR2'), id=12, color=[255, 255, 255]), + 13: + dict(link=('hindlegR4', 'hindlegR3'), id=13, color=[255, 255, 255]), + 14: + dict(link=('forelegL2', 'forelegL1'), id=14, color=[255, 255, 255]), + 15: + dict(link=('forelegL3', 'forelegL2'), id=15, color=[255, 255, 255]), + 16: + dict(link=('forelegL4', 'forelegL3'), id=16, color=[255, 255, 255]), + 17: dict(link=('midlegL2', 'midlegL1'), id=17, color=[255, 255, 255]), + 18: dict(link=('midlegL3', 'midlegL2'), id=18, color=[255, 255, 255]), + 19: dict(link=('midlegL4', 'midlegL3'), id=19, color=[255, 255, 255]), + 20: + dict(link=('hindlegL2', 'hindlegL1'), id=20, color=[255, 255, 255]), + 21: + dict(link=('hindlegL3', 'hindlegL2'), id=21, color=[255, 255, 255]), + 22: + dict(link=('hindlegL4', 'hindlegL3'), id=22, color=[255, 255, 255]), + 23: dict(link=('wingL', 'neck'), id=23, color=[255, 255, 255]), + 24: dict(link=('wingR', 'neck'), id=24, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 32, + sigmas=[]) diff --git a/main/_base_/datasets/freihand2d.py b/main/_base_/datasets/freihand2d.py new file mode 100644 index 0000000000000000000000000000000000000000..8b960d10f3538801531dbccdd67aeac6e73ac572 --- /dev/null +++ b/main/_base_/datasets/freihand2d.py @@ -0,0 +1,144 @@ +dataset_info = dict( + dataset_name='freihand', + paper_info=dict( + author='Zimmermann, Christian and Ceylan, Duygu and ' + 'Yang, Jimei and Russell, Bryan and ' + 'Argus, Max and Brox, Thomas', + title='Freihand: A dataset for markerless capture of hand pose ' + 'and shape from single rgb images', + container='Proceedings of the IEEE International ' + 'Conference on Computer Vision', + year='2019', + homepage='https://lmb.informatik.uni-freiburg.de/projects/freihand/', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 
13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/main/_base_/datasets/h36m.py b/main/_base_/datasets/h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..00a719d8b19f9ff3c5ef98476d73216055bf9186 --- /dev/null +++ b/main/_base_/datasets/h36m.py @@ -0,0 +1,152 @@ +dataset_info = dict( + dataset_name='h36m', + paper_info=dict( + author='Ionescu, Catalin and Papava, Dragos and ' + 'Olaru, Vlad and Sminchisescu, Cristian', + title='Human3.6M: Large Scale Datasets and Predictive ' + 'Methods for 3D Human Sensing in Natural Environments', + container='IEEE Transactions on Pattern Analysis and ' + 'Machine Intelligence', + year='2014', + homepage='http://vision.imar.ro/human3.6m/description.php', + ), + keypoint_info={ + 0: + dict(name='root', id=0, color=[51, 153, 255], type='lower', swap=''), + 1: + dict( + name='right_hip', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 2: + dict( + name='right_knee', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 3: + dict( + name='right_foot', + id=3, + color=[255, 128, 0], + type='lower', + swap='left_foot'), + 4: + dict( + name='left_hip', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 5: + dict( + name='left_knee', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 6: + dict( + name='left_foot', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_foot'), + 7: + dict(name='spine', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict(name='thorax', id=8, color=[51, 153, 255], type='upper', swap=''), + 9: + dict( + name='neck_base', + id=9, + color=[51, 153, 255], + type='upper', + swap=''), + 10: + dict(name='head', id=10, color=[51, 153, 255], type='upper', swap=''), + 11: + dict( + name='left_shoulder', + id=11, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 12: + dict( + name='left_elbow', + id=12, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 13: + dict( + name='left_wrist', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 14: + dict( + name='right_shoulder', + id=14, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 15: + dict( + name='right_elbow', + id=15, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 16: + dict( + name='right_wrist', + id=16, + color=[255, 128, 0], + type='upper', + swap='left_wrist') + }, + skeleton_info={ + 0: + dict(link=('root', 'left_hip'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_hip', 'left_knee'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_knee', 'left_foot'), id=2, color=[0, 255, 0]), + 3: + dict(link=('root', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_hip', 'right_knee'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_knee', 'right_foot'), id=5, color=[255, 128, 0]), + 6: + dict(link=('root', 'spine'), id=6, color=[51, 153, 255]), + 7: + dict(link=('spine', 'thorax'), id=7, color=[51, 153, 255]), 
+ 8: + dict(link=('thorax', 'neck_base'), id=8, color=[51, 153, 255]), + 9: + dict(link=('neck_base', 'head'), id=9, color=[51, 153, 255]), + 10: + dict(link=('thorax', 'left_shoulder'), id=10, color=[0, 255, 0]), + 11: + dict(link=('left_shoulder', 'left_elbow'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_elbow', 'left_wrist'), id=12, color=[0, 255, 0]), + 13: + dict(link=('thorax', 'right_shoulder'), id=13, color=[255, 128, 0]), + 14: + dict( + link=('right_shoulder', 'right_elbow'), id=14, color=[255, 128, + 0]), + 15: + dict(link=('right_elbow', 'right_wrist'), id=15, color=[255, 128, 0]) + }, + joint_weights=[1.] * 17, + sigmas=[], + stats_info=dict(bbox_center=(528., 427.), bbox_scale=400.)) diff --git a/main/_base_/datasets/halpe.py b/main/_base_/datasets/halpe.py new file mode 100644 index 0000000000000000000000000000000000000000..1385fe81dc2190684f2142449c0f288f2cb74c1a --- /dev/null +++ b/main/_base_/datasets/halpe.py @@ -0,0 +1,1157 @@ +dataset_info = dict( + dataset_name='halpe', + paper_info=dict( + author='Li, Yong-Lu and Xu, Liang and Liu, Xinpeng and Huang, Xijie' + ' and Xu, Yue and Wang, Shiyi and Fang, Hao-Shu' + ' and Ma, Ze and Chen, Mingyang and Lu, Cewu', + title='PaStaNet: Toward Human Activity Knowledge Engine', + container='CVPR', + year='2020', + homepage='https://github.com/Fang-Haoshu/Halpe-FullBody/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 17: + dict(name='head', id=17, color=[255, 128, 0], type='upper', swap=''), + 18: + dict(name='neck', id=18, color=[255, 128, 0], type='upper', swap=''), + 19: + dict(name='hip', id=19, color=[255, 128, 0], type='lower', swap=''), + 20: + dict( + name='left_big_toe', + id=20, + color=[255, 128, 0], + type='lower', + swap='right_big_toe'), + 21: + dict( + name='right_big_toe', + id=21, + color=[255, 128, 0], + type='lower', + 
swap='left_big_toe'), + 22: + dict( + name='left_small_toe', + id=22, + color=[255, 128, 0], + type='lower', + swap='right_small_toe'), + 23: + dict( + name='right_small_toe', + id=23, + color=[255, 128, 0], + type='lower', + swap='left_small_toe'), + 24: + dict( + name='left_heel', + id=24, + color=[255, 128, 0], + type='lower', + swap='right_heel'), + 25: + dict( + name='right_heel', + id=25, + color=[255, 128, 0], + type='lower', + swap='left_heel'), + 26: + dict( + name='face-0', + id=26, + color=[255, 255, 255], + type='', + swap='face-16'), + 27: + dict( + name='face-1', + id=27, + color=[255, 255, 255], + type='', + swap='face-15'), + 28: + dict( + name='face-2', + id=28, + color=[255, 255, 255], + type='', + swap='face-14'), + 29: + dict( + name='face-3', + id=29, + color=[255, 255, 255], + type='', + swap='face-13'), + 30: + dict( + name='face-4', + id=30, + color=[255, 255, 255], + type='', + swap='face-12'), + 31: + dict( + name='face-5', + id=31, + color=[255, 255, 255], + type='', + swap='face-11'), + 32: + dict( + name='face-6', + id=32, + color=[255, 255, 255], + type='', + swap='face-10'), + 33: + dict( + name='face-7', + id=33, + color=[255, 255, 255], + type='', + swap='face-9'), + 34: + dict(name='face-8', id=34, color=[255, 255, 255], type='', swap=''), + 35: + dict( + name='face-9', + id=35, + color=[255, 255, 255], + type='', + swap='face-7'), + 36: + dict( + name='face-10', + id=36, + color=[255, 255, 255], + type='', + swap='face-6'), + 37: + dict( + name='face-11', + id=37, + color=[255, 255, 255], + type='', + swap='face-5'), + 38: + dict( + name='face-12', + id=38, + color=[255, 255, 255], + type='', + swap='face-4'), + 39: + dict( + name='face-13', + id=39, + color=[255, 255, 255], + type='', + swap='face-3'), + 40: + dict( + name='face-14', + id=40, + color=[255, 255, 255], + type='', + swap='face-2'), + 41: + dict( + name='face-15', + id=41, + color=[255, 255, 255], + type='', + swap='face-1'), + 42: + dict( + name='face-16', + id=42, + color=[255, 255, 255], + type='', + swap='face-0'), + 43: + dict( + name='face-17', + id=43, + color=[255, 255, 255], + type='', + swap='face-26'), + 44: + dict( + name='face-18', + id=44, + color=[255, 255, 255], + type='', + swap='face-25'), + 45: + dict( + name='face-19', + id=45, + color=[255, 255, 255], + type='', + swap='face-24'), + 46: + dict( + name='face-20', + id=46, + color=[255, 255, 255], + type='', + swap='face-23'), + 47: + dict( + name='face-21', + id=47, + color=[255, 255, 255], + type='', + swap='face-22'), + 48: + dict( + name='face-22', + id=48, + color=[255, 255, 255], + type='', + swap='face-21'), + 49: + dict( + name='face-23', + id=49, + color=[255, 255, 255], + type='', + swap='face-20'), + 50: + dict( + name='face-24', + id=50, + color=[255, 255, 255], + type='', + swap='face-19'), + 51: + dict( + name='face-25', + id=51, + color=[255, 255, 255], + type='', + swap='face-18'), + 52: + dict( + name='face-26', + id=52, + color=[255, 255, 255], + type='', + swap='face-17'), + 53: + dict(name='face-27', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict(name='face-28', id=54, color=[255, 255, 255], type='', swap=''), + 55: + dict(name='face-29', id=55, color=[255, 255, 255], type='', swap=''), + 56: + dict(name='face-30', id=56, color=[255, 255, 255], type='', swap=''), + 57: + dict( + name='face-31', + id=57, + color=[255, 255, 255], + type='', + swap='face-35'), + 58: + dict( + name='face-32', + id=58, + color=[255, 255, 255], + type='', + swap='face-34'), + 59: + dict(name='face-33', id=59, 
color=[255, 255, 255], type='', swap=''), + 60: + dict( + name='face-34', + id=60, + color=[255, 255, 255], + type='', + swap='face-32'), + 61: + dict( + name='face-35', + id=61, + color=[255, 255, 255], + type='', + swap='face-31'), + 62: + dict( + name='face-36', + id=62, + color=[255, 255, 255], + type='', + swap='face-45'), + 63: + dict( + name='face-37', + id=63, + color=[255, 255, 255], + type='', + swap='face-44'), + 64: + dict( + name='face-38', + id=64, + color=[255, 255, 255], + type='', + swap='face-43'), + 65: + dict( + name='face-39', + id=65, + color=[255, 255, 255], + type='', + swap='face-42'), + 66: + dict( + name='face-40', + id=66, + color=[255, 255, 255], + type='', + swap='face-47'), + 67: + dict( + name='face-41', + id=67, + color=[255, 255, 255], + type='', + swap='face-46'), + 68: + dict( + name='face-42', + id=68, + color=[255, 255, 255], + type='', + swap='face-39'), + 69: + dict( + name='face-43', + id=69, + color=[255, 255, 255], + type='', + swap='face-38'), + 70: + dict( + name='face-44', + id=70, + color=[255, 255, 255], + type='', + swap='face-37'), + 71: + dict( + name='face-45', + id=71, + color=[255, 255, 255], + type='', + swap='face-36'), + 72: + dict( + name='face-46', + id=72, + color=[255, 255, 255], + type='', + swap='face-41'), + 73: + dict( + name='face-47', + id=73, + color=[255, 255, 255], + type='', + swap='face-40'), + 74: + dict( + name='face-48', + id=74, + color=[255, 255, 255], + type='', + swap='face-54'), + 75: + dict( + name='face-49', + id=75, + color=[255, 255, 255], + type='', + swap='face-53'), + 76: + dict( + name='face-50', + id=76, + color=[255, 255, 255], + type='', + swap='face-52'), + 77: + dict(name='face-51', id=77, color=[255, 255, 255], type='', swap=''), + 78: + dict( + name='face-52', + id=78, + color=[255, 255, 255], + type='', + swap='face-50'), + 79: + dict( + name='face-53', + id=79, + color=[255, 255, 255], + type='', + swap='face-49'), + 80: + dict( + name='face-54', + id=80, + color=[255, 255, 255], + type='', + swap='face-48'), + 81: + dict( + name='face-55', + id=81, + color=[255, 255, 255], + type='', + swap='face-59'), + 82: + dict( + name='face-56', + id=82, + color=[255, 255, 255], + type='', + swap='face-58'), + 83: + dict(name='face-57', id=83, color=[255, 255, 255], type='', swap=''), + 84: + dict( + name='face-58', + id=84, + color=[255, 255, 255], + type='', + swap='face-56'), + 85: + dict( + name='face-59', + id=85, + color=[255, 255, 255], + type='', + swap='face-55'), + 86: + dict( + name='face-60', + id=86, + color=[255, 255, 255], + type='', + swap='face-64'), + 87: + dict( + name='face-61', + id=87, + color=[255, 255, 255], + type='', + swap='face-63'), + 88: + dict(name='face-62', id=88, color=[255, 255, 255], type='', swap=''), + 89: + dict( + name='face-63', + id=89, + color=[255, 255, 255], + type='', + swap='face-61'), + 90: + dict( + name='face-64', + id=90, + color=[255, 255, 255], + type='', + swap='face-60'), + 91: + dict( + name='face-65', + id=91, + color=[255, 255, 255], + type='', + swap='face-67'), + 92: + dict(name='face-66', id=92, color=[255, 255, 255], type='', swap=''), + 93: + dict( + name='face-67', + id=93, + color=[255, 255, 255], + type='', + swap='face-65'), + 94: + dict( + name='left_hand_root', + id=94, + color=[255, 255, 255], + type='', + swap='right_hand_root'), + 95: + dict( + name='left_thumb1', + id=95, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 96: + dict( + name='left_thumb2', + id=96, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 
97: + dict( + name='left_thumb3', + id=97, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 98: + dict( + name='left_thumb4', + id=98, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 99: + dict( + name='left_forefinger1', + id=99, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 100: + dict( + name='left_forefinger2', + id=100, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 101: + dict( + name='left_forefinger3', + id=101, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 102: + dict( + name='left_forefinger4', + id=102, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 103: + dict( + name='left_middle_finger1', + id=103, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 104: + dict( + name='left_middle_finger2', + id=104, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 105: + dict( + name='left_middle_finger3', + id=105, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 106: + dict( + name='left_middle_finger4', + id=106, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 107: + dict( + name='left_ring_finger1', + id=107, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 108: + dict( + name='left_ring_finger2', + id=108, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 109: + dict( + name='left_ring_finger3', + id=109, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 110: + dict( + name='left_ring_finger4', + id=110, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 111: + dict( + name='left_pinky_finger1', + id=111, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 112: + dict( + name='left_pinky_finger2', + id=112, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 113: + dict( + name='left_pinky_finger3', + id=113, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 114: + dict( + name='left_pinky_finger4', + id=114, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 115: + dict( + name='right_hand_root', + id=115, + color=[255, 255, 255], + type='', + swap='left_hand_root'), + 116: + dict( + name='right_thumb1', + id=116, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 117: + dict( + name='right_thumb2', + id=117, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 118: + dict( + name='right_thumb3', + id=118, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 119: + dict( + name='right_thumb4', + id=119, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 120: + dict( + name='right_forefinger1', + id=120, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 121: + dict( + name='right_forefinger2', + id=121, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 122: + dict( + name='right_forefinger3', + id=122, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 123: + dict( + name='right_forefinger4', + id=123, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 124: + dict( + name='right_middle_finger1', + id=124, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 125: + dict( + name='right_middle_finger2', + id=125, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 126: + dict( + name='right_middle_finger3', + id=126, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 127: + dict( + name='right_middle_finger4', + id=127, + color=[102, 178, 255], + 
type='', + swap='left_middle_finger4'), + 128: + dict( + name='right_ring_finger1', + id=128, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 129: + dict( + name='right_ring_finger2', + id=129, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 130: + dict( + name='right_ring_finger3', + id=130, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 131: + dict( + name='right_ring_finger4', + id=131, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 132: + dict( + name='right_pinky_finger1', + id=132, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 133: + dict( + name='right_pinky_finger2', + id=133, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 134: + dict( + name='right_pinky_finger3', + id=134, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 135: + dict( + name='right_pinky_finger4', + id=135, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('left_hip', 'hip'), id=2, color=[0, 255, 0]), + 3: + dict(link=('right_ankle', 'right_knee'), id=3, color=[255, 128, 0]), + 4: + dict(link=('right_knee', 'right_hip'), id=4, color=[255, 128, 0]), + 5: + dict(link=('right_hip', 'hip'), id=5, color=[255, 128, 0]), + 6: + dict(link=('head', 'neck'), id=6, color=[51, 153, 255]), + 7: + dict(link=('neck', 'hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('neck', 'left_shoulder'), id=8, color=[0, 255, 0]), + 9: + dict(link=('left_shoulder', 'left_elbow'), id=9, color=[0, 255, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('neck', 'right_shoulder'), id=11, color=[255, 128, 0]), + 12: + dict( + link=('right_shoulder', 'right_elbow'), id=12, color=[255, 128, + 0]), + 13: + dict(link=('right_elbow', 'right_wrist'), id=13, color=[255, 128, 0]), + 14: + dict(link=('left_eye', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('nose', 'left_eye'), id=15, color=[51, 153, 255]), + 16: + dict(link=('nose', 'right_eye'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_eye', 'left_ear'), id=17, color=[51, 153, 255]), + 18: + dict(link=('right_eye', 'right_ear'), id=18, color=[51, 153, 255]), + 19: + dict(link=('left_ear', 'left_shoulder'), id=19, color=[51, 153, 255]), + 20: + dict( + link=('right_ear', 'right_shoulder'), id=20, color=[51, 153, 255]), + 21: + dict(link=('left_ankle', 'left_big_toe'), id=21, color=[0, 255, 0]), + 22: + dict(link=('left_ankle', 'left_small_toe'), id=22, color=[0, 255, 0]), + 23: + dict(link=('left_ankle', 'left_heel'), id=23, color=[0, 255, 0]), + 24: + dict( + link=('right_ankle', 'right_big_toe'), id=24, color=[255, 128, 0]), + 25: + dict( + link=('right_ankle', 'right_small_toe'), + id=25, + color=[255, 128, 0]), + 26: + dict(link=('right_ankle', 'right_heel'), id=26, color=[255, 128, 0]), + 27: + dict(link=('left_wrist', 'left_thumb1'), id=27, color=[255, 128, 0]), + 28: + dict(link=('left_thumb1', 'left_thumb2'), id=28, color=[255, 128, 0]), + 29: + dict(link=('left_thumb2', 'left_thumb3'), id=29, color=[255, 128, 0]), + 30: + dict(link=('left_thumb3', 'left_thumb4'), id=30, color=[255, 128, 0]), + 31: + dict( + link=('left_wrist', 'left_forefinger1'), + id=31, + color=[255, 153, 255]), + 32: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=32, + color=[255, 153, 255]), + 33: + dict( + 
link=('left_forefinger2', 'left_forefinger3'), + id=33, + color=[255, 153, 255]), + 34: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=34, + color=[255, 153, 255]), + 35: + dict( + link=('left_wrist', 'left_middle_finger1'), + id=35, + color=[102, 178, 255]), + 36: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=36, + color=[102, 178, 255]), + 37: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=37, + color=[102, 178, 255]), + 38: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=38, + color=[102, 178, 255]), + 39: + dict( + link=('left_wrist', 'left_ring_finger1'), + id=39, + color=[255, 51, 51]), + 40: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=40, + color=[255, 51, 51]), + 41: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=41, + color=[255, 51, 51]), + 42: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=42, + color=[255, 51, 51]), + 43: + dict( + link=('left_wrist', 'left_pinky_finger1'), + id=43, + color=[0, 255, 0]), + 44: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=44, + color=[0, 255, 0]), + 45: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=45, + color=[0, 255, 0]), + 46: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=46, + color=[0, 255, 0]), + 47: + dict(link=('right_wrist', 'right_thumb1'), id=47, color=[255, 128, 0]), + 48: + dict( + link=('right_thumb1', 'right_thumb2'), id=48, color=[255, 128, 0]), + 49: + dict( + link=('right_thumb2', 'right_thumb3'), id=49, color=[255, 128, 0]), + 50: + dict( + link=('right_thumb3', 'right_thumb4'), id=50, color=[255, 128, 0]), + 51: + dict( + link=('right_wrist', 'right_forefinger1'), + id=51, + color=[255, 153, 255]), + 52: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=52, + color=[255, 153, 255]), + 53: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=53, + color=[255, 153, 255]), + 54: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=54, + color=[255, 153, 255]), + 55: + dict( + link=('right_wrist', 'right_middle_finger1'), + id=55, + color=[102, 178, 255]), + 56: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=56, + color=[102, 178, 255]), + 57: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=57, + color=[102, 178, 255]), + 58: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=58, + color=[102, 178, 255]), + 59: + dict( + link=('right_wrist', 'right_ring_finger1'), + id=59, + color=[255, 51, 51]), + 60: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=60, + color=[255, 51, 51]), + 61: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=61, + color=[255, 51, 51]), + 62: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=62, + color=[255, 51, 51]), + 63: + dict( + link=('right_wrist', 'right_pinky_finger1'), + id=63, + color=[0, 255, 0]), + 64: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=64, + color=[0, 255, 0]), + 65: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=65, + color=[0, 255, 0]), + 66: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=66, + color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 136, + + # 'https://github.com/Fang-Haoshu/Halpe-FullBody/blob/master/' + # 'HalpeCOCOAPI/PythonAPI/halpecocotools/cocoeval.py#L245' + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089, 0.08, 0.08, 0.08, + 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.015, + 0.015, 0.015, 0.015, 0.015, 0.015, 0.015 + ]) diff --git a/main/_base_/datasets/horse10.py b/main/_base_/datasets/horse10.py new file mode 100644 index 0000000000000000000000000000000000000000..a485bf191bc151b0d76e48f3e55eb8e2dda6c506 --- /dev/null +++ b/main/_base_/datasets/horse10.py @@ -0,0 +1,201 @@ +dataset_info = dict( + dataset_name='horse10', + paper_info=dict( + author='Mathis, Alexander and Biasi, Thomas and ' + 'Schneider, Steffen and ' + 'Yuksekgonul, Mert and Rogers, Byron and ' + 'Bethge, Matthias and ' + 'Mathis, Mackenzie W', + title='Pretraining boosts out-of-domain robustness ' + 'for pose estimation', + container='Proceedings of the IEEE/CVF Winter Conference on ' + 'Applications of Computer Vision', + year='2021', + homepage='http://www.mackenziemathislab.org/horse10', + ), + keypoint_info={ + 0: + dict(name='Nose', id=0, color=[255, 153, 255], type='upper', swap=''), + 1: + dict(name='Eye', id=1, color=[255, 153, 255], type='upper', swap=''), + 2: + dict( + name='Nearknee', + id=2, + color=[255, 102, 255], + type='upper', + swap=''), + 3: + dict( + name='Nearfrontfetlock', + id=3, + color=[255, 102, 255], + type='upper', + swap=''), + 4: + dict( + name='Nearfrontfoot', + id=4, + color=[255, 102, 255], + type='upper', + swap=''), + 5: + dict( + name='Offknee', id=5, color=[255, 102, 255], type='upper', + swap=''), + 6: + dict( + name='Offfrontfetlock', + id=6, + color=[255, 102, 255], + type='upper', + swap=''), + 7: + dict( + name='Offfrontfoot', + id=7, + color=[255, 102, 255], + type='upper', + swap=''), + 8: + dict( + name='Shoulder', + id=8, + color=[255, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='Midshoulder', + id=9, + color=[255, 153, 255], + type='upper', + swap=''), + 10: + dict( + name='Elbow', id=10, color=[255, 153, 255], type='upper', swap=''), + 11: + dict( + name='Girth', id=11, color=[255, 153, 255], type='upper', swap=''), + 12: + dict( + name='Wither', id=12, color=[255, 153, 255], type='upper', + swap=''), + 13: + dict( + name='Nearhindhock', + id=13, + color=[255, 51, 255], + type='lower', + swap=''), + 14: + dict( + name='Nearhindfetlock', + id=14, + color=[255, 51, 255], + type='lower', + swap=''), + 15: + dict( + name='Nearhindfoot', + id=15, + color=[255, 51, 255], + type='lower', + swap=''), + 16: + dict(name='Hip', id=16, color=[255, 153, 255], type='lower', swap=''), + 17: + dict( + name='Stifle', id=17, color=[255, 153, 255], type='lower', + swap=''), + 18: + dict( 
+ name='Offhindhock', + id=18, + color=[255, 51, 255], + type='lower', + swap=''), + 19: + dict( + name='Offhindfetlock', + id=19, + color=[255, 51, 255], + type='lower', + swap=''), + 20: + dict( + name='Offhindfoot', + id=20, + color=[255, 51, 255], + type='lower', + swap=''), + 21: + dict( + name='Ischium', + id=21, + color=[255, 153, 255], + type='lower', + swap='') + }, + skeleton_info={ + 0: + dict(link=('Nose', 'Eye'), id=0, color=[255, 153, 255]), + 1: + dict(link=('Eye', 'Wither'), id=1, color=[255, 153, 255]), + 2: + dict(link=('Wither', 'Hip'), id=2, color=[255, 153, 255]), + 3: + dict(link=('Hip', 'Ischium'), id=3, color=[255, 153, 255]), + 4: + dict(link=('Ischium', 'Stifle'), id=4, color=[255, 153, 255]), + 5: + dict(link=('Stifle', 'Girth'), id=5, color=[255, 153, 255]), + 6: + dict(link=('Girth', 'Elbow'), id=6, color=[255, 153, 255]), + 7: + dict(link=('Elbow', 'Shoulder'), id=7, color=[255, 153, 255]), + 8: + dict(link=('Shoulder', 'Midshoulder'), id=8, color=[255, 153, 255]), + 9: + dict(link=('Midshoulder', 'Wither'), id=9, color=[255, 153, 255]), + 10: + dict( + link=('Nearknee', 'Nearfrontfetlock'), + id=10, + color=[255, 102, 255]), + 11: + dict( + link=('Nearfrontfetlock', 'Nearfrontfoot'), + id=11, + color=[255, 102, 255]), + 12: + dict( + link=('Offknee', 'Offfrontfetlock'), id=12, color=[255, 102, 255]), + 13: + dict( + link=('Offfrontfetlock', 'Offfrontfoot'), + id=13, + color=[255, 102, 255]), + 14: + dict( + link=('Nearhindhock', 'Nearhindfetlock'), + id=14, + color=[255, 51, 255]), + 15: + dict( + link=('Nearhindfetlock', 'Nearhindfoot'), + id=15, + color=[255, 51, 255]), + 16: + dict( + link=('Offhindhock', 'Offhindfetlock'), + id=16, + color=[255, 51, 255]), + 17: + dict( + link=('Offhindfetlock', 'Offhindfoot'), + id=17, + color=[255, 51, 255]) + }, + joint_weights=[1.] 
* 22, + sigmas=[]) diff --git a/main/_base_/datasets/interhand2d.py b/main/_base_/datasets/interhand2d.py new file mode 100644 index 0000000000000000000000000000000000000000..0134f07de5bf536eaffbf71155a7e6eb33b24f0a --- /dev/null +++ b/main/_base_/datasets/interhand2d.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='interhand2d', + paper_info=dict( + author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and ' + 'Shiratori, Takaaki and Lee, Kyoung Mu', + title='InterHand2.6M: A dataset and baseline for 3D ' + 'interacting hand pose estimation from a single RGB image', + container='arXiv', + year='2020', + homepage='https://mks0601.github.io/InterHand2.6M/', + ), + keypoint_info={ + 0: + dict(name='thumb4', id=0, color=[255, 128, 0], type='', swap=''), + 1: + dict(name='thumb3', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb1', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict( + name='forefinger4', id=4, color=[255, 153, 255], type='', swap=''), + 5: + dict( + name='forefinger3', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger1', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='middle_finger4', + id=8, + color=[102, 178, 255], + type='', + swap=''), + 9: + dict( + name='middle_finger3', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger1', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='ring_finger4', id=12, color=[255, 51, 51], type='', swap=''), + 13: + dict( + name='ring_finger3', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger1', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict(name='pinky_finger4', id=16, color=[0, 255, 0], type='', swap=''), + 17: + dict(name='pinky_finger3', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger1', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='wrist', id=20, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + 
dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/main/_base_/datasets/interhand3d.py b/main/_base_/datasets/interhand3d.py new file mode 100644 index 0000000000000000000000000000000000000000..e2bd8121c281c741ec9b980c7570ebef8a632993 --- /dev/null +++ b/main/_base_/datasets/interhand3d.py @@ -0,0 +1,487 @@ +dataset_info = dict( + dataset_name='interhand3d', + paper_info=dict( + author='Moon, Gyeongsik and Yu, Shoou-I and Wen, He and ' + 'Shiratori, Takaaki and Lee, Kyoung Mu', + title='InterHand2.6M: A dataset and baseline for 3D ' + 'interacting hand pose estimation from a single RGB image', + container='arXiv', + year='2020', + homepage='https://mks0601.github.io/InterHand2.6M/', + ), + keypoint_info={ + 0: + dict( + name='right_thumb4', + id=0, + color=[255, 128, 0], + type='', + swap='left_thumb4'), + 1: + dict( + name='right_thumb3', + id=1, + color=[255, 128, 0], + type='', + swap='left_thumb3'), + 2: + dict( + name='right_thumb2', + id=2, + color=[255, 128, 0], + type='', + swap='left_thumb2'), + 3: + dict( + name='right_thumb1', + id=3, + color=[255, 128, 0], + type='', + swap='left_thumb1'), + 4: + dict( + name='right_forefinger4', + id=4, + color=[255, 153, 255], + type='', + swap='left_forefinger4'), + 5: + dict( + name='right_forefinger3', + id=5, + color=[255, 153, 255], + type='', + swap='left_forefinger3'), + 6: + dict( + name='right_forefinger2', + id=6, + color=[255, 153, 255], + type='', + swap='left_forefinger2'), + 7: + dict( + name='right_forefinger1', + id=7, + color=[255, 153, 255], + type='', + swap='left_forefinger1'), + 8: + dict( + name='right_middle_finger4', + id=8, + color=[102, 178, 255], + type='', + swap='left_middle_finger4'), + 9: + dict( + name='right_middle_finger3', + id=9, + color=[102, 178, 255], + type='', + swap='left_middle_finger3'), + 10: + dict( + name='right_middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap='left_middle_finger2'), + 11: + dict( + name='right_middle_finger1', + id=11, + color=[102, 178, 255], + type='', + swap='left_middle_finger1'), + 12: + dict( + name='right_ring_finger4', + id=12, + color=[255, 51, 51], + type='', + swap='left_ring_finger4'), + 13: + dict( + name='right_ring_finger3', + id=13, + color=[255, 51, 51], + type='', + swap='left_ring_finger3'), + 14: + dict( + name='right_ring_finger2', + id=14, + color=[255, 51, 51], + type='', + swap='left_ring_finger2'), + 15: + dict( + name='right_ring_finger1', + id=15, + color=[255, 51, 51], + type='', + swap='left_ring_finger1'), + 16: + dict( + name='right_pinky_finger4', + id=16, + color=[0, 255, 0], + type='', + swap='left_pinky_finger4'), + 17: + dict( + name='right_pinky_finger3', + id=17, + color=[0, 255, 0], + type='', + swap='left_pinky_finger3'), + 18: + dict( + name='right_pinky_finger2', + id=18, + color=[0, 255, 0], + type='', + swap='left_pinky_finger2'), + 19: + dict( + name='right_pinky_finger1', + id=19, + color=[0, 255, 0], + type='', + swap='left_pinky_finger1'), + 20: + dict( + name='right_wrist', + id=20, + color=[255, 255, 255], + type='', + 
swap='left_wrist'), + 21: + dict( + name='left_thumb4', + id=21, + color=[255, 128, 0], + type='', + swap='right_thumb4'), + 22: + dict( + name='left_thumb3', + id=22, + color=[255, 128, 0], + type='', + swap='right_thumb3'), + 23: + dict( + name='left_thumb2', + id=23, + color=[255, 128, 0], + type='', + swap='right_thumb2'), + 24: + dict( + name='left_thumb1', + id=24, + color=[255, 128, 0], + type='', + swap='right_thumb1'), + 25: + dict( + name='left_forefinger4', + id=25, + color=[255, 153, 255], + type='', + swap='right_forefinger4'), + 26: + dict( + name='left_forefinger3', + id=26, + color=[255, 153, 255], + type='', + swap='right_forefinger3'), + 27: + dict( + name='left_forefinger2', + id=27, + color=[255, 153, 255], + type='', + swap='right_forefinger2'), + 28: + dict( + name='left_forefinger1', + id=28, + color=[255, 153, 255], + type='', + swap='right_forefinger1'), + 29: + dict( + name='left_middle_finger4', + id=29, + color=[102, 178, 255], + type='', + swap='right_middle_finger4'), + 30: + dict( + name='left_middle_finger3', + id=30, + color=[102, 178, 255], + type='', + swap='right_middle_finger3'), + 31: + dict( + name='left_middle_finger2', + id=31, + color=[102, 178, 255], + type='', + swap='right_middle_finger2'), + 32: + dict( + name='left_middle_finger1', + id=32, + color=[102, 178, 255], + type='', + swap='right_middle_finger1'), + 33: + dict( + name='left_ring_finger4', + id=33, + color=[255, 51, 51], + type='', + swap='right_ring_finger4'), + 34: + dict( + name='left_ring_finger3', + id=34, + color=[255, 51, 51], + type='', + swap='right_ring_finger3'), + 35: + dict( + name='left_ring_finger2', + id=35, + color=[255, 51, 51], + type='', + swap='right_ring_finger2'), + 36: + dict( + name='left_ring_finger1', + id=36, + color=[255, 51, 51], + type='', + swap='right_ring_finger1'), + 37: + dict( + name='left_pinky_finger4', + id=37, + color=[0, 255, 0], + type='', + swap='right_pinky_finger4'), + 38: + dict( + name='left_pinky_finger3', + id=38, + color=[0, 255, 0], + type='', + swap='right_pinky_finger3'), + 39: + dict( + name='left_pinky_finger2', + id=39, + color=[0, 255, 0], + type='', + swap='right_pinky_finger2'), + 40: + dict( + name='left_pinky_finger1', + id=40, + color=[0, 255, 0], + type='', + swap='right_pinky_finger1'), + 41: + dict( + name='left_wrist', + id=41, + color=[255, 255, 255], + type='', + swap='right_wrist'), + }, + skeleton_info={ + 0: + dict(link=('right_wrist', 'right_thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_thumb1', 'right_thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_thumb2', 'right_thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_thumb3', 'right_thumb4'), id=3, color=[255, 128, 0]), + 4: + dict( + link=('right_wrist', 'right_forefinger1'), + id=4, + color=[255, 153, 255]), + 5: + dict( + link=('right_forefinger1', 'right_forefinger2'), + id=5, + color=[255, 153, 255]), + 6: + dict( + link=('right_forefinger2', 'right_forefinger3'), + id=6, + color=[255, 153, 255]), + 7: + dict( + link=('right_forefinger3', 'right_forefinger4'), + id=7, + color=[255, 153, 255]), + 8: + dict( + link=('right_wrist', 'right_middle_finger1'), + id=8, + color=[102, 178, 255]), + 9: + dict( + link=('right_middle_finger1', 'right_middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('right_middle_finger2', 'right_middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('right_middle_finger3', 'right_middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict( + 
link=('right_wrist', 'right_ring_finger1'), + id=12, + color=[255, 51, 51]), + 13: + dict( + link=('right_ring_finger1', 'right_ring_finger2'), + id=13, + color=[255, 51, 51]), + 14: + dict( + link=('right_ring_finger2', 'right_ring_finger3'), + id=14, + color=[255, 51, 51]), + 15: + dict( + link=('right_ring_finger3', 'right_ring_finger4'), + id=15, + color=[255, 51, 51]), + 16: + dict( + link=('right_wrist', 'right_pinky_finger1'), + id=16, + color=[0, 255, 0]), + 17: + dict( + link=('right_pinky_finger1', 'right_pinky_finger2'), + id=17, + color=[0, 255, 0]), + 18: + dict( + link=('right_pinky_finger2', 'right_pinky_finger3'), + id=18, + color=[0, 255, 0]), + 19: + dict( + link=('right_pinky_finger3', 'right_pinky_finger4'), + id=19, + color=[0, 255, 0]), + 20: + dict(link=('left_wrist', 'left_thumb1'), id=20, color=[255, 128, 0]), + 21: + dict(link=('left_thumb1', 'left_thumb2'), id=21, color=[255, 128, 0]), + 22: + dict(link=('left_thumb2', 'left_thumb3'), id=22, color=[255, 128, 0]), + 23: + dict(link=('left_thumb3', 'left_thumb4'), id=23, color=[255, 128, 0]), + 24: + dict( + link=('left_wrist', 'left_forefinger1'), + id=24, + color=[255, 153, 255]), + 25: + dict( + link=('left_forefinger1', 'left_forefinger2'), + id=25, + color=[255, 153, 255]), + 26: + dict( + link=('left_forefinger2', 'left_forefinger3'), + id=26, + color=[255, 153, 255]), + 27: + dict( + link=('left_forefinger3', 'left_forefinger4'), + id=27, + color=[255, 153, 255]), + 28: + dict( + link=('left_wrist', 'left_middle_finger1'), + id=28, + color=[102, 178, 255]), + 29: + dict( + link=('left_middle_finger1', 'left_middle_finger2'), + id=29, + color=[102, 178, 255]), + 30: + dict( + link=('left_middle_finger2', 'left_middle_finger3'), + id=30, + color=[102, 178, 255]), + 31: + dict( + link=('left_middle_finger3', 'left_middle_finger4'), + id=31, + color=[102, 178, 255]), + 32: + dict( + link=('left_wrist', 'left_ring_finger1'), + id=32, + color=[255, 51, 51]), + 33: + dict( + link=('left_ring_finger1', 'left_ring_finger2'), + id=33, + color=[255, 51, 51]), + 34: + dict( + link=('left_ring_finger2', 'left_ring_finger3'), + id=34, + color=[255, 51, 51]), + 35: + dict( + link=('left_ring_finger3', 'left_ring_finger4'), + id=35, + color=[255, 51, 51]), + 36: + dict( + link=('left_wrist', 'left_pinky_finger1'), + id=36, + color=[0, 255, 0]), + 37: + dict( + link=('left_pinky_finger1', 'left_pinky_finger2'), + id=37, + color=[0, 255, 0]), + 38: + dict( + link=('left_pinky_finger2', 'left_pinky_finger3'), + id=38, + color=[0, 255, 0]), + 39: + dict( + link=('left_pinky_finger3', 'left_pinky_finger4'), + id=39, + color=[0, 255, 0]), + }, + joint_weights=[1.] * 42, + sigmas=[]) diff --git a/main/_base_/datasets/jhmdb.py b/main/_base_/datasets/jhmdb.py new file mode 100644 index 0000000000000000000000000000000000000000..1b37488498a2bade1fa6f2ff6532fcd219071803 --- /dev/null +++ b/main/_base_/datasets/jhmdb.py @@ -0,0 +1,129 @@ +dataset_info = dict( + dataset_name='jhmdb', + paper_info=dict( + author='H. Jhuang and J. Gall and S. Zuffi and ' + 'C. Schmid and M. J. Black', + title='Towards understanding action recognition', + container='International Conf. 
on Computer Vision (ICCV)', + year='2013', + homepage='http://jhmdb.is.tue.mpg.de/dataset', + ), + keypoint_info={ + 0: + dict(name='neck', id=0, color=[255, 128, 0], type='upper', swap=''), + 1: + dict(name='belly', id=1, color=[255, 128, 0], type='upper', swap=''), + 2: + dict(name='head', id=2, color=[255, 128, 0], type='upper', swap=''), + 3: + dict( + name='right_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='left_shoulder'), + 4: + dict( + name='left_shoulder', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 5: + dict( + name='right_hip', + id=5, + color=[0, 255, 0], + type='lower', + swap='left_hip'), + 6: + dict( + name='left_hip', + id=6, + color=[51, 153, 255], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_elbow', + id=7, + color=[51, 153, 255], + type='upper', + swap='left_elbow'), + 8: + dict( + name='left_elbow', + id=8, + color=[51, 153, 255], + type='upper', + swap='right_elbow'), + 9: + dict( + name='right_knee', + id=9, + color=[51, 153, 255], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_knee', + id=10, + color=[255, 128, 0], + type='lower', + swap='right_knee'), + 11: + dict( + name='right_wrist', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 12: + dict( + name='left_wrist', + id=12, + color=[255, 128, 0], + type='upper', + swap='right_wrist'), + 13: + dict( + name='right_ankle', + id=13, + color=[0, 255, 0], + type='lower', + swap='left_ankle'), + 14: + dict( + name='left_ankle', + id=14, + color=[0, 255, 0], + type='lower', + swap='right_ankle') + }, + skeleton_info={ + 0: dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: dict(link=('right_hip', 'belly'), id=2, color=[255, 128, 0]), + 3: dict(link=('belly', 'left_hip'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: dict(link=('belly', 'neck'), id=6, color=[51, 153, 255]), + 7: dict(link=('neck', 'head'), id=7, color=[51, 153, 255]), + 8: dict(link=('neck', 'right_shoulder'), id=8, color=[255, 128, 0]), + 9: dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('right_elbow', 'right_wrist'), id=10, color=[255, 128, 0]), + 11: dict(link=('neck', 'left_shoulder'), id=11, color=[0, 255, 0]), + 12: + dict(link=('left_shoulder', 'left_elbow'), id=12, color=[0, 255, 0]), + 13: dict(link=('left_elbow', 'left_wrist'), id=13, color=[0, 255, 0]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.2, 1.2, 1.5, 1.5, 1.5, 1.5 + ], + # Adapted from COCO dataset. 
+ sigmas=[ + 0.025, 0.107, 0.025, 0.079, 0.079, 0.107, 0.107, 0.072, 0.072, 0.087, + 0.087, 0.062, 0.062, 0.089, 0.089 + ]) diff --git a/main/_base_/datasets/locust.py b/main/_base_/datasets/locust.py new file mode 100644 index 0000000000000000000000000000000000000000..db3fa15aa060b5806faae7a21f65460f77be2745 --- /dev/null +++ b/main/_base_/datasets/locust.py @@ -0,0 +1,263 @@ +dataset_info = dict( + dataset_name='locust', + paper_info=dict( + author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and ' + 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and ' + 'Couzin, Iain D', + title='DeepPoseKit, a software toolkit for fast and robust ' + 'animal pose estimation using deep learning', + container='Elife', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='head', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='neck', id=1, color=[255, 255, 255], type='', swap=''), + 2: + dict(name='thorax', id=2, color=[255, 255, 255], type='', swap=''), + 3: + dict(name='abdomen1', id=3, color=[255, 255, 255], type='', swap=''), + 4: + dict(name='abdomen2', id=4, color=[255, 255, 255], type='', swap=''), + 5: + dict( + name='anttipL', + id=5, + color=[255, 255, 255], + type='', + swap='anttipR'), + 6: + dict( + name='antbaseL', + id=6, + color=[255, 255, 255], + type='', + swap='antbaseR'), + 7: + dict(name='eyeL', id=7, color=[255, 255, 255], type='', swap='eyeR'), + 8: + dict( + name='forelegL1', + id=8, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 9: + dict( + name='forelegL2', + id=9, + color=[255, 255, 255], + type='', + swap='forelegR2'), + 10: + dict( + name='forelegL3', + id=10, + color=[255, 255, 255], + type='', + swap='forelegR3'), + 11: + dict( + name='forelegL4', + id=11, + color=[255, 255, 255], + type='', + swap='forelegR4'), + 12: + dict( + name='midlegL1', + id=12, + color=[255, 255, 255], + type='', + swap='midlegR1'), + 13: + dict( + name='midlegL2', + id=13, + color=[255, 255, 255], + type='', + swap='midlegR2'), + 14: + dict( + name='midlegL3', + id=14, + color=[255, 255, 255], + type='', + swap='midlegR3'), + 15: + dict( + name='midlegL4', + id=15, + color=[255, 255, 255], + type='', + swap='midlegR4'), + 16: + dict( + name='hindlegL1', + id=16, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 17: + dict( + name='hindlegL2', + id=17, + color=[255, 255, 255], + type='', + swap='hindlegR2'), + 18: + dict( + name='hindlegL3', + id=18, + color=[255, 255, 255], + type='', + swap='hindlegR3'), + 19: + dict( + name='hindlegL4', + id=19, + color=[255, 255, 255], + type='', + swap='hindlegR4'), + 20: + dict( + name='anttipR', + id=20, + color=[255, 255, 255], + type='', + swap='anttipL'), + 21: + dict( + name='antbaseR', + id=21, + color=[255, 255, 255], + type='', + swap='antbaseL'), + 22: + dict(name='eyeR', id=22, color=[255, 255, 255], type='', swap='eyeL'), + 23: + dict( + name='forelegR1', + id=23, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 24: + dict( + name='forelegR2', + id=24, + color=[255, 255, 255], + type='', + swap='forelegL2'), + 25: + dict( + name='forelegR3', + id=25, + color=[255, 255, 255], + type='', + swap='forelegL3'), + 26: + dict( + name='forelegR4', + id=26, + color=[255, 255, 255], + type='', + swap='forelegL4'), + 27: + dict( + name='midlegR1', + id=27, + color=[255, 255, 255], + type='', + swap='midlegL1'), + 28: + dict( + name='midlegR2', + id=28, + color=[255, 255, 255], + type='', + swap='midlegL2'), + 29: + dict( + name='midlegR3', 
+ id=29, + color=[255, 255, 255], + type='', + swap='midlegL3'), + 30: + dict( + name='midlegR4', + id=30, + color=[255, 255, 255], + type='', + swap='midlegL4'), + 31: + dict( + name='hindlegR1', + id=31, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 32: + dict( + name='hindlegR2', + id=32, + color=[255, 255, 255], + type='', + swap='hindlegL2'), + 33: + dict( + name='hindlegR3', + id=33, + color=[255, 255, 255], + type='', + swap='hindlegL3'), + 34: + dict( + name='hindlegR4', + id=34, + color=[255, 255, 255], + type='', + swap='hindlegL4') + }, + skeleton_info={ + 0: dict(link=('neck', 'head'), id=0, color=[255, 255, 255]), + 1: dict(link=('thorax', 'neck'), id=1, color=[255, 255, 255]), + 2: dict(link=('abdomen1', 'thorax'), id=2, color=[255, 255, 255]), + 3: dict(link=('abdomen2', 'abdomen1'), id=3, color=[255, 255, 255]), + 4: dict(link=('antbaseL', 'anttipL'), id=4, color=[255, 255, 255]), + 5: dict(link=('eyeL', 'antbaseL'), id=5, color=[255, 255, 255]), + 6: dict(link=('forelegL2', 'forelegL1'), id=6, color=[255, 255, 255]), + 7: dict(link=('forelegL3', 'forelegL2'), id=7, color=[255, 255, 255]), + 8: dict(link=('forelegL4', 'forelegL3'), id=8, color=[255, 255, 255]), + 9: dict(link=('midlegL2', 'midlegL1'), id=9, color=[255, 255, 255]), + 10: dict(link=('midlegL3', 'midlegL2'), id=10, color=[255, 255, 255]), + 11: dict(link=('midlegL4', 'midlegL3'), id=11, color=[255, 255, 255]), + 12: + dict(link=('hindlegL2', 'hindlegL1'), id=12, color=[255, 255, 255]), + 13: + dict(link=('hindlegL3', 'hindlegL2'), id=13, color=[255, 255, 255]), + 14: + dict(link=('hindlegL4', 'hindlegL3'), id=14, color=[255, 255, 255]), + 15: dict(link=('antbaseR', 'anttipR'), id=15, color=[255, 255, 255]), + 16: dict(link=('eyeR', 'antbaseR'), id=16, color=[255, 255, 255]), + 17: + dict(link=('forelegR2', 'forelegR1'), id=17, color=[255, 255, 255]), + 18: + dict(link=('forelegR3', 'forelegR2'), id=18, color=[255, 255, 255]), + 19: + dict(link=('forelegR4', 'forelegR3'), id=19, color=[255, 255, 255]), + 20: dict(link=('midlegR2', 'midlegR1'), id=20, color=[255, 255, 255]), + 21: dict(link=('midlegR3', 'midlegR2'), id=21, color=[255, 255, 255]), + 22: dict(link=('midlegR4', 'midlegR3'), id=22, color=[255, 255, 255]), + 23: + dict(link=('hindlegR2', 'hindlegR1'), id=23, color=[255, 255, 255]), + 24: + dict(link=('hindlegR3', 'hindlegR2'), id=24, color=[255, 255, 255]), + 25: + dict(link=('hindlegR4', 'hindlegR3'), id=25, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 35, + sigmas=[]) diff --git a/main/_base_/datasets/macaque.py b/main/_base_/datasets/macaque.py new file mode 100644 index 0000000000000000000000000000000000000000..ea8dac297ea2f0e36dabccccc021d953216a6ac8 --- /dev/null +++ b/main/_base_/datasets/macaque.py @@ -0,0 +1,183 @@ +dataset_info = dict( + dataset_name='macaque', + paper_info=dict( + author='Labuguen, Rollyn and Matsumoto, Jumpei and ' + 'Negrete, Salvador and Nishimaru, Hiroshi and ' + 'Nishijo, Hisao and Takada, Masahiko and ' + 'Go, Yasuhiro and Inoue, Ken-ichi and Shibata, Tomohiro', + title='MacaquePose: A novel "in the wild" macaque monkey pose dataset ' + 'for markerless motion capture', + container='bioRxiv', + year='2020', + homepage='http://www.pri.kyoto-u.ac.jp/datasets/' + 'macaquepose/index.html', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 
255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/main/_base_/datasets/mhp.py b/main/_base_/datasets/mhp.py new file mode 100644 index 0000000000000000000000000000000000000000..e16e37c79cb63c4352c48bb4e45602b8408f534b --- /dev/null +++ b/main/_base_/datasets/mhp.py @@ -0,0 +1,156 @@ +dataset_info = dict( + dataset_name='mhp', + paper_info=dict( + author='Zhao, Jian and Li, Jianshu and Cheng, Yu and ' + 'Sim, Terence and Yan, Shuicheng and Feng, Jiashi', + title='Understanding humans in crowded scenes: ' + 'Deep nested adversarial learning and a ' + 'new benchmark for multi-human parsing', + container='Proceedings of the 26th ACM ' + 'international conference on Multimedia', + year='2018', + homepage='https://lv-mhp.github.io/dataset', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + 
dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/main/_base_/datasets/mpi_inf_3dhp.py b/main/_base_/datasets/mpi_inf_3dhp.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd0a70297b24456ea38566ac205bb585aa47e5d --- /dev/null +++ b/main/_base_/datasets/mpi_inf_3dhp.py @@ -0,0 +1,132 @@ +dataset_info = dict( + dataset_name='mpi_inf_3dhp', + paper_info=dict( + author='Mehta, Dushyant and Rhodin, Helge and Casas, Dan and ' + 'Fua, Pascal and Sotnychenko, Oleksandr and Xu, Weipeng and ' + 'Theobalt, Christian', + title='Monocular 3D Human Pose Estimation In The Wild Using Improved ' + 'CNN Supervision', + container='2017 international conference on 3D vision (3DV)', + year='2017', + homepage='http://gvv.mpi-inf.mpg.de/3dhp-dataset', + ), + keypoint_info={ + 0: + dict( + name='head_top', id=0, color=[51, 153, 255], type='upper', + swap=''), + 1: + dict(name='neck', id=1, color=[51, 153, 255], type='upper', swap=''), + 2: + dict( + name='right_shoulder', + id=2, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='right_wrist', + id=4, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='left_elbow', + id=6, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 7: + dict( + name='left_wrist', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 8: + dict( + name='right_hip', + id=8, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='right_ankle', + id=10, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='left_knee', + id=12, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 13: + dict( + name='left_ankle', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 14: + dict(name='root', id=14, color=[51, 153, 255], type='lower', swap=''), + 15: + dict(name='spine', id=15, color=[51, 153, 255], type='upper', swap=''), + 16: + dict(name='head', id=16, color=[51, 153, 255], type='upper', swap='') + }, + skeleton_info={ + 0: dict(link=('neck', 'right_shoulder'), id=0, color=[255, 128, 0]), + 1: dict( + link=('right_shoulder', 'right_elbow'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_elbow', 'right_wrist'), id=2, color=[255, 128, 0]), + 3: dict(link=('neck',
'left_shoulder'), id=3, color=[0, 255, 0]), + 4: dict(link=('left_shoulder', 'left_elbow'), id=4, color=[0, 255, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: dict(link=('root', 'right_hip'), id=6, color=[255, 128, 0]), + 7: dict(link=('right_hip', 'right_knee'), id=7, color=[255, 128, 0]), + 8: dict(link=('right_knee', 'right_ankle'), id=8, color=[255, 128, 0]), + 9: dict(link=('root', 'left_hip'), id=9, color=[0, 255, 0]), + 10: dict(link=('left_hip', 'left_knee'), id=10, color=[0, 255, 0]), + 11: dict(link=('left_knee', 'left_ankle'), id=11, color=[0, 255, 0]), + 12: dict(link=('head_top', 'head'), id=12, color=[51, 153, 255]), + 13: dict(link=('head', 'neck'), id=13, color=[51, 153, 255]), + 14: dict(link=('neck', 'spine'), id=14, color=[51, 153, 255]), + 15: dict(link=('spine', 'root'), id=15, color=[51, 153, 255]) + }, + joint_weights=[1.] * 17, + sigmas=[]) diff --git a/main/_base_/datasets/mpii.py b/main/_base_/datasets/mpii.py new file mode 100644 index 0000000000000000000000000000000000000000..6c2a491c7b58bc3eaa5c0056d3d7184bdd1d1cc7 --- /dev/null +++ b/main/_base_/datasets/mpii.py @@ -0,0 +1,155 @@ +dataset_info = dict( + dataset_name='mpii', + paper_info=dict( + author='Mykhaylo Andriluka and Leonid Pishchulin and ' + 'Peter Gehler and Schiele, Bernt', + title='2D Human Pose Estimation: New Benchmark and ' + 'State of the Art Analysis', + container='IEEE Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://human-pose.mpi-inf.mpg.de/', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict(name='pelvis', id=6, color=[51, 153, 255], type='lower', swap=''), + 7: + dict(name='thorax', id=7, color=[51, 153, 255], type='upper', swap=''), + 8: + dict( + name='upper_neck', + id=8, + color=[51, 153, 255], + type='upper', + swap=''), + 9: + dict( + name='head_top', id=9, color=[51, 153, 255], type='upper', + swap=''), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='right_elbow', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 12: + dict( + name='right_shoulder', + id=12, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 13: + dict( + name='left_shoulder', + id=13, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 14: + dict( + name='left_elbow', + id=14, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 15: + dict( + name='left_wrist', + id=15, + color=[0, 255, 0], + type='upper', + swap='right_wrist') + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('right_hip', 'pelvis'), id=2, color=[255, 128, 0]), + 3: + dict(link=('pelvis', 'left_hip'), id=3, color=[0, 255, 0]), + 4: + dict(link=('left_hip', 'left_knee'), id=4, color=[0, 255, 0]), + 5: + 
dict(link=('left_knee', 'left_ankle'), id=5, color=[0, 255, 0]), + 6: + dict(link=('pelvis', 'thorax'), id=6, color=[51, 153, 255]), + 7: + dict(link=('thorax', 'upper_neck'), id=7, color=[51, 153, 255]), + 8: + dict(link=('upper_neck', 'head_top'), id=8, color=[51, 153, 255]), + 9: + dict(link=('upper_neck', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict( + link=('right_shoulder', 'right_elbow'), id=10, color=[255, 128, + 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('upper_neck', 'left_shoulder'), id=12, color=[0, 255, 0]), + 13: + dict(link=('left_shoulder', 'left_elbow'), id=13, color=[0, 255, 0]), + 14: + dict(link=('left_elbow', 'left_wrist'), id=14, color=[0, 255, 0]) + }, + joint_weights=[ + 1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5 + ], + # Adapted from COCO dataset. + sigmas=[ + 0.089, 0.083, 0.107, 0.107, 0.083, 0.089, 0.026, 0.026, 0.026, 0.026, + 0.062, 0.072, 0.179, 0.179, 0.072, 0.062 + ]) diff --git a/main/_base_/datasets/mpii_trb.py b/main/_base_/datasets/mpii_trb.py new file mode 100644 index 0000000000000000000000000000000000000000..73940d4b4827f8e08343c3b517360db788e4820d --- /dev/null +++ b/main/_base_/datasets/mpii_trb.py @@ -0,0 +1,380 @@ +dataset_info = dict( + dataset_name='mpii_trb', + paper_info=dict( + author='Duan, Haodong and Lin, Kwan-Yee and Jin, Sheng and ' + 'Liu, Wentao and Qian, Chen and Ouyang, Wanli', + title='TRB: A Novel Triplet Representation for ' + 'Understanding 2D Human Body', + container='Proceedings of the IEEE International ' + 'Conference on Computer Vision', + year='2019', + homepage='https://github.com/kennymckormick/' + 'Triplet-Representation-of-human-Body', + ), + keypoint_info={ + 0: + dict( + name='left_shoulder', + id=0, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 1: + dict( + name='right_shoulder', + id=1, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 2: + dict( + name='left_elbow', + id=2, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 3: + dict( + name='right_elbow', + id=3, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 4: + dict( + name='left_wrist', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 5: + dict( + name='right_wrist', + id=5, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='right_hip', + id=7, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 8: + dict( + name='left_knee', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 9: + dict( + name='right_knee', + id=9, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 10: + dict( + name='left_ankle', + id=10, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 11: + dict( + name='right_ankle', + id=11, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 12: + dict(name='head', id=12, color=[51, 153, 255], type='upper', swap=''), + 13: + dict(name='neck', id=13, color=[51, 153, 255], type='upper', swap=''), + 14: + dict( + name='right_neck', + id=14, + color=[255, 255, 255], + type='upper', + swap='left_neck'), + 15: + dict( + name='left_neck', + id=15, + color=[255, 255, 255], + type='upper', + swap='right_neck'), + 16: + dict( + name='medial_right_shoulder', + id=16, + color=[255, 255, 255], + type='upper', + swap='medial_left_shoulder'), + 17: + dict( + name='lateral_right_shoulder', + id=17, 
+ color=[255, 255, 255], + type='upper', + swap='lateral_left_shoulder'), + 18: + dict( + name='medial_right_bow', + id=18, + color=[255, 255, 255], + type='upper', + swap='medial_left_bow'), + 19: + dict( + name='lateral_right_bow', + id=19, + color=[255, 255, 255], + type='upper', + swap='lateral_left_bow'), + 20: + dict( + name='medial_right_wrist', + id=20, + color=[255, 255, 255], + type='upper', + swap='medial_left_wrist'), + 21: + dict( + name='lateral_right_wrist', + id=21, + color=[255, 255, 255], + type='upper', + swap='lateral_left_wrist'), + 22: + dict( + name='medial_left_shoulder', + id=22, + color=[255, 255, 255], + type='upper', + swap='medial_right_shoulder'), + 23: + dict( + name='lateral_left_shoulder', + id=23, + color=[255, 255, 255], + type='upper', + swap='lateral_right_shoulder'), + 24: + dict( + name='medial_left_bow', + id=24, + color=[255, 255, 255], + type='upper', + swap='medial_right_bow'), + 25: + dict( + name='lateral_left_bow', + id=25, + color=[255, 255, 255], + type='upper', + swap='lateral_right_bow'), + 26: + dict( + name='medial_left_wrist', + id=26, + color=[255, 255, 255], + type='upper', + swap='medial_right_wrist'), + 27: + dict( + name='lateral_left_wrist', + id=27, + color=[255, 255, 255], + type='upper', + swap='lateral_right_wrist'), + 28: + dict( + name='medial_right_hip', + id=28, + color=[255, 255, 255], + type='lower', + swap='medial_left_hip'), + 29: + dict( + name='lateral_right_hip', + id=29, + color=[255, 255, 255], + type='lower', + swap='lateral_left_hip'), + 30: + dict( + name='medial_right_knee', + id=30, + color=[255, 255, 255], + type='lower', + swap='medial_left_knee'), + 31: + dict( + name='lateral_right_knee', + id=31, + color=[255, 255, 255], + type='lower', + swap='lateral_left_knee'), + 32: + dict( + name='medial_right_ankle', + id=32, + color=[255, 255, 255], + type='lower', + swap='medial_left_ankle'), + 33: + dict( + name='lateral_right_ankle', + id=33, + color=[255, 255, 255], + type='lower', + swap='lateral_left_ankle'), + 34: + dict( + name='medial_left_hip', + id=34, + color=[255, 255, 255], + type='lower', + swap='medial_right_hip'), + 35: + dict( + name='lateral_left_hip', + id=35, + color=[255, 255, 255], + type='lower', + swap='lateral_right_hip'), + 36: + dict( + name='medial_left_knee', + id=36, + color=[255, 255, 255], + type='lower', + swap='medial_right_knee'), + 37: + dict( + name='lateral_left_knee', + id=37, + color=[255, 255, 255], + type='lower', + swap='lateral_right_knee'), + 38: + dict( + name='medial_left_ankle', + id=38, + color=[255, 255, 255], + type='lower', + swap='medial_right_ankle'), + 39: + dict( + name='lateral_left_ankle', + id=39, + color=[255, 255, 255], + type='lower', + swap='lateral_right_ankle'), + }, + skeleton_info={ + 0: + dict(link=('head', 'neck'), id=0, color=[51, 153, 255]), + 1: + dict(link=('neck', 'left_shoulder'), id=1, color=[51, 153, 255]), + 2: + dict(link=('neck', 'right_shoulder'), id=2, color=[51, 153, 255]), + 3: + dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: + dict( + link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]), + 5: + dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: + dict(link=('left_shoulder', 'left_hip'), id=7, color=[51, 153, 255]), + 8: + dict(link=('right_shoulder', 'right_hip'), id=8, color=[51, 153, 255]), + 9: + dict(link=('left_hip', 'right_hip'), id=9, color=[51, 153, 255]), + 10: + dict(link=('left_hip', 
'left_knee'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_hip', 'right_knee'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_knee', 'left_ankle'), id=12, color=[0, 255, 0]), + 13: + dict(link=('right_knee', 'right_ankle'), id=13, color=[255, 128, 0]), + 14: + dict(link=('right_neck', 'left_neck'), id=14, color=[255, 255, 255]), + 15: + dict( + link=('medial_right_shoulder', 'lateral_right_shoulder'), + id=15, + color=[255, 255, 255]), + 16: + dict( + link=('medial_right_bow', 'lateral_right_bow'), + id=16, + color=[255, 255, 255]), + 17: + dict( + link=('medial_right_wrist', 'lateral_right_wrist'), + id=17, + color=[255, 255, 255]), + 18: + dict( + link=('medial_left_shoulder', 'lateral_left_shoulder'), + id=18, + color=[255, 255, 255]), + 19: + dict( + link=('medial_left_bow', 'lateral_left_bow'), + id=19, + color=[255, 255, 255]), + 20: + dict( + link=('medial_left_wrist', 'lateral_left_wrist'), + id=20, + color=[255, 255, 255]), + 21: + dict( + link=('medial_right_hip', 'lateral_right_hip'), + id=21, + color=[255, 255, 255]), + 22: + dict( + link=('medial_right_knee', 'lateral_right_knee'), + id=22, + color=[255, 255, 255]), + 23: + dict( + link=('medial_right_ankle', 'lateral_right_ankle'), + id=23, + color=[255, 255, 255]), + 24: + dict( + link=('medial_left_hip', 'lateral_left_hip'), + id=24, + color=[255, 255, 255]), + 25: + dict( + link=('medial_left_knee', 'lateral_left_knee'), + id=25, + color=[255, 255, 255]), + 26: + dict( + link=('medial_left_ankle', 'lateral_left_ankle'), + id=26, + color=[255, 255, 255]) + }, + joint_weights=[1.] * 40, + sigmas=[]) diff --git a/main/_base_/datasets/nvgesture.py b/main/_base_/datasets/nvgesture.py new file mode 100644 index 0000000000000000000000000000000000000000..7d5a3df7b9c6ac553ff8eab9428a9a3fb96ef564 --- /dev/null +++ b/main/_base_/datasets/nvgesture.py @@ -0,0 +1,42 @@ +dataset_info = dict( + dataset_name='nvgesture', + paper_info=dict( + author='Pavlo Molchanov and Xiaodong Yang and Shalini Gupta ' + 'and Kihwan Kim and Stephen Tyree and Jan Kautz', + title='Online Detection and Classification of Dynamic Hand Gestures ' + 'with Recurrent 3D Convolutional Neural Networks', + container='Proceedings of the IEEE Conference on ' + 'Computer Vision and Pattern Recognition', + year='2016', + homepage='https://research.nvidia.com/publication/2016-06_online-' + 'detection-and-classification-dynamic-hand-gestures-recurrent-3d', + ), + category_info={ + 0: 'five fingers move right', + 1: 'five fingers move left', + 2: 'five fingers move up', + 3: 'five fingers move down', + 4: 'two fingers move right', + 5: 'two fingers move left', + 6: 'two fingers move up', + 7: 'two fingers move down', + 8: 'click', + 9: 'beckoned', + 10: 'stretch hand', + 11: 'shake hand', + 12: 'one', + 13: 'two', + 14: 'three', + 15: 'lift up', + 16: 'press down', + 17: 'push', + 18: 'shrink', + 19: 'levorotation', + 20: 'dextrorotation', + 21: 'two fingers prod', + 22: 'grab', + 23: 'thumbs up', + 24: 'OK' + }, + flip_pairs=[(0, 1), (4, 5), (19, 20)], + fps=30) diff --git a/main/_base_/datasets/ochuman.py b/main/_base_/datasets/ochuman.py new file mode 100644 index 0000000000000000000000000000000000000000..2ef20838fe583fde133a97e688d30e91ae562746 --- /dev/null +++ b/main/_base_/datasets/ochuman.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='ochuman', + paper_info=dict( + author='Zhang, Song-Hai and Li, Ruilong and Dong, Xin and ' + 'Rosin, Paul and Cai, Zixi and Han, Xi and ' + 'Yang, Dingcheng and Huang, Haozhi and Hu, Shi-Min', + 
title='Pose2seg: Detection free human instance segmentation', + container='Proceedings of the IEEE conference on computer ' + 'vision and pattern recognition', + year='2019', + homepage='https://github.com/liruilong940607/OCHumanApi', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 
1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/main/_base_/datasets/onehand10k.py b/main/_base_/datasets/onehand10k.py new file mode 100644 index 0000000000000000000000000000000000000000..016770f14f3075dfa7d59389524a0c11a4feb802 --- /dev/null +++ b/main/_base_/datasets/onehand10k.py @@ -0,0 +1,142 @@ +dataset_info = dict( + dataset_name='onehand10k', + paper_info=dict( + author='Wang, Yangang and Peng, Cong and Liu, Yebin', + title='Mask-pose cascaded cnn for 2d hand pose estimation ' + 'from single color image', + container='IEEE Transactions on Circuits and Systems ' + 'for Video Technology', + year='2018', + homepage='https://www.yangangwang.com/papers/WANG-MCC-2018-10.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 
'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/main/_base_/datasets/panoptic_body3d.py b/main/_base_/datasets/panoptic_body3d.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b19ac462415a840ca2e0b9e214bdb35d91b5e4 --- /dev/null +++ b/main/_base_/datasets/panoptic_body3d.py @@ -0,0 +1,160 @@ +dataset_info = dict( + dataset_name='panoptic_pose_3d', + paper_info=dict( + author='Joo, Hanbyul and Simon, Tomas and Li, Xulong' + 'and Liu, Hao and Tan, Lei and Gui, Lin and Banerjee, Sean' + 'and Godisart, Timothy and Nabbe, Bart and Matthews, Iain' + 'and Kanade, Takeo and Nobuhara, Shohei and Sheikh, Yaser', + title='Panoptic Studio: A Massively Multiview System ' + 'for Interaction Motion Capture', + container='IEEE Transactions on Pattern Analysis' + ' and Machine Intelligence', + year='2017', + homepage='http://domedb.perception.cs.cmu.edu', + ), + keypoint_info={ + 0: + dict(name='neck', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict(name='nose', id=1, color=[51, 153, 255], type='upper', swap=''), + 2: + dict(name='mid_hip', id=2, color=[0, 255, 0], type='lower', swap=''), + 3: + dict( + name='left_shoulder', + id=3, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 4: + dict( + name='left_elbow', + id=4, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 5: + dict( + name='left_wrist', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 6: + dict( + name='left_hip', + id=6, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 7: + dict( + name='left_knee', + id=7, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 8: + dict( + name='left_ankle', + id=8, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 9: + dict( + name='right_shoulder', + id=9, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 10: + dict( + name='right_elbow', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 11: + dict( + name='right_wrist', + id=11, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='right_knee', + id=13, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 14: + dict( + name='right_ankle', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 15: + dict( + name='left_eye', + id=15, + color=[51, 153, 255], + type='upper', + swap='right_eye'), + 16: + dict( + name='left_ear', + id=16, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 17: + dict( + name='right_eye', + id=17, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 18: + dict( + name='right_ear', + id=18, + color=[51, 153, 255], + type='upper', + swap='left_ear') + }, + skeleton_info={ + 0: dict(link=('nose', 'neck'), id=0, color=[51, 153, 255]), + 1: dict(link=('neck', 'left_shoulder'), id=1, 
color=[0, 255, 0]), + 2: dict(link=('neck', 'right_shoulder'), id=2, color=[255, 128, 0]), + 3: dict(link=('left_shoulder', 'left_elbow'), id=3, color=[0, 255, 0]), + 4: dict( + link=('right_shoulder', 'right_elbow'), id=4, color=[255, 128, 0]), + 5: dict(link=('left_elbow', 'left_wrist'), id=5, color=[0, 255, 0]), + 6: + dict(link=('right_elbow', 'right_wrist'), id=6, color=[255, 128, 0]), + 7: dict(link=('left_ankle', 'left_knee'), id=7, color=[0, 255, 0]), + 8: dict(link=('left_knee', 'left_hip'), id=8, color=[0, 255, 0]), + 9: dict(link=('right_ankle', 'right_knee'), id=9, color=[255, 128, 0]), + 10: dict(link=('right_knee', 'right_hip'), id=10, color=[255, 128, 0]), + 11: dict(link=('mid_hip', 'left_hip'), id=11, color=[0, 255, 0]), + 12: dict(link=('mid_hip', 'right_hip'), id=12, color=[255, 128, 0]), + 13: dict(link=('mid_hip', 'neck'), id=13, color=[51, 153, 255]), + }, + joint_weights=[ + 1.0, 1.0, 1.0, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, 1.5, 1.0, 1.2, + 1.5, 1.0, 1.0, 1.0, 1.0 + ], + sigmas=[ + 0.026, 0.026, 0.107, 0.079, 0.072, 0.062, 0.107, 0.087, 0.089, 0.079, + 0.072, 0.062, 0.107, 0.087, 0.089, 0.025, 0.035, 0.025, 0.035 + ]) diff --git a/main/_base_/datasets/panoptic_hand2d.py b/main/_base_/datasets/panoptic_hand2d.py new file mode 100644 index 0000000000000000000000000000000000000000..7a65731ba87b155beb1b40591fd9acb232c2afc6 --- /dev/null +++ b/main/_base_/datasets/panoptic_hand2d.py @@ -0,0 +1,143 @@ +dataset_info = dict( + dataset_name='panoptic_hand2d', + paper_info=dict( + author='Simon, Tomas and Joo, Hanbyul and ' + 'Matthews, Iain and Sheikh, Yaser', + title='Hand keypoint detection in single images using ' + 'multiview bootstrapping', + container='Proceedings of the IEEE conference on ' + 'Computer Vision and Pattern Recognition', + year='2017', + homepage='http://domedb.perception.cs.cmu.edu/handdb.html', + ), + keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb1', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb2', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb3', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb4', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger1', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger2', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger3', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger4', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger1', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger2', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger3', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger4', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger1', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger2', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger3', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger4', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger1', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger2', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger3', id=19, color=[0, 255, 0], type='', swap=''), + 20: + 
dict(name='pinky_finger4', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] * 21, + sigmas=[]) diff --git a/main/_base_/datasets/posetrack18.py b/main/_base_/datasets/posetrack18.py new file mode 100644 index 0000000000000000000000000000000000000000..5aefd1c97fe083df35ee88bebab4f99134c27971 --- /dev/null +++ b/main/_base_/datasets/posetrack18.py @@ -0,0 +1,176 @@ +dataset_info = dict( + dataset_name='posetrack18', + paper_info=dict( + author='Andriluka, Mykhaylo and Iqbal, Umar and ' + 'Insafutdinov, Eldar and Pishchulin, Leonid and ' + 'Milan, Anton and Gall, Juergen and Schiele, Bernt', + title='Posetrack: A benchmark for human pose estimation and tracking', + container='Proceedings of the IEEE Conference on ' + 'Computer Vision and Pattern Recognition', + year='2018', + homepage='https://posetrack.net/users/download.php', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='head_bottom', + id=1, + color=[51, 153, 255], + type='upper', + swap=''), + 2: + dict( + name='head_top', id=2, color=[51, 153, 255], type='upper', + swap=''), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + 
type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('nose', 'head_bottom'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'head_top'), id=13, color=[51, 153, 255]), + 14: + dict( + link=('head_bottom', 'left_shoulder'), id=14, color=[51, 153, + 255]), + 15: + dict( + link=('head_bottom', 'right_shoulder'), + id=15, + color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/main/_base_/datasets/rhd2d.py b/main/_base_/datasets/rhd2d.py new file mode 100644 index 0000000000000000000000000000000000000000..4631ccd03814155b06687e0b1ba2b83404c837fc --- /dev/null +++ b/main/_base_/datasets/rhd2d.py @@ -0,0 +1,151 @@ +dataset_info = dict( + dataset_name='rhd2d', + paper_info=dict( + author='Christian Zimmermann and Thomas Brox', + title='Learning to Estimate 3D Hand Pose from Single RGB Images', + container='arXiv', + year='2017', + homepage='https://lmb.informatik.uni-freiburg.de/resources/' + 'datasets/RenderedHandposeDataset.en.html', + ), + # In RHD, keypoints 1-4 are the left thumb ordered from tip to palm, and + # the other fingers follow the same tip-to-palm order. Please refer to + # `https://lmb.informatik.uni-freiburg.de/resources/datasets/ + # RenderedHandpose/README` for details of the keypoint definition. + # In COCO-WholeBody-Hand, FreiHand and CMU Panoptic HandDB, however, the + # order is reversed (palm to tip). Pay attention to this if you want to + # combine RHD with other hand datasets to train a single model. + # Also note that 'keypoint_info' does not directly affect the keypoint + # order in the dataset; it is mainly used for visualization and for + # storing the flip_pairs information.
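+ # Illustrative sketch (an assumption, not part of the original file): to
+ # convert RHD's tip-to-palm order into the palm-to-tip convention of the
+ # datasets above, keep the wrist fixed and reverse each finger's four
+ # joints, e.g. with a hypothetical index map
+ #   rhd_to_palm_first = [0] + [i for f in range(5)
+ #                              for i in range(4 * f + 4, 4 * f, -1)]
+ # which yields [0, 4, 3, 2, 1, 8, 7, 6, 5, 12, 11, 10, 9, 16, 15, 14,
+ # 13, 20, 19, 18, 17]. Flip pairs, as noted above, are taken from the
+ # 'swap' fields declared below.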
+ keypoint_info={ + 0: + dict(name='wrist', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='thumb4', id=1, color=[255, 128, 0], type='', swap=''), + 2: + dict(name='thumb3', id=2, color=[255, 128, 0], type='', swap=''), + 3: + dict(name='thumb2', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='thumb1', id=4, color=[255, 128, 0], type='', swap=''), + 5: + dict( + name='forefinger4', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict( + name='forefinger3', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict( + name='forefinger2', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict( + name='forefinger1', id=8, color=[255, 153, 255], type='', swap=''), + 9: + dict( + name='middle_finger4', + id=9, + color=[102, 178, 255], + type='', + swap=''), + 10: + dict( + name='middle_finger3', + id=10, + color=[102, 178, 255], + type='', + swap=''), + 11: + dict( + name='middle_finger2', + id=11, + color=[102, 178, 255], + type='', + swap=''), + 12: + dict( + name='middle_finger1', + id=12, + color=[102, 178, 255], + type='', + swap=''), + 13: + dict( + name='ring_finger4', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict( + name='ring_finger3', id=14, color=[255, 51, 51], type='', swap=''), + 15: + dict( + name='ring_finger2', id=15, color=[255, 51, 51], type='', swap=''), + 16: + dict( + name='ring_finger1', id=16, color=[255, 51, 51], type='', swap=''), + 17: + dict(name='pinky_finger4', id=17, color=[0, 255, 0], type='', swap=''), + 18: + dict(name='pinky_finger3', id=18, color=[0, 255, 0], type='', swap=''), + 19: + dict(name='pinky_finger2', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='pinky_finger1', id=20, color=[0, 255, 0], type='', swap='') + }, + skeleton_info={ + 0: + dict(link=('wrist', 'thumb1'), id=0, color=[255, 128, 0]), + 1: + dict(link=('thumb1', 'thumb2'), id=1, color=[255, 128, 0]), + 2: + dict(link=('thumb2', 'thumb3'), id=2, color=[255, 128, 0]), + 3: + dict(link=('thumb3', 'thumb4'), id=3, color=[255, 128, 0]), + 4: + dict(link=('wrist', 'forefinger1'), id=4, color=[255, 153, 255]), + 5: + dict(link=('forefinger1', 'forefinger2'), id=5, color=[255, 153, 255]), + 6: + dict(link=('forefinger2', 'forefinger3'), id=6, color=[255, 153, 255]), + 7: + dict(link=('forefinger3', 'forefinger4'), id=7, color=[255, 153, 255]), + 8: + dict(link=('wrist', 'middle_finger1'), id=8, color=[102, 178, 255]), + 9: + dict( + link=('middle_finger1', 'middle_finger2'), + id=9, + color=[102, 178, 255]), + 10: + dict( + link=('middle_finger2', 'middle_finger3'), + id=10, + color=[102, 178, 255]), + 11: + dict( + link=('middle_finger3', 'middle_finger4'), + id=11, + color=[102, 178, 255]), + 12: + dict(link=('wrist', 'ring_finger1'), id=12, color=[255, 51, 51]), + 13: + dict( + link=('ring_finger1', 'ring_finger2'), id=13, color=[255, 51, 51]), + 14: + dict( + link=('ring_finger2', 'ring_finger3'), id=14, color=[255, 51, 51]), + 15: + dict( + link=('ring_finger3', 'ring_finger4'), id=15, color=[255, 51, 51]), + 16: + dict(link=('wrist', 'pinky_finger1'), id=16, color=[0, 255, 0]), + 17: + dict( + link=('pinky_finger1', 'pinky_finger2'), id=17, color=[0, 255, 0]), + 18: + dict( + link=('pinky_finger2', 'pinky_finger3'), id=18, color=[0, 255, 0]), + 19: + dict( + link=('pinky_finger3', 'pinky_finger4'), id=19, color=[0, 255, 0]) + }, + joint_weights=[1.] 
* 21, + sigmas=[]) diff --git a/main/_base_/datasets/shelf.py b/main/_base_/datasets/shelf.py new file mode 100644 index 0000000000000000000000000000000000000000..5fe6e42b3b44e3f65947284efd9ffac58d41d43f --- /dev/null +++ b/main/_base_/datasets/shelf.py @@ -0,0 +1,151 @@ +dataset_info = dict( + dataset_name='shelf', + paper_info=dict( + author='Belagiannis, Vasileios and Amin, Sikandar and Andriluka, ' + 'Mykhaylo and Schiele, Bernt and Navab, Nassir and Ilic, Slobodan', + title='3D Pictorial Structures for Multiple Human Pose Estimation', + container='IEEE Computer Society Conference on Computer Vision and ' + 'Pattern Recognition (CVPR)', + year='2014', + homepage='http://campar.in.tum.de/Chair/MultiHumanPose', + ), + keypoint_info={ + 0: + dict( + name='right_ankle', + id=0, + color=[255, 128, 0], + type='lower', + swap='left_ankle'), + 1: + dict( + name='right_knee', + id=1, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 2: + dict( + name='right_hip', + id=2, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 3: + dict( + name='left_hip', + id=3, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 4: + dict( + name='left_knee', + id=4, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 5: + dict( + name='left_ankle', + id=5, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 6: + dict( + name='right_wrist', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 7: + dict( + name='right_elbow', + id=7, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 8: + dict( + name='right_shoulder', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 9: + dict( + name='left_shoulder', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 10: + dict( + name='left_elbow', + id=10, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 11: + dict( + name='left_wrist', + id=11, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 12: + dict( + name='bottom_head', + id=12, + color=[51, 153, 255], + type='upper', + swap=''), + 13: + dict( + name='top_head', + id=13, + color=[51, 153, 255], + type='upper', + swap=''), + }, + skeleton_info={ + 0: + dict(link=('right_ankle', 'right_knee'), id=0, color=[255, 128, 0]), + 1: + dict(link=('right_knee', 'right_hip'), id=1, color=[255, 128, 0]), + 2: + dict(link=('left_hip', 'left_knee'), id=2, color=[0, 255, 0]), + 3: + dict(link=('left_knee', 'left_ankle'), id=3, color=[0, 255, 0]), + 4: + dict(link=('right_hip', 'left_hip'), id=4, color=[51, 153, 255]), + 5: + dict(link=('right_wrist', 'right_elbow'), id=5, color=[255, 128, 0]), + 6: + dict( + link=('right_elbow', 'right_shoulder'), id=6, color=[255, 128, 0]), + 7: + dict(link=('left_shoulder', 'left_elbow'), id=7, color=[0, 255, 0]), + 8: + dict(link=('left_elbow', 'left_wrist'), id=8, color=[0, 255, 0]), + 9: + dict(link=('right_hip', 'right_shoulder'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_hip', 'left_shoulder'), id=10, color=[0, 255, 0]), + 11: + dict( + link=('right_shoulder', 'bottom_head'), id=11, color=[255, 128, + 0]), + 12: + dict(link=('left_shoulder', 'bottom_head'), id=12, color=[0, 255, 0]), + 13: + dict(link=('bottom_head', 'top_head'), id=13, color=[51, 153, 255]), + }, + joint_weights=[ + 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.5, 1.2, 1.0, 1.0, 1.2, 1.5, 1.0, 1.0 + ], + sigmas=[ + 0.089, 0.087, 0.107, 0.107, 0.087, 0.089, 0.062, 0.072, 0.079, 0.079, + 0.072, 0.062, 0.026, 0.026 + ]) diff --git a/main/_base_/datasets/wflw.py 
b/main/_base_/datasets/wflw.py new file mode 100644 index 0000000000000000000000000000000000000000..bed6f56f30f7a2f093e44c5726212e2a0d4659d2 --- /dev/null +++ b/main/_base_/datasets/wflw.py @@ -0,0 +1,582 @@ +dataset_info = dict( + dataset_name='wflw', + paper_info=dict( + author='Wu, Wayne and Qian, Chen and Yang, Shuo and Wang, ' + 'Quan and Cai, Yici and Zhou, Qiang', + title='Look at boundary: A boundary-aware face alignment algorithm', + container='Proceedings of the IEEE conference on computer ' + 'vision and pattern recognition', + year='2018', + homepage='https://wywu.github.io/projects/LAB/WFLW.html', + ), + keypoint_info={ + 0: + dict( + name='kpt-0', id=0, color=[255, 255, 255], type='', swap='kpt-32'), + 1: + dict( + name='kpt-1', id=1, color=[255, 255, 255], type='', swap='kpt-31'), + 2: + dict( + name='kpt-2', id=2, color=[255, 255, 255], type='', swap='kpt-30'), + 3: + dict( + name='kpt-3', id=3, color=[255, 255, 255], type='', swap='kpt-29'), + 4: + dict( + name='kpt-4', id=4, color=[255, 255, 255], type='', swap='kpt-28'), + 5: + dict( + name='kpt-5', id=5, color=[255, 255, 255], type='', swap='kpt-27'), + 6: + dict( + name='kpt-6', id=6, color=[255, 255, 255], type='', swap='kpt-26'), + 7: + dict( + name='kpt-7', id=7, color=[255, 255, 255], type='', swap='kpt-25'), + 8: + dict( + name='kpt-8', id=8, color=[255, 255, 255], type='', swap='kpt-24'), + 9: + dict( + name='kpt-9', id=9, color=[255, 255, 255], type='', swap='kpt-23'), + 10: + dict( + name='kpt-10', + id=10, + color=[255, 255, 255], + type='', + swap='kpt-22'), + 11: + dict( + name='kpt-11', + id=11, + color=[255, 255, 255], + type='', + swap='kpt-21'), + 12: + dict( + name='kpt-12', + id=12, + color=[255, 255, 255], + type='', + swap='kpt-20'), + 13: + dict( + name='kpt-13', + id=13, + color=[255, 255, 255], + type='', + swap='kpt-19'), + 14: + dict( + name='kpt-14', + id=14, + color=[255, 255, 255], + type='', + swap='kpt-18'), + 15: + dict( + name='kpt-15', + id=15, + color=[255, 255, 255], + type='', + swap='kpt-17'), + 16: + dict(name='kpt-16', id=16, color=[255, 255, 255], type='', swap=''), + 17: + dict( + name='kpt-17', + id=17, + color=[255, 255, 255], + type='', + swap='kpt-15'), + 18: + dict( + name='kpt-18', + id=18, + color=[255, 255, 255], + type='', + swap='kpt-14'), + 19: + dict( + name='kpt-19', + id=19, + color=[255, 255, 255], + type='', + swap='kpt-13'), + 20: + dict( + name='kpt-20', + id=20, + color=[255, 255, 255], + type='', + swap='kpt-12'), + 21: + dict( + name='kpt-21', + id=21, + color=[255, 255, 255], + type='', + swap='kpt-11'), + 22: + dict( + name='kpt-22', + id=22, + color=[255, 255, 255], + type='', + swap='kpt-10'), + 23: + dict( + name='kpt-23', id=23, color=[255, 255, 255], type='', + swap='kpt-9'), + 24: + dict( + name='kpt-24', id=24, color=[255, 255, 255], type='', + swap='kpt-8'), + 25: + dict( + name='kpt-25', id=25, color=[255, 255, 255], type='', + swap='kpt-7'), + 26: + dict( + name='kpt-26', id=26, color=[255, 255, 255], type='', + swap='kpt-6'), + 27: + dict( + name='kpt-27', id=27, color=[255, 255, 255], type='', + swap='kpt-5'), + 28: + dict( + name='kpt-28', id=28, color=[255, 255, 255], type='', + swap='kpt-4'), + 29: + dict( + name='kpt-29', id=29, color=[255, 255, 255], type='', + swap='kpt-3'), + 30: + dict( + name='kpt-30', id=30, color=[255, 255, 255], type='', + swap='kpt-2'), + 31: + dict( + name='kpt-31', id=31, color=[255, 255, 255], type='', + swap='kpt-1'), + 32: + dict( + name='kpt-32', id=32, color=[255, 255, 255], type='', + swap='kpt-0'), + 33: + 
dict( + name='kpt-33', + id=33, + color=[255, 255, 255], + type='', + swap='kpt-46'), + 34: + dict( + name='kpt-34', + id=34, + color=[255, 255, 255], + type='', + swap='kpt-45'), + 35: + dict( + name='kpt-35', + id=35, + color=[255, 255, 255], + type='', + swap='kpt-44'), + 36: + dict( + name='kpt-36', + id=36, + color=[255, 255, 255], + type='', + swap='kpt-43'), + 37: + dict( + name='kpt-37', + id=37, + color=[255, 255, 255], + type='', + swap='kpt-42'), + 38: + dict( + name='kpt-38', + id=38, + color=[255, 255, 255], + type='', + swap='kpt-50'), + 39: + dict( + name='kpt-39', + id=39, + color=[255, 255, 255], + type='', + swap='kpt-49'), + 40: + dict( + name='kpt-40', + id=40, + color=[255, 255, 255], + type='', + swap='kpt-48'), + 41: + dict( + name='kpt-41', + id=41, + color=[255, 255, 255], + type='', + swap='kpt-47'), + 42: + dict( + name='kpt-42', + id=42, + color=[255, 255, 255], + type='', + swap='kpt-37'), + 43: + dict( + name='kpt-43', + id=43, + color=[255, 255, 255], + type='', + swap='kpt-36'), + 44: + dict( + name='kpt-44', + id=44, + color=[255, 255, 255], + type='', + swap='kpt-35'), + 45: + dict( + name='kpt-45', + id=45, + color=[255, 255, 255], + type='', + swap='kpt-34'), + 46: + dict( + name='kpt-46', + id=46, + color=[255, 255, 255], + type='', + swap='kpt-33'), + 47: + dict( + name='kpt-47', + id=47, + color=[255, 255, 255], + type='', + swap='kpt-41'), + 48: + dict( + name='kpt-48', + id=48, + color=[255, 255, 255], + type='', + swap='kpt-40'), + 49: + dict( + name='kpt-49', + id=49, + color=[255, 255, 255], + type='', + swap='kpt-39'), + 50: + dict( + name='kpt-50', + id=50, + color=[255, 255, 255], + type='', + swap='kpt-38'), + 51: + dict(name='kpt-51', id=51, color=[255, 255, 255], type='', swap=''), + 52: + dict(name='kpt-52', id=52, color=[255, 255, 255], type='', swap=''), + 53: + dict(name='kpt-53', id=53, color=[255, 255, 255], type='', swap=''), + 54: + dict(name='kpt-54', id=54, color=[255, 255, 255], type='', swap=''), + 55: + dict( + name='kpt-55', + id=55, + color=[255, 255, 255], + type='', + swap='kpt-59'), + 56: + dict( + name='kpt-56', + id=56, + color=[255, 255, 255], + type='', + swap='kpt-58'), + 57: + dict(name='kpt-57', id=57, color=[255, 255, 255], type='', swap=''), + 58: + dict( + name='kpt-58', + id=58, + color=[255, 255, 255], + type='', + swap='kpt-56'), + 59: + dict( + name='kpt-59', + id=59, + color=[255, 255, 255], + type='', + swap='kpt-55'), + 60: + dict( + name='kpt-60', + id=60, + color=[255, 255, 255], + type='', + swap='kpt-72'), + 61: + dict( + name='kpt-61', + id=61, + color=[255, 255, 255], + type='', + swap='kpt-71'), + 62: + dict( + name='kpt-62', + id=62, + color=[255, 255, 255], + type='', + swap='kpt-70'), + 63: + dict( + name='kpt-63', + id=63, + color=[255, 255, 255], + type='', + swap='kpt-69'), + 64: + dict( + name='kpt-64', + id=64, + color=[255, 255, 255], + type='', + swap='kpt-68'), + 65: + dict( + name='kpt-65', + id=65, + color=[255, 255, 255], + type='', + swap='kpt-75'), + 66: + dict( + name='kpt-66', + id=66, + color=[255, 255, 255], + type='', + swap='kpt-74'), + 67: + dict( + name='kpt-67', + id=67, + color=[255, 255, 255], + type='', + swap='kpt-73'), + 68: + dict( + name='kpt-68', + id=68, + color=[255, 255, 255], + type='', + swap='kpt-64'), + 69: + dict( + name='kpt-69', + id=69, + color=[255, 255, 255], + type='', + swap='kpt-63'), + 70: + dict( + name='kpt-70', + id=70, + color=[255, 255, 255], + type='', + swap='kpt-62'), + 71: + dict( + name='kpt-71', + id=71, + color=[255, 255, 255], + type='', 
+ swap='kpt-61'), + 72: + dict( + name='kpt-72', + id=72, + color=[255, 255, 255], + type='', + swap='kpt-60'), + 73: + dict( + name='kpt-73', + id=73, + color=[255, 255, 255], + type='', + swap='kpt-67'), + 74: + dict( + name='kpt-74', + id=74, + color=[255, 255, 255], + type='', + swap='kpt-66'), + 75: + dict( + name='kpt-75', + id=75, + color=[255, 255, 255], + type='', + swap='kpt-65'), + 76: + dict( + name='kpt-76', + id=76, + color=[255, 255, 255], + type='', + swap='kpt-82'), + 77: + dict( + name='kpt-77', + id=77, + color=[255, 255, 255], + type='', + swap='kpt-81'), + 78: + dict( + name='kpt-78', + id=78, + color=[255, 255, 255], + type='', + swap='kpt-80'), + 79: + dict(name='kpt-79', id=79, color=[255, 255, 255], type='', swap=''), + 80: + dict( + name='kpt-80', + id=80, + color=[255, 255, 255], + type='', + swap='kpt-78'), + 81: + dict( + name='kpt-81', + id=81, + color=[255, 255, 255], + type='', + swap='kpt-77'), + 82: + dict( + name='kpt-82', + id=82, + color=[255, 255, 255], + type='', + swap='kpt-76'), + 83: + dict( + name='kpt-83', + id=83, + color=[255, 255, 255], + type='', + swap='kpt-87'), + 84: + dict( + name='kpt-84', + id=84, + color=[255, 255, 255], + type='', + swap='kpt-86'), + 85: + dict(name='kpt-85', id=85, color=[255, 255, 255], type='', swap=''), + 86: + dict( + name='kpt-86', + id=86, + color=[255, 255, 255], + type='', + swap='kpt-84'), + 87: + dict( + name='kpt-87', + id=87, + color=[255, 255, 255], + type='', + swap='kpt-83'), + 88: + dict( + name='kpt-88', + id=88, + color=[255, 255, 255], + type='', + swap='kpt-92'), + 89: + dict( + name='kpt-89', + id=89, + color=[255, 255, 255], + type='', + swap='kpt-91'), + 90: + dict(name='kpt-90', id=90, color=[255, 255, 255], type='', swap=''), + 91: + dict( + name='kpt-91', + id=91, + color=[255, 255, 255], + type='', + swap='kpt-89'), + 92: + dict( + name='kpt-92', + id=92, + color=[255, 255, 255], + type='', + swap='kpt-88'), + 93: + dict( + name='kpt-93', + id=93, + color=[255, 255, 255], + type='', + swap='kpt-95'), + 94: + dict(name='kpt-94', id=94, color=[255, 255, 255], type='', swap=''), + 95: + dict( + name='kpt-95', + id=95, + color=[255, 255, 255], + type='', + swap='kpt-93'), + 96: + dict( + name='kpt-96', + id=96, + color=[255, 255, 255], + type='', + swap='kpt-97'), + 97: + dict( + name='kpt-97', + id=97, + color=[255, 255, 255], + type='', + swap='kpt-96') + }, + skeleton_info={}, + joint_weights=[1.] 
* 98, + sigmas=[]) diff --git a/main/_base_/datasets/zebra.py b/main/_base_/datasets/zebra.py new file mode 100644 index 0000000000000000000000000000000000000000..eac71f796a761bbf87b123f8b7b8b4585df0c525 --- /dev/null +++ b/main/_base_/datasets/zebra.py @@ -0,0 +1,64 @@ +dataset_info = dict( + dataset_name='zebra', + paper_info=dict( + author='Graving, Jacob M and Chae, Daniel and Naik, Hemal and ' + 'Li, Liang and Koger, Benjamin and Costelloe, Blair R and ' + 'Couzin, Iain D', + title='DeepPoseKit, a software toolkit for fast and robust ' + 'animal pose estimation using deep learning', + container='Elife', + year='2019', + homepage='https://github.com/jgraving/DeepPoseKit-Data', + ), + keypoint_info={ + 0: + dict(name='snout', id=0, color=[255, 255, 255], type='', swap=''), + 1: + dict(name='head', id=1, color=[255, 255, 255], type='', swap=''), + 2: + dict(name='neck', id=2, color=[255, 255, 255], type='', swap=''), + 3: + dict( + name='forelegL1', + id=3, + color=[255, 255, 255], + type='', + swap='forelegR1'), + 4: + dict( + name='forelegR1', + id=4, + color=[255, 255, 255], + type='', + swap='forelegL1'), + 5: + dict( + name='hindlegL1', + id=5, + color=[255, 255, 255], + type='', + swap='hindlegR1'), + 6: + dict( + name='hindlegR1', + id=6, + color=[255, 255, 255], + type='', + swap='hindlegL1'), + 7: + dict(name='tailbase', id=7, color=[255, 255, 255], type='', swap=''), + 8: + dict(name='tailtip', id=8, color=[255, 255, 255], type='', swap='') + }, + skeleton_info={ + 0: dict(link=('head', 'snout'), id=0, color=[255, 255, 255]), + 1: dict(link=('neck', 'head'), id=1, color=[255, 255, 255]), + 2: dict(link=('forelegL1', 'neck'), id=2, color=[255, 255, 255]), + 3: dict(link=('forelegR1', 'neck'), id=3, color=[255, 255, 255]), + 4: dict(link=('hindlegL1', 'tailbase'), id=4, color=[255, 255, 255]), + 5: dict(link=('hindlegR1', 'tailbase'), id=5, color=[255, 255, 255]), + 6: dict(link=('tailbase', 'neck'), id=6, color=[255, 255, 255]), + 7: dict(link=('tailtip', 'tailbase'), id=7, color=[255, 255, 255]) + }, + joint_weights=[1.] 
* 9, + sigmas=[]) diff --git a/main/_base_/default_runtime.py b/main/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..62b7ff270aae280268ea528c1fbe99c0052e20e3 --- /dev/null +++ b/main/_base_/default_runtime.py @@ -0,0 +1,20 @@ +checkpoint_config = dict(interval=10) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + # dict(type='PaviLoggerHook') # for internal services + ]) + +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] + +# disable opencv multithreading to avoid system being overloaded +opencv_num_threads = 0 +# set multi-process start method as `fork` to speed up the training +mp_start_method = 'fork' diff --git a/main/_base_/filters/gaussian.py b/main/_base_/filters/gaussian.py new file mode 100644 index 0000000000000000000000000000000000000000..b855f4bde1e1adf71186b3f82f1a3e522fbc53ff --- /dev/null +++ b/main/_base_/filters/gaussian.py @@ -0,0 +1,5 @@ +filter_cfg = dict( + type='GaussianFilter', + window_size=11, + sigma=4.0, +) diff --git a/main/_base_/filters/one_euro.py b/main/_base_/filters/one_euro.py new file mode 100644 index 0000000000000000000000000000000000000000..61f797efdf9fb7a12d40b2d8eee6cb3a5e2e1ea9 --- /dev/null +++ b/main/_base_/filters/one_euro.py @@ -0,0 +1,5 @@ +filter_cfg = dict( + type='OneEuroFilter', + min_cutoff=0.004, + beta=0.7, +) diff --git a/main/_base_/filters/savizky_golay.py b/main/_base_/filters/savizky_golay.py new file mode 100644 index 0000000000000000000000000000000000000000..40302b004460699dfe8522c59c9a3e8cf1c35d83 --- /dev/null +++ b/main/_base_/filters/savizky_golay.py @@ -0,0 +1,5 @@ +filter_cfg = dict( + type='SavizkyGolayFilter', + window_size=11, + polyorder=2, +) diff --git a/main/_base_/filters/smoothnet_h36m.md b/main/_base_/filters/smoothnet_h36m.md new file mode 100644 index 0000000000000000000000000000000000000000..0901be8fe26468b3603ef77412a4feea16a1f239 --- /dev/null +++ b/main/_base_/filters/smoothnet_h36m.md @@ -0,0 +1,45 @@ + + +
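The three plain temporal filters configured above (`GaussianFilter`, `OneEuroFilter`, `SavizkyGolayFilter`) only declare hyperparameters; the filters themselves smooth each keypoint trajectory along the time axis. As a rough, self-contained illustration of what the Gaussian and Savitzky-Golay settings correspond to, the sketch below smooths a synthetic `(T, K, 2)` keypoint array with SciPy using the same `sigma`, `window_size` and `polyorder` values. It is meant for intuition only: the input data is made up and the project's own filter classes are not called.

```python
import numpy as np
from scipy.ndimage import gaussian_filter1d
from scipy.signal import savgol_filter

# Synthetic trajectory: T frames of K 2D keypoints with added jitter.
T, K = 120, 17
rng = np.random.default_rng(0)
clean = np.tile(np.linspace(0.0, 1.0, T)[:, None, None], (1, K, 2))
noisy = clean + 0.02 * rng.standard_normal((T, K, 2))

# Gaussian smoothing along the time axis (cf. gaussian.py: sigma=4.0).
smoothed_gauss = gaussian_filter1d(noisy, sigma=4.0, axis=0)

# Savitzky-Golay smoothing (cf. savizky_golay.py: window_size=11, polyorder=2).
smoothed_savgol = savgol_filter(noisy, window_length=11, polyorder=2, axis=0)

print(smoothed_gauss.shape, smoothed_savgol.shape)  # (120, 17, 2) (120, 17, 2)
```

In practice the same idea is applied to predicted keypoints or pose parameters across video frames rather than to synthetic data.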
+SmoothNet (arXiv'2021) + +```bibtex +@article{zeng2021smoothnet, + title={SmoothNet: A Plug-and-Play Network for Refining Human Poses in Videos}, + author={Zeng, Ailing and Yang, Lei and Ju, Xuan and Li, Jiefeng and Wang, Jianyi and Xu, Qiang}, + journal={arXiv preprint arXiv:2112.13715}, + year={2021} +} +``` + +
+ + + +
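Similarly, `one_euro.py` above configures an adaptive exponential filter (the One-Euro filter of Casiez et al., 2012) with `min_cutoff=0.004` and `beta=0.7`. The project's own `OneEuroFilter` class is not part of this diff, so the snippet below is an independent, minimal implementation of the algorithm using those two values; it assumes a fixed frame interval (30 fps) and a default derivative cutoff of 1.0, and is a sketch for intuition rather than the library's implementation.

```python
import math
import numpy as np

def smoothing_factor(t_e, cutoff):
    # Alpha of the exponential filter for a given sampling period and cutoff.
    r = 2.0 * math.pi * cutoff * t_e
    return r / (r + 1.0)

def exponential_smoothing(a, x, x_prev):
    return a * x + (1.0 - a) * x_prev

class OneEuroFilter:
    """Minimal One-Euro filter for scalar or ndarray signals (element-wise)."""

    def __init__(self, min_cutoff=0.004, beta=0.7, d_cutoff=1.0, fps=30.0):
        self.min_cutoff = min_cutoff
        self.beta = beta
        self.d_cutoff = d_cutoff
        self.t_e = 1.0 / fps      # assumed constant frame interval
        self.x_prev = None
        self.dx_prev = None

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        if self.x_prev is None:   # first frame: nothing to smooth yet
            self.x_prev = x
            self.dx_prev = np.zeros_like(x)
            return x
        # Smooth the derivative of the signal.
        dx = (x - self.x_prev) / self.t_e
        dx_hat = exponential_smoothing(
            smoothing_factor(self.t_e, self.d_cutoff), dx, self.dx_prev)
        # The cutoff adapts to how fast the signal moves: more smoothing when
        # nearly static, less lag when moving quickly.
        cutoff = self.min_cutoff + self.beta * np.abs(dx_hat)
        x_hat = exponential_smoothing(
            smoothing_factor(self.t_e, cutoff), x, self.x_prev)
        self.x_prev, self.dx_prev = x_hat, dx_hat
        return x_hat

# Usage: feed frames in temporal order, e.g. per-frame (K, 2) keypoints.
f = OneEuroFilter(min_cutoff=0.004, beta=0.7, fps=30.0)
smoothed = [f(kps) for kps in np.random.rand(10, 17, 2)]
```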
+Human3.6M (TPAMI'2014) + +```bibtex +@article{h36m_pami, + author = {Ionescu, Catalin and Papava, Dragos and Olaru, Vlad and Sminchisescu, Cristian}, + title = {Human3.6M: Large Scale Datasets and Predictive Methods for 3D Human Sensing in Natural Environments}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, + publisher = {IEEE Computer Society}, + volume = {36}, + number = {7}, + pages = {1325-1339}, + month = {jul}, + year = {2014} +} +``` + +
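The SmoothNet entries that follow are ordinary mmcv config files, read with the same `Config.fromfile` mechanism that `main/config.py` uses, so a filter config can be loaded and inspected directly. A small sketch, assuming it is run from the repository root so the relative path resolves:

```python
from mmcv import Config

# Load one of the SmoothNet filter configs added in this diff.
cfg = Config.fromfile('main/_base_/filters/smoothnet_t32_h36m.py')
filter_cfg = cfg.filter_cfg

print(filter_cfg['type'])         # SmoothNetFilter
print(filter_cfg['window_size'])  # 32
print(filter_cfg['checkpoint'])   # URL of the pretrained smoothing checkpoint
```

Building the actual `SmoothNetFilter` from this dict is left to the pose library's smoothing utilities; only the config inspection is shown here.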
+ +The following SmoothNet model checkpoints are available for pose smoothing. The table shows the performance of [SimpleBaseline3D](https://arxiv.org/abs/1705.03098) on the [Human3.6M](https://ieeexplore.ieee.org/abstract/document/6682899/) dataset without/with the SmoothNet plugin, and compares the SmoothNet models with 4 different window sizes (8, 16, 32 and 64). The metrics are MPJPE (mm), P-MPJPE (mm) and Acceleration Error (mm/frame^2). + +| Arch | Window Size | MPJPE w/o | MPJPE w/ | P-MPJPE w/o | P-MPJPE w/ | Accel Err w/o | Accel Err w/ | ckpt | +| :----------------------------------- | :---------: | :-----------------: | :---------------: | :-------------------: | :-----------------: | :-------------------: | :-----------------: | :-----------------------------------: | +| [smoothnet_ws8](/configs/_base_/filters/smoothnet_t8_h36m.py) | 8 | 54.48 | 53.15 | 42.20 | 41.32 | 19.18 | 1.87 | [ckpt](https://download.openmmlab.com/mmpose/plugin/smoothnet/smoothnet_ws8_h36m.pth) | +| [smoothnet_ws16](/configs/_base_/filters/smoothnet_t16_h36m.py) | 16 | 54.48 | 52.74 | 42.20 | 41.20 | 19.18 | 1.22 | [ckpt](https://download.openmmlab.com/mmpose/plugin/smoothnet/smoothnet_ws16_h36m.pth) | +| [smoothnet_ws32](/configs/_base_/filters/smoothnet_t32_h36m.py) | 32 | 54.48 | 52.47 | 42.20 | 40.84 | 19.18 | 0.99 | [ckpt](https://download.openmmlab.com/mmpose/plugin/smoothnet/smoothnet_ws32_h36m.pth) | +| [smoothnet_ws64](/configs/_base_/filters/smoothnet_t64_h36m.py) | 64 | 54.48 | 53.37 | 42.20 | 40.77 | 19.18 | 0.92 | [ckpt](https://download.openmmlab.com/mmpose/plugin/smoothnet/smoothnet_ws64_h36m.pth) | diff --git a/main/_base_/filters/smoothnet_t16_h36m.py b/main/_base_/filters/smoothnet_t16_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..0cc0c3be924b59056b6b92e1a9f97978cce4a3e2 --- /dev/null +++ b/main/_base_/filters/smoothnet_t16_h36m.py @@ -0,0 +1,13 @@ +# Config for SmoothNet filter trained on Human3.6M data with a window size of +# 16. The model is trained using root-centered keypoint coordinates around the +# pelvis (index:0), thus we set root_index=0 for the filter +filter_cfg = dict( + type='SmoothNetFilter', + window_size=16, + output_size=16, + checkpoint='https://download.openmmlab.com/mmpose/plugin/smoothnet/' + 'smoothnet_ws16_h36m.pth', + hidden_size=512, + res_hidden_size=256, + num_blocks=3, + root_index=0) diff --git a/main/_base_/filters/smoothnet_t32_h36m.py b/main/_base_/filters/smoothnet_t32_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..dae59f3b81e2adceec532079a3849de23772f0eb --- /dev/null +++ b/main/_base_/filters/smoothnet_t32_h36m.py @@ -0,0 +1,13 @@ +# Config for SmoothNet filter trained on Human3.6M data with a window size of +# 32. The model is trained using root-centered keypoint coordinates around the +# pelvis (index:0), thus we set root_index=0 for the filter +filter_cfg = dict( + type='SmoothNetFilter', + window_size=32, + output_size=32, + checkpoint='https://download.openmmlab.com/mmpose/plugin/smoothnet/' + 'smoothnet_ws32_h36m.pth', + hidden_size=512, + res_hidden_size=256, + num_blocks=3, + root_index=0) diff --git a/main/_base_/filters/smoothnet_t64_h36m.py b/main/_base_/filters/smoothnet_t64_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..aef2993272cef9fff1d7f8c882507781064d44b7 --- /dev/null +++ b/main/_base_/filters/smoothnet_t64_h36m.py @@ -0,0 +1,13 @@ +# Config for SmoothNet filter trained on Human3.6M data with a window size of +# 64.
The model is trained using root-centered keypoint coordinates around the +# pelvis (index:0), thus we set root_index=0 for the filter +filter_cfg = dict( + type='SmoothNetFilter', + window_size=64, + output_size=64, + checkpoint='https://download.openmmlab.com/mmpose/plugin/smoothnet/' + 'smoothnet_ws64_h36m.pth', + hidden_size=512, + res_hidden_size=256, + num_blocks=3, + root_index=0) diff --git a/main/_base_/filters/smoothnet_t8_h36m.py b/main/_base_/filters/smoothnet_t8_h36m.py new file mode 100644 index 0000000000000000000000000000000000000000..cadd8865dc75d2247a8b4af6036131963aa4d4a5 --- /dev/null +++ b/main/_base_/filters/smoothnet_t8_h36m.py @@ -0,0 +1,13 @@ +# Config for SmoothNet filter trained on Human3.6M data with a window size of +# 8. The model is trained using root-centered keypoint coordinates around the +# pelvis (index:0), thus we set root_index=0 for the filter +filter_cfg = dict( + type='SmoothNetFilter', + window_size=8, + output_size=8, + checkpoint='https://download.openmmlab.com/mmpose/plugin/smoothnet/' + 'smoothnet_ws8_h36m.pth', + hidden_size=512, + res_hidden_size=256, + num_blocks=3, + root_index=0) diff --git a/main/config.py b/main/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d9a53874b62f35492e5a50034f82420c9595eed3 --- /dev/null +++ b/main/config.py @@ -0,0 +1,65 @@ +import os +import os.path as osp +import sys +import datetime +from mmcv import Config as MMConfig + +class Config: + def get_config_fromfile(self, config_path): + self.config_path = config_path + cfg = MMConfig.fromfile(self.config_path) + self.__dict__.update(dict(cfg)) + + # update dir + self.cur_dir = osp.dirname(os.path.abspath(__file__)) + self.root_dir = osp.join(self.cur_dir, '..') + self.data_dir = osp.join(self.root_dir, 'dataset') + self.human_model_path = osp.join(self.root_dir, 'common', 'utils', 'human_model_files') + + ## add some paths to the system root dir + sys.path.insert(0, osp.join(self.root_dir, 'common')) + + def prepare_dirs(self, exp_name): + time_str = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + self.output_dir = osp.join(self.root_dir, f'{exp_name}_{time_str}') + self.model_dir = osp.join(self.output_dir, 'model_dump') + self.vis_dir = osp.join(self.output_dir, 'vis') + self.log_dir = osp.join(self.output_dir, 'log') + self.code_dir = osp.join(self.output_dir, 'code') + self.result_dir = osp.join(self.output_dir, 'result') + + from utils.dir import make_folder + make_folder(self.model_dir) + make_folder(self.vis_dir) + make_folder(self.log_dir) + make_folder(self.code_dir) + make_folder(self.result_dir) + + ## copy some code to log dir as a backup + copy_files = ['main/train.py', 'main/test.py', 'common/base.py', + 'common/nets', 'main/SMPLer_X.py', + 'data/dataset.py', 'data/MSCOCO/MSCOCO.py', 'data/AGORA/AGORA.py'] + for file in copy_files: + os.system(f'cp -r {self.root_dir}/{file} {self.code_dir}') + + def update_test_config(self, testset, agora_benchmark, shapy_eval_split, pretrained_model_path, use_cache, + eval_on_train=False, vis=False): + self.testset = testset + self.agora_benchmark = agora_benchmark + self.pretrained_model_path = pretrained_model_path + self.shapy_eval_split = shapy_eval_split + self.use_cache = use_cache + self.eval_on_train = eval_on_train + self.vis = vis + + + def update_config(self, num_gpus, pretrained_model_path, output_folder, device): + self.num_gpus = num_gpus + self.pretrained_model_path = pretrained_model_path + self.log_dir = output_folder + self.device = device + + # Save + cfg_save = 
MMConfig(self.__dict__) + +cfg = Config() \ No newline at end of file diff --git a/main/config/config_smpler_x_b32.py b/main/config/config_smpler_x_b32.py new file mode 100644 index 0000000000000000000000000000000000000000..b737e5307a76fbeaf6eefa6e2bc775c52760fab4 --- /dev/null +++ b/main/config/config_smpler_x_b32.py @@ -0,0 +1,112 @@ +import os +import os.path as osp + +# will be update in exp +num_gpus = -1 +exp_name = 'output/exp1/pre_analysis' + +# quick access +save_epoch = 1 +lr = 1e-5 +end_epoch = 10 +train_batch_size = 32 + +syncbn = True +bbox_ratio = 1.2 + +# continue +continue_train = False +start_over = True + +# dataset setting +agora_fix_betas = True +agora_fix_global_orient_transl = True +agora_valid_root_pose = True + +# all +dataset_list = ['Human36M', 'MSCOCO', 'MPII', 'AGORA', 'EHF', 'SynBody', 'GTA_Human2', \ + 'EgoBody_Egocentric', 'EgoBody_Kinect', 'UBody', 'PW3D', 'MuCo', 'PROX'] +trainset_3d = ['MSCOCO','AGORA', 'UBody'] +trainset_2d = ['PW3D', 'MPII', 'Human36M'] +trainset_humandata = ['BEDLAM', 'SPEC', 'GTA_Human2','SynBody', 'PoseTrack', + 'EgoBody_Egocentric', 'PROX', 'CrowdPose', + 'EgoBody_Kinect', 'MPI_INF_3DHP', 'RICH', 'MuCo', 'InstaVariety', + 'Behave', 'UP3D', 'ARCTIC', + 'OCHuman', 'CHI3D', 'RenBody_HiRes', 'MTP', 'HumanSC3D', 'RenBody', + 'FIT3D', 'Talkshow' , 'SSP3D', 'LSPET'] +testset = 'EHF' + +use_cache = True +# downsample +BEDLAM_train_sample_interval = 5 +EgoBody_Kinect_train_sample_interval = 10 +train_sample_interval = 10 # UBody +MPI_INF_3DHP_train_sample_interval = 5 +InstaVariety_train_sample_interval = 10 +RenBody_HiRes_train_sample_interval = 5 +ARCTIC_train_sample_interval = 10 +# RenBody_train_sample_interval = 10 +FIT3D_train_sample_interval = 10 +Talkshow_train_sample_interval = 10 + +# strategy +data_strategy = 'balance' # 'balance' need to define total_data_len +total_data_len = 4500000 + +# model +smplx_loss_weight = 1.0 #2 for agora_model for smplx shape +smplx_pose_weight = 10.0 + +smplx_kps_3d_weight = 100.0 +smplx_kps_2d_weight = 1.0 +net_kps_2d_weight = 1.0 + +agora_benchmark = 'agora_model' # 'agora_model', 'test_only' + +model_type = 'smpler_x_b' +encoder_config_file = 'main/transformer_utils/configs/smpler_x/encoder/body_encoder_base.py' +encoder_pretrained_model_path = 'pretrained_models/vitpose_base.pth' +feat_dim = 768 + + +## =====FIXED ARGS============================================================ +## model setting +upscale = 4 +hand_pos_joint_num = 20 +face_pos_joint_num = 72 +num_task_token = 24 +num_noise_sample = 0 + +## UBody setting +train_sample_interval = 10 +test_sample_interval = 100 +make_same_len = False + +## input, output size +input_img_shape = (512, 384) +input_body_shape = (256, 192) +output_hm_shape = (16, 16, 12) +input_hand_shape = (256, 256) +output_hand_hm_shape = (16, 16, 16) +output_face_hm_shape = (8, 8, 8) +input_face_shape = (192, 192) +focal = (5000, 5000) # virtual focal lengths +princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2) # virtual principal point position +body_3d_size = 2 +hand_3d_size = 0.3 +face_3d_size = 0.3 +camera_3d_size = 2.5 + +## training config +print_iters = 100 +lr_mult = 1 + +## testing config +test_batch_size = 32 + +## others +num_thread = 2 +vis = False + +## directory +output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None diff --git a/main/config/config_smpler_x_h32.py b/main/config/config_smpler_x_h32.py new file mode 100644 index 
0000000000000000000000000000000000000000..2ffd86e9e965f9f2d3fd5efdb98ad2cb83fa81ed --- /dev/null +++ b/main/config/config_smpler_x_h32.py @@ -0,0 +1,111 @@ +import os +import os.path as osp + +# will be update in exp +num_gpus = -1 +exp_name = 'output/exp1/pre_analysis' + +# quick access +save_epoch = 1 +lr = 1e-5 +end_epoch = 10 +train_batch_size = 16 + +syncbn = True +bbox_ratio = 1.2 + +# continue +continue_train = False +start_over = True + +# dataset setting +agora_fix_betas = True +agora_fix_global_orient_transl = True +agora_valid_root_pose = True + +# all +dataset_list = ['Human36M', 'MSCOCO', 'MPII', 'AGORA', 'EHF', 'SynBody', 'GTA_Human2', \ + 'EgoBody_Egocentric', 'EgoBody_Kinect', 'UBody', 'PW3D', 'MuCo', 'PROX'] +trainset_3d = ['MSCOCO','AGORA', 'UBody'] +trainset_2d = ['PW3D', 'MPII', 'Human36M'] +trainset_humandata = ['BEDLAM', 'SPEC', 'GTA_Human2','SynBody', 'PoseTrack', + 'EgoBody_Egocentric', 'PROX', 'CrowdPose', + 'EgoBody_Kinect', 'MPI_INF_3DHP', 'RICH', 'MuCo', 'InstaVariety', + 'Behave', 'UP3D', 'ARCTIC', + 'OCHuman', 'CHI3D', 'RenBody_HiRes', 'MTP', 'HumanSC3D', 'RenBody', + 'FIT3D', 'Talkshow' , 'SSP3D', 'LSPET'] +testset = 'EHF' + +use_cache = True +# downsample +BEDLAM_train_sample_interval = 5 +EgoBody_Kinect_train_sample_interval = 10 +train_sample_interval = 10 # UBody +MPI_INF_3DHP_train_sample_interval = 5 +InstaVariety_train_sample_interval = 10 +RenBody_HiRes_train_sample_interval = 5 +ARCTIC_train_sample_interval = 10 +# RenBody_train_sample_interval = 10 +FIT3D_train_sample_interval = 10 +Talkshow_train_sample_interval = 10 + +# strategy +data_strategy = 'balance' # 'balance' need to define total_data_len +total_data_len = 4500000 + +# model +smplx_loss_weight = 1.0 #2 for agora_model for smplx shape +smplx_pose_weight = 10.0 + +smplx_kps_3d_weight = 100.0 +smplx_kps_2d_weight = 1.0 +net_kps_2d_weight = 1.0 + +agora_benchmark = 'agora_model' # 'agora_model', 'test_only' + +model_type = 'smpler_x_h' +encoder_config_file = 'main/transformer_utils/configs/smpler_x/encoder/body_encoder_huge.py' +encoder_pretrained_model_path = 'pretrained_models/vitpose_huge.pth' +feat_dim = 1280 + +## =====FIXED ARGS============================================================ +## model setting +upscale = 4 +hand_pos_joint_num = 20 +face_pos_joint_num = 72 +num_task_token = 24 +num_noise_sample = 0 + +## UBody setting +train_sample_interval = 10 +test_sample_interval = 100 +make_same_len = False + +## input, output size +input_img_shape = (512, 384) +input_body_shape = (256, 192) +output_hm_shape = (16, 16, 12) +input_hand_shape = (256, 256) +output_hand_hm_shape = (16, 16, 16) +output_face_hm_shape = (8, 8, 8) +input_face_shape = (192, 192) +focal = (5000, 5000) # virtual focal lengths +princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2) # virtual principal point position +body_3d_size = 2 +hand_3d_size = 0.3 +face_3d_size = 0.3 +camera_3d_size = 2.5 + +## training config +print_iters = 100 +lr_mult = 1 + +## testing config +test_batch_size = 32 + +## others +num_thread = 2 +vis = False + +## directory +output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None diff --git a/main/config/config_smpler_x_l32.py b/main/config/config_smpler_x_l32.py new file mode 100644 index 0000000000000000000000000000000000000000..1cfedc0b6b59d17d2b666bfdfabff6c45069456b --- /dev/null +++ b/main/config/config_smpler_x_l32.py @@ -0,0 +1,112 @@ +import os +import os.path as osp + +# will be update in exp +num_gpus = -1 +exp_name = 
'output/exp1/pre_analysis' + +# quick access +save_epoch = 1 +lr = 1e-5 +end_epoch = 10 +train_batch_size = 32 + +syncbn = True +bbox_ratio = 1.2 + +# continue +continue_train = False +start_over = True + +# dataset setting +agora_fix_betas = True +agora_fix_global_orient_transl = True +agora_valid_root_pose = True + +# all +dataset_list = ['Human36M', 'MSCOCO', 'MPII', 'AGORA', 'EHF', 'SynBody', 'GTA_Human2', \ + 'EgoBody_Egocentric', 'EgoBody_Kinect', 'UBody', 'PW3D', 'MuCo', 'PROX'] +trainset_3d = ['MSCOCO','AGORA', 'UBody'] +trainset_2d = ['PW3D', 'MPII', 'Human36M'] +trainset_humandata = ['BEDLAM', 'SPEC', 'GTA_Human2','SynBody', 'PoseTrack', + 'EgoBody_Egocentric', 'PROX', 'CrowdPose', + 'EgoBody_Kinect', 'MPI_INF_3DHP', 'RICH', 'MuCo', 'InstaVariety', + 'Behave', 'UP3D', 'ARCTIC', + 'OCHuman', 'CHI3D', 'RenBody_HiRes', 'MTP', 'HumanSC3D', 'RenBody', + 'FIT3D', 'Talkshow' , 'SSP3D', 'LSPET'] +testset = 'EHF' + +use_cache = True +# downsample +BEDLAM_train_sample_interval = 5 +EgoBody_Kinect_train_sample_interval = 10 +train_sample_interval = 10 # UBody +MPI_INF_3DHP_train_sample_interval = 5 +InstaVariety_train_sample_interval = 10 +RenBody_HiRes_train_sample_interval = 5 +ARCTIC_train_sample_interval = 10 +# RenBody_train_sample_interval = 10 +FIT3D_train_sample_interval = 10 +Talkshow_train_sample_interval = 10 + +# strategy +data_strategy = 'balance' # 'balance' need to define total_data_len +total_data_len = 4500000 + +# model +smplx_loss_weight = 1.0 #2 for agora_model for smplx shape +smplx_pose_weight = 10.0 + +smplx_kps_3d_weight = 100.0 +smplx_kps_2d_weight = 1.0 +net_kps_2d_weight = 1.0 + +agora_benchmark = 'agora_model' # 'agora_model', 'test_only' + +model_type = 'smpler_x_l' +encoder_config_file = 'main/transformer_utils/configs/smpler_x/encoder/body_encoder_large.py' +encoder_pretrained_model_path = 'pretrained_models/vitpose_large.pth' +feat_dim = 1024 + + +## =====FIXED ARGS============================================================ +## model setting +upscale = 4 +hand_pos_joint_num = 20 +face_pos_joint_num = 72 +num_task_token = 24 +num_noise_sample = 0 + +## UBody setting +train_sample_interval = 10 +test_sample_interval = 100 +make_same_len = False + +## input, output size +input_img_shape = (512, 384) +input_body_shape = (256, 192) +output_hm_shape = (16, 16, 12) +input_hand_shape = (256, 256) +output_hand_hm_shape = (16, 16, 16) +output_face_hm_shape = (8, 8, 8) +input_face_shape = (192, 192) +focal = (5000, 5000) # virtual focal lengths +princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2) # virtual principal point position +body_3d_size = 2 +hand_3d_size = 0.3 +face_3d_size = 0.3 +camera_3d_size = 2.5 + +## training config +print_iters = 100 +lr_mult = 1 + +## testing config +test_batch_size = 32 + +## others +num_thread = 2 +vis = False + +## directory +output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None diff --git a/main/config/config_smpler_x_s32.py b/main/config/config_smpler_x_s32.py new file mode 100644 index 0000000000000000000000000000000000000000..090501bef40b1130e733d9567c05dd11b22b9ed1 --- /dev/null +++ b/main/config/config_smpler_x_s32.py @@ -0,0 +1,111 @@ +import os +import os.path as osp + +# will be update in exp +num_gpus = -1 +exp_name = 'output/exp1/pre_analysis' + +# quick access +save_epoch = 1 +lr = 1e-5 +end_epoch = 10 +train_batch_size = 32 + +syncbn = True +bbox_ratio = 1.2 + +# continue +continue_train = False +start_over = True + +# dataset setting +agora_fix_betas = True 
+agora_fix_global_orient_transl = True +agora_valid_root_pose = True + +# all data +dataset_list = ['Human36M', 'MSCOCO', 'MPII', 'AGORA', 'EHF', 'SynBody', 'GTA_Human2', \ + 'EgoBody_Egocentric', 'EgoBody_Kinect', 'UBody', 'PW3D', 'MuCo', 'PROX'] +trainset_3d = ['MSCOCO','AGORA', 'UBody'] +trainset_2d = ['PW3D', 'MPII', 'Human36M'] +trainset_humandata = ['BEDLAM', 'SPEC', 'GTA_Human2','SynBody', 'PoseTrack', + 'EgoBody_Egocentric', 'PROX', 'CrowdPose', + 'EgoBody_Kinect', 'MPI_INF_3DHP', 'RICH', 'MuCo', 'InstaVariety', + 'Behave', 'UP3D', 'ARCTIC', + 'OCHuman', 'CHI3D', 'RenBody_HiRes', 'MTP', 'HumanSC3D', 'RenBody', + 'FIT3D', 'Talkshow' , 'SSP3D', 'LSPET'] +testset = 'EHF' + +use_cache = True +# downsample +BEDLAM_train_sample_interval = 5 +EgoBody_Kinect_train_sample_interval = 10 +train_sample_interval = 10 # UBody +MPI_INF_3DHP_train_sample_interval = 5 +InstaVariety_train_sample_interval = 10 +RenBody_HiRes_train_sample_interval = 5 +ARCTIC_train_sample_interval = 10 +# RenBody_train_sample_interval = 10 +FIT3D_train_sample_interval = 10 +Talkshow_train_sample_interval = 10 + +# strategy +data_strategy = 'balance' # 'balance' need to define total_data_len +total_data_len = 4500000 + +# model +smplx_loss_weight = 1.0 #2 for agora_model for smplx shape +smplx_pose_weight = 10.0 + +smplx_kps_3d_weight = 100.0 +smplx_kps_2d_weight = 1.0 +net_kps_2d_weight = 1.0 + +agora_benchmark = 'agora_model' # 'agora_model', 'test_only' + +model_type = 'smpler_x_s' +encoder_config_file = 'main/transformer_utils/configs/smpler_x/encoder/body_encoder_small.py' +encoder_pretrained_model_path = 'pretrained_models/vitpose_small.pth' +feat_dim = 384 + +## =====FIXED ARGS============================================================ +## model setting +upscale = 4 +hand_pos_joint_num = 20 +face_pos_joint_num = 72 +num_task_token = 24 +num_noise_sample = 0 + +## UBody setting +train_sample_interval = 10 +test_sample_interval = 100 +make_same_len = False + +## input, output size +input_img_shape = (512, 384) +input_body_shape = (256, 192) +output_hm_shape = (16, 16, 12) +input_hand_shape = (256, 256) +output_hand_hm_shape = (16, 16, 16) +output_face_hm_shape = (8, 8, 8) +input_face_shape = (192, 192) +focal = (5000, 5000) # virtual focal lengths +princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2) # virtual principal point position +body_3d_size = 2 +hand_3d_size = 0.3 +face_3d_size = 0.3 +camera_3d_size = 2.5 + +## training config +print_iters = 100 +lr_mult = 1 + +## testing config +test_batch_size = 32 + +## others +num_thread = 2 +vis = False + +## directory +output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None diff --git a/main/inference.py b/main/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a0dc4a161590d1ec3de8c7b528e2f6fea3db6683 --- /dev/null +++ b/main/inference.py @@ -0,0 +1,128 @@ +import os +import sys +import os.path as osp +import argparse +import numpy as np +import torchvision.transforms as transforms +import torch.backends.cudnn as cudnn +import torch +CUR_DIR = osp.dirname(os.path.abspath(__file__)) +sys.path.insert(0, osp.join(CUR_DIR, '..', 'main')) +sys.path.insert(0, osp.join(CUR_DIR , '..', 'common')) +from config import cfg +import cv2 +from tqdm import tqdm +import json +from typing import Literal, Union +from mmdet.apis import init_detector, inference_detector +from utils.inference_utils import process_mmdet_results, non_max_suppression + +class Inferer: + + def __init__(self, pretrained_model, 
num_gpus, output_folder): + self.output_folder = output_folder + self.device = torch.device('cuda') if (num_gpus > 0) else torch.device('cpu') + config_path = osp.join(CUR_DIR, './config', f'config_{pretrained_model}.py') + ckpt_path = osp.join(CUR_DIR, '../pretrained_models', f'{pretrained_model}.pth.tar') + cfg.get_config_fromfile(config_path) + cfg.update_config(num_gpus, ckpt_path, output_folder, self.device) + self.cfg = cfg + cudnn.benchmark = True + + # load model + from base import Demoer + demoer = Demoer() + demoer._make_model() + demoer.model.eval() + self.demoer = demoer + checkpoint_file = osp.join(CUR_DIR, '../pretrained_models/mmdet/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth') + config_file= osp.join(CUR_DIR, '../pretrained_models/mmdet/mmdet_faster_rcnn_r50_fpn_coco.py') + model = init_detector(config_file, checkpoint_file, device=self.device) # or device='cuda:0' + self.model = model + + def infer(self, original_img, iou_thr, frame, multi_person=False, mesh_as_vertices=False): + from utils.preprocessing import process_bbox, generate_patch_image + from utils.vis import render_mesh, save_obj + from utils.human_models import smpl_x + mesh_paths = [] + smplx_paths = [] + # prepare input image + transform = transforms.ToTensor() + vis_img = original_img.copy() + original_img_height, original_img_width = original_img.shape[:2] + + ## mmdet inference + mmdet_results = inference_detector(self.model, original_img) + mmdet_box = process_mmdet_results(mmdet_results, cat_id=0, multi_person=True) + + # save original image if no bbox + if len(mmdet_box[0])<1: + return original_img, [], [] + + if not multi_person: + # only select the largest bbox + num_bbox = 1 + mmdet_box = mmdet_box[0] + else: + # keep bbox by NMS with iou_thr + mmdet_box = non_max_suppression(mmdet_box[0], iou_thr) + num_bbox = len(mmdet_box) + + ## loop all detected bboxes + for bbox_id in range(num_bbox): + mmdet_box_xywh = np.zeros((4)) + mmdet_box_xywh[0] = mmdet_box[bbox_id][0] + mmdet_box_xywh[1] = mmdet_box[bbox_id][1] + mmdet_box_xywh[2] = abs(mmdet_box[bbox_id][2]-mmdet_box[bbox_id][0]) + mmdet_box_xywh[3] = abs(mmdet_box[bbox_id][3]-mmdet_box[bbox_id][1]) + + # skip small bboxes by bbox_thr in pixel + if mmdet_box_xywh[2] < 50 or mmdet_box_xywh[3] < 150: + continue + + bbox = process_bbox(mmdet_box_xywh, original_img_width, original_img_height) + img, img2bb_trans, bb2img_trans = generate_patch_image(original_img, bbox, 1.0, 0.0, False, self.cfg.input_img_shape) + img = transform(img.astype(np.float32))/255 + img = img.to(cfg.device)[None,:,:,:] + inputs = {'img': img} + targets = {} + meta_info = {} + + # mesh recovery + with torch.no_grad(): + out = self.demoer.model(inputs, targets, meta_info, 'test') + mesh = out['smplx_mesh_cam'].detach().cpu().numpy()[0] + + ## save mesh + save_path_mesh = os.path.join(self.output_folder, 'mesh') + os.makedirs(save_path_mesh, exist_ok= True) + obj_path = os.path.join(save_path_mesh, f'{frame:05}_{bbox_id}.obj') + save_obj(mesh, smpl_x.face, obj_path) + mesh_paths.append(obj_path) + ## save single person param + smplx_pred = {} + smplx_pred['global_orient'] = out['smplx_root_pose'].reshape(-1,3).cpu().numpy() + smplx_pred['body_pose'] = out['smplx_body_pose'].reshape(-1,3).cpu().numpy() + smplx_pred['left_hand_pose'] = out['smplx_lhand_pose'].reshape(-1,3).cpu().numpy() + smplx_pred['right_hand_pose'] = out['smplx_rhand_pose'].reshape(-1,3).cpu().numpy() + smplx_pred['jaw_pose'] = out['smplx_jaw_pose'].reshape(-1,3).cpu().numpy() + smplx_pred['leye_pose'] = 
np.zeros((1, 3)) + smplx_pred['reye_pose'] = np.zeros((1, 3)) + smplx_pred['betas'] = out['smplx_shape'].reshape(-1,10).cpu().numpy() + smplx_pred['expression'] = out['smplx_expr'].reshape(-1,10).cpu().numpy() + smplx_pred['transl'] = out['cam_trans'].reshape(-1,3).cpu().numpy() + save_path_smplx = os.path.join(self.output_folder, 'smplx') + os.makedirs(save_path_smplx, exist_ok= True) + + npz_path = os.path.join(save_path_smplx, f'{frame:05}_{bbox_id}.npz') + np.savez(npz_path, **smplx_pred) + smplx_paths.append(npz_path) + + ## render single person mesh + focal = [self.cfg.focal[0] / self.cfg.input_body_shape[1] * bbox[2], self.cfg.focal[1] / self.cfg.input_body_shape[0] * bbox[3]] + princpt = [self.cfg.princpt[0] / self.cfg.input_body_shape[1] * bbox[2] + bbox[0], self.cfg.princpt[1] / self.cfg.input_body_shape[0] * bbox[3] + bbox[1]] + vis_img = render_mesh(vis_img, mesh, smpl_x.face, {'focal': focal, 'princpt': princpt}, + mesh_as_vertices=mesh_as_vertices) + vis_img = vis_img.astype('uint8') + return vis_img, mesh_paths, smplx_paths + diff --git a/main/transformer_utils/.gitignore b/main/transformer_utils/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e792015fe7abb9597efbe787d25d0c4d242ef42b --- /dev/null +++ b/main/transformer_utils/.gitignore @@ -0,0 +1,141 @@ + + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +**/*.pyc + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/en/_build +docs/zh_cn/_build + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# custom +mmpose/.mim +/models +/data +.vscode +.idea +*.pkl +*.pkl.json +*.log.json +*.npy +work_dirs/ +docs/**/topics/ +docs/**/papers/*.md +docs/**/datasets.md +docs/**/modelzoo.md + +!tests/data/**/*.pkl +!tests/data/**/*.pkl.json +!tests/data/**/*.log.json +!tests/data/**/*.pth +!tests/data/**/*.npy + +# Pytorch +*.pth + +*.DS_Store + +# checkpoints +ckpts/ +vis_results +vis_results_poseur +scripts \ No newline at end of file diff --git a/main/transformer_utils/CITATION.cff b/main/transformer_utils/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..067f4ff996793aa8b8a5f39d35f43c0e275cfa64 --- /dev/null +++ b/main/transformer_utils/CITATION.cff @@ -0,0 +1,8 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." 
+authors: + - name: "Poseur Contributors" +title: "Poseur: Direct Human Pose Regression with Transformers" +date-released: 2022-07-21 +url: "https://github.com/aim-uofa/Poseur" +license: 2-clause BSD diff --git a/main/transformer_utils/LICENSE b/main/transformer_utils/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..acab10d9e2ab392bd0839978663306846c56aca3 --- /dev/null +++ b/main/transformer_utils/LICENSE @@ -0,0 +1,677 @@ +Poseur for non-commercial purposes +(For commercial use, contact chhshen@gmail.com for obtaining a commerical license.) + + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
\ No newline at end of file
diff --git a/main/transformer_utils/MANIFEST.in b/main/transformer_utils/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..8a93c252bd38bafddc390bc9ae9b7278e3479246
--- /dev/null
+++ b/main/transformer_utils/MANIFEST.in
@@ -0,0 +1,5 @@
+include requirements/*.txt
+include mmpose/.mim/model-index.yml
+recursive-include mmpose/.mim/configs *.py *.yml
+recursive-include mmpose/.mim/tools *.py *.sh
+recursive-include mmpose/.mim/demo *.py
diff --git a/main/transformer_utils/README.md b/main/transformer_utils/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fe5fd4c4c2a3a081c9a06f23162975078f58c375
--- /dev/null
+++ b/main/transformer_utils/README.md
@@ -0,0 +1,80 @@
+# Poseur: Direct Human Pose Regression with Transformers
+
+
+> [**Poseur: Direct Human Pose Regression with Transformers**](https://arxiv.org/pdf/2201.07412.pdf),
+> Weian Mao\*, Yongtao Ge\*, Chunhua Shen, Zhi Tian, Xinlong Wang, Zhibin Wang, Anton van den Hengel
+> In: European Conference on Computer Vision (ECCV), 2022
+> *arXiv preprint ([arXiv 2201.07412](https://arxiv.org/pdf/2201.07412))*
+> (\* equal contribution)
+
+# Introduction
+This is a preview of Poseur that currently includes Poseur with an R-50 backbone for both training and inference. More models with various backbones will be released soon. This project is built upon [MMPose](https://github.com/open-mmlab/mmpose) at commit ID [eeebc652842a9724259ed345c00112641d8ee06d](https://github.com/open-mmlab/mmpose/commit/eeebc652842a9724259ed345c00112641d8ee06d).
+
+# Installation & Quick Start
+1. Install the following packages:
+```
+pip install easydict einops
+```
+2. Follow the [MMPose instruction](mmpose_README.md) to install the project and set up the datasets (MS-COCO); an optional check of the expected dataset layout is sketched below.
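+
+The configs in this project assume the standard MS-COCO layout under `data/coco` (the same dataset paths used throughout the configs). As a minimal, optional sanity check (not part of the original setup instructions, and assuming `data/coco` as the data root), something like the following can confirm that the expected files are in place:
+```
+# Optional sanity check for the COCO layout assumed by the configs.
+# `data_root` is an assumption; change it if your data lives elsewhere.
+import os.path as osp
+
+data_root = 'data/coco'
+expected = [
+    'annotations/person_keypoints_train2017.json',
+    'annotations/person_keypoints_val2017.json',
+    'train2017',
+    'val2017',
+    'person_detection_results/COCO_val2017_detections_AP_H_56_person.json',
+]
+for rel_path in expected:
+    status = 'OK' if osp.exists(osp.join(data_root, rel_path)) else 'MISSING'
+    print(f'{status}: {rel_path}')
+```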
+
+For training on COCO, run:
+```
+./tools/dist_train.sh \
+configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_res50_coco_256x192.py 8 \
+--work-dir work_dirs/poseur_res50_coco_256x192
+```
+
+For evaluating on COCO, run the following commands:
+```
+wget https://cloudstor.aarnet.edu.au/plus/s/UXr1Dn9w6ja4fM9/download -O poseur_256x192_r50_6dec_coco.pth
+./tools/dist_test.sh configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_res50_coco_256x192.py \
+    poseur_256x192_r50_6dec_coco.pth 4 \
+    --eval mAP \
+    --cfg-options model.filp_fuse_type=\'type2\'
+```
+
+For visualizing results on COCO, run the following commands:
+```
+python demo/top_down_img_demo.py \
+    configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_res50_coco_256x192.py \
+    poseur_256x192_r50_6dec_coco.pth \
+    --img-root tests/data/coco/ --json-file tests/data/coco/test_coco.json \
+    --out-img-root vis_results_poseur
+```
+
+## Models
+### COCO Keypoint Detection Results
+
+Name | AP | AP.5 | AP.75 | download
+--- |:---:|:---:|:---:|:---:
+[poseur_mobilenetv2_coco_256x192](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_mobilenetv2_coco_256x192.py)| 71.9 | 88.9 | 78.6 | [model](https://cloudstor.aarnet.edu.au/plus/s/L198TFFqwWYsSop/download)
+[poseur_mobilenetv2_coco_256x192_12dec](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_mobilenetv2_coco_256x192_12dec.py)| 72.3 | 88.9 | 78.9 | [model](https://cloudstor.aarnet.edu.au/plus/s/sw0II7qSQDjJ88h/download)
+[poseur_res50_coco_256x192](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_res50_coco_256x192.py)| 75.5 | 90.7 | 82.6 | [model](https://cloudstor.aarnet.edu.au/plus/s/UXr1Dn9w6ja4fM9/download)
+[poseur_hrnet_w32_coco_256x192](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_hrnet_w32_coco_256x192.py)| 76.8 | 91.0 | 83.5 | [model](https://cloudstor.aarnet.edu.au/plus/s/xMvCnp5lb2MR7S4/download)
+[poseur_hrnet_w48_coco_384x288](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_hrnet_w48_coco_384x288.py)| 78.7 | 91.6 | 85.1 | [model](https://cloudstor.aarnet.edu.au/plus/s/IGXy98TZlJYerNc/download)
+[poseur_hrformer_tiny_coco_256x192_3dec](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_hrformer_tiny_coco_256x192_3dec.py)| 74.2 | 90.1 | 81.4 | [model](https://cloudstor.aarnet.edu.au/plus/s/CpGYghZQX3mv32i/download)
+[poseur_hrformer_small_coco_256x192_3dec](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_hrformer_small_coco_256x192_3dec.py)| 76.6 | 91.0 | 83.4 | [model](https://cloudstor.aarnet.edu.au/plus/s/rK2s3fdrpeP9k6l/download)
+[poseur_hrformer_big_coco_256x192](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_hrformer_big_coco_256x192.py)| 78.9 | 91.9 | 85.6 | [model](https://cloudstor.aarnet.edu.au/plus/s/34udjbTr9p9Aigo/download)
+[poseur_hrformer_big_coco_384x288](configs/body/2d_kpt_sview_rgb_img/poseur/coco/poseur_hrformer_big_coco_384x288.py)| 79.6 | 92.1 | 85.9 | [model](https://cloudstor.aarnet.edu.au/plus/s/KST3aSAlGd8PJpQ/download)
+
+
+*Disclaimer:*
+
+- Due to updates in MMPose, the results are slightly different from those in our original paper.
+- We use the official HRFormer implementation from [here](https://github.com/HRNet/HRFormer/tree/main/pose); the implementation in MMPose has not been verified by us.
+
+# Citations
+Please consider citing our papers in your publications if the project helps your research. The BibTeX reference is as follows.
+```BibTeX +@inproceedings{mao2022poseur, + title={Poseur: Direct human pose regression with transformers}, + author={Mao, Weian and Ge, Yongtao and Shen, Chunhua and Tian, Zhi and Wang, Xinlong and Wang, Zhibin and Hengel, Anton van den}, + journal = {Proceedings of the European Conference on Computer Vision {(ECCV)}}, + month = {October}, + year={2022} +} +``` + +## License + +For commercial use, please contact [Chunhua Shen](mailto:chhshen@gmail.com). \ No newline at end of file diff --git a/main/transformer_utils/configs/smpler_x/decoder/face_decoder.py b/main/transformer_utils/configs/smpler_x/decoder/face_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..18261da4bf55eef919556ff6075b8065d3ed5522 --- /dev/null +++ b/main/transformer_utils/configs/smpler_x/decoder/face_decoder.py @@ -0,0 +1,262 @@ +from config import cfg +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=25, metric='mAP', key_indicator='AP', rle_score=True) + +optimizer = dict( + type='AdamW', + lr=1e-3, + weight_decay=1e-4, + paramwise_cfg = dict( + custom_keys={ + # 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1), + # 'query_embed': dict(lr_mult=0.5, decay_mult=1.0), + }, + ) +) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[255, 310]) +total_epochs = 325 + +log_config = dict( + interval=50, hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + ]) + +channel_cfg = dict( + num_output_channels=72, + dataset_joints=72, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +emb_dim = 256 +if cfg.upscale==1: + neck_in_channels = [cfg.feat_dim] + num_levels = 1 +elif cfg.upscale==2: + neck_in_channels = [cfg.feat_dim//2, cfg.feat_dim] + # neck_in_channels = [768, 768] + num_levels = 2 +elif cfg.upscale==4: + neck_in_channels = [cfg.feat_dim//4, cfg.feat_dim//2, cfg.feat_dim] + # neck_in_channels = [768, 768, 768] + num_levels = 3 +elif cfg.upscale==8: + neck_in_channels = [cfg.feat_dim//8, cfg.feat_dim//4, cfg.feat_dim//2, cfg.feat_dim] + # neck_in_channels = [768, 768, 768, 768] + num_levels = 4 +# model settings +norm_cfg = dict(type='BN', requires_grad=True) +# norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='Poseur', + pretrained='torchvision://resnet50', + backbone=dict(type='ResNet', norm_cfg = norm_cfg, depth=50, num_stages=4, out_indices=(0, 1, 2, 3)), + neck=dict( + type='ChannelMapper', + in_channels=neck_in_channels, + kernel_size=1, + out_channels=emb_dim, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + ), + keypoint_head=dict( + type='Poseur_noise_sample', + in_channels=512, + num_queries=channel_cfg['num_output_channels'], + num_reg_fcs=2, + num_joints=channel_cfg['num_output_channels'], + with_box_refine=True, + loss_coord_enc=dict(type='RLELoss_poseur', use_target_weight=True), + loss_coord_dec=dict(type='RLELoss_poseur', use_target_weight=True), + # loss_coord_dec=dict(type='L1Loss', use_target_weight=True, loss_weight=5), + loss_hp_keypoint=dict(type='JointsMSELoss', use_target_weight=True, loss_weight=10), + # loss_coord_keypoint=dict(type='L1Loss', use_target_weight=True, loss_weight=1), + 
positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=emb_dim//2, + normalize=True, + offset=-0.5), + transformer=dict( + type='PoseurTransformer_v3', + num_joints=channel_cfg['num_output_channels'], + query_pose_emb = True, + embed_dims = emb_dim, + encoder=dict( + type='DetrTransformerEncoder_zero_layer', + num_layers=0, + transformerlayers=dict( + type='BaseTransformerLayer', + ffn_cfgs = dict( + embed_dims=emb_dim, + ), + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + num_levels=num_levels, + num_points=4, + embed_dims=emb_dim), + + feedforward_channels=1024, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer_grouped', + ffn_cfgs = dict( + embed_dims=emb_dim, + ), + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=emb_dim, + num_heads=8, + dropout=0.1), + dict( + type='MultiScaleDeformableAttention_post_value', + num_levels=num_levels, + num_points=4, + embed_dims=emb_dim) + ], + num_joints=channel_cfg['num_output_channels'], + feedforward_channels=1024, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + as_two_stage=True, + use_heatmap_loss=False, + ), + train_cfg=dict(image_size=[192, 256]), + test_cfg = dict( + image_size=[192, 256], + flip_test=True, + post_process='default', + shift_heatmap=True, + modulate_kernel=11) +) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + # use_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + det_bbox_thr=0.0, + # use_gt_bbox=True, + # bbox_file='', + use_gt_bbox=False, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownGetBboxCenterScale', padding=1.25), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + # dict( + # type='TopDownGenerateTarget', + # kernel=[(11, 11), (9, 9), (7, 7), (5, 5)], + # encoding='Megvii'), + dict( + target_type='wo_mask', + type='TopDownGenerateCoordAndHeatMapTarget', + encoding='MSRA', + sigma=2), + dict( + type='Collect', + keys=['img', 'coord_target', 'coord_target_weight', 'hp_target', 'hp_target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownGetBboxCenterScale', padding=1.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=32, + # 
samples_per_gpu=64, + workers_per_gpu=8, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + # ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + # img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) + +fp16 = dict(loss_scale='dynamic') diff --git a/main/transformer_utils/configs/smpler_x/decoder/hand_decoder.py b/main/transformer_utils/configs/smpler_x/decoder/hand_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..c1900968fc24fe32e88c98cd3af56a0aa0b3a4e5 --- /dev/null +++ b/main/transformer_utils/configs/smpler_x/decoder/hand_decoder.py @@ -0,0 +1,262 @@ +from config import cfg +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=25, metric='mAP', key_indicator='AP', rle_score=True) + +optimizer = dict( + type='AdamW', + lr=1e-3, + weight_decay=1e-4, + paramwise_cfg = dict( + custom_keys={ + # 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1), + # 'query_embed': dict(lr_mult=0.5, decay_mult=1.0), + }, + ) +) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[255, 310]) +total_epochs = 325 + +log_config = dict( + interval=50, hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + ]) + +channel_cfg = dict( + num_output_channels=20, + dataset_joints=20, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +emb_dim = 256 +if cfg.upscale==1: + neck_in_channels = [cfg.feat_dim] + num_levels = 1 +elif cfg.upscale==2: + neck_in_channels = [cfg.feat_dim//2, cfg.feat_dim] + # neck_in_channels = [768, 768] + num_levels = 2 +elif cfg.upscale==4: + neck_in_channels = [cfg.feat_dim//4, cfg.feat_dim//2, cfg.feat_dim] + # neck_in_channels = [768, 768, 768] + num_levels = 3 +elif cfg.upscale==8: + neck_in_channels = [cfg.feat_dim//8, cfg.feat_dim//4, cfg.feat_dim//2, cfg.feat_dim] + # neck_in_channels = [768, 768, 768, 768] + num_levels = 4 +# model settings +norm_cfg = dict(type='BN', requires_grad=True) +# norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='Poseur', + pretrained='torchvision://resnet50', + backbone=dict(type='ResNet', norm_cfg = norm_cfg, depth=50, num_stages=4, out_indices=(0, 1, 2, 3)), + neck=dict( + type='ChannelMapper', + in_channels=neck_in_channels, + kernel_size=1, + out_channels=emb_dim, + act_cfg=None, + norm_cfg=dict(type='GN', num_groups=32), + ), + keypoint_head=dict( + type='Poseur_noise_sample', + in_channels=512, + num_queries=channel_cfg['num_output_channels'], + num_reg_fcs=2, + num_joints=channel_cfg['num_output_channels'], + with_box_refine=True, + 
loss_coord_enc=dict(type='RLELoss_poseur', use_target_weight=True), + loss_coord_dec=dict(type='RLELoss_poseur', use_target_weight=True), + # loss_coord_dec=dict(type='L1Loss', use_target_weight=True, loss_weight=5), + loss_hp_keypoint=dict(type='JointsMSELoss', use_target_weight=True, loss_weight=10), + # loss_coord_keypoint=dict(type='L1Loss', use_target_weight=True, loss_weight=1), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=emb_dim//2, + normalize=True, + offset=-0.5), + transformer=dict( + type='PoseurTransformer_v3', + num_joints=channel_cfg['num_output_channels'], + query_pose_emb = True, + embed_dims = emb_dim, + encoder=dict( + type='DetrTransformerEncoder_zero_layer', + num_layers=0, + transformerlayers=dict( + type='BaseTransformerLayer', + ffn_cfgs = dict( + embed_dims=emb_dim, + ), + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + num_levels=num_levels, + num_points=4, + embed_dims=emb_dim), + + feedforward_channels=1024, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer_grouped', + ffn_cfgs = dict( + embed_dims=emb_dim, + ), + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=emb_dim, + num_heads=8, + dropout=0.1), + dict( + type='MultiScaleDeformableAttention_post_value', + num_levels=num_levels, + num_points=4, + embed_dims=emb_dim) + ], + feedforward_channels=1024, + num_joints=channel_cfg['num_output_channels'], + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + as_two_stage=True, + use_heatmap_loss=False, + ), + train_cfg=dict(image_size=[192, 256]), + test_cfg = dict( + image_size=[192, 256], + flip_test=True, + post_process='default', + shift_heatmap=True, + modulate_kernel=11) +) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + # use_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + det_bbox_thr=0.0, + # use_gt_bbox=True, + # bbox_file='', + use_gt_bbox=False, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownGetBboxCenterScale', padding=1.25), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + # dict( + # type='TopDownGenerateTarget', + # kernel=[(11, 11), (9, 9), (7, 7), (5, 5)], + # encoding='Megvii'), + dict( + target_type='wo_mask', + type='TopDownGenerateCoordAndHeatMapTarget', + encoding='MSRA', + sigma=2), + dict( + type='Collect', + keys=['img', 'coord_target', 'coord_target_weight', 'hp_target', 'hp_target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownGetBboxCenterScale', padding=1.25), + 
dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=32, + # samples_per_gpu=64, + workers_per_gpu=8, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + # ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + # img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=train_pipeline), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline), +) + +fp16 = dict(loss_scale='dynamic') diff --git a/main/transformer_utils/configs/smpler_x/encoder/body_encoder_base.py b/main/transformer_utils/configs/smpler_x/encoder/body_encoder_base.py new file mode 100644 index 0000000000000000000000000000000000000000..26bdb9a4350e5fd371442113db8eb009d89cb649 --- /dev/null +++ b/main/transformer_utils/configs/smpler_x/encoder/body_encoder_base.py @@ -0,0 +1,169 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.75, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.3, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=768, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + 
test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) \ No newline at end of file diff --git a/main/transformer_utils/configs/smpler_x/encoder/body_encoder_huge.py b/main/transformer_utils/configs/smpler_x/encoder/body_encoder_huge.py new file mode 100644 index 0000000000000000000000000000000000000000..cde1f090dbd2a9c6924363e5dedd36eadb827d89 --- /dev/null +++ b/main/transformer_utils/configs/smpler_x/encoder/body_encoder_huge.py @@ -0,0 +1,169 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=32, + layer_decay_rate=0.85, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=1280, + depth=32, + num_heads=16, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.55, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=1280, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/main/transformer_utils/configs/smpler_x/encoder/body_encoder_large.py b/main/transformer_utils/configs/smpler_x/encoder/body_encoder_large.py new file mode 100644 index 0000000000000000000000000000000000000000..efd95afa09d8326f2a76c55ca1eea85c6cb60dcc --- /dev/null +++ b/main/transformer_utils/configs/smpler_x/encoder/body_encoder_large.py @@ -0,0 +1,169 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=16, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.5, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=1024, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + 
test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) \ No newline at end of file diff --git a/main/transformer_utils/configs/smpler_x/encoder/body_encoder_small.py b/main/transformer_utils/configs/smpler_x/encoder/body_encoder_small.py new file mode 100644 index 0000000000000000000000000000000000000000..0a032877d7c9673704828f43da47a55d90ac0c2e --- /dev/null +++ b/main/transformer_utils/configs/smpler_x/encoder/body_encoder_small.py @@ -0,0 +1,169 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/coco.py' +] +evaluation = dict(interval=10, metric='mAP', save_best='AP') + +optimizer = dict(type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.), + 'pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + } + ) + ) + +optimizer_config = dict(grad_clip=dict(max_norm=1., norm_type=2)) + +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +target_type = 'GaussianHeatmap' +channel_cfg = dict( + num_output_channels=17, + dataset_joints=17, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + ], + inference_channel=[ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 + ]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='ViT', + img_size=(256, 192), + patch_size=16, + embed_dim=384, + depth=12, + num_heads=12, + ratio=1, + use_checkpoint=False, + mlp_ratio=4, + qkv_bias=True, + drop_path_rate=0.1, + ), + keypoint_head=dict( + type='TopdownHeatmapSimpleHead', + in_channels=384, + num_deconv_layers=2, + num_deconv_filters=(256, 256), + num_deconv_kernels=(4, 4), + extra=dict(final_conv_kernel=1, ), + out_channels=channel_cfg['num_output_channels'], + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=False, + target_type=target_type, + modulate_kernel=11, + use_udp=True)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=False, + det_bbox_thr=0.0, + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownHalfBodyTransform', + num_joints_half_body=8, + prob_half_body=0.3), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='TopDownGenerateTarget', + sigma=2, + encoding='UDP', + target_type=target_type), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine', use_udp=True), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/coco' +data = dict( + samples_per_gpu=64, + workers_per_gpu=4, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_train2017.json', + img_prefix=f'{data_root}/train2017/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownCocoDataset', + ann_file=f'{data_root}/annotations/person_keypoints_val2017.json', + img_prefix=f'{data_root}/val2017/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/main/transformer_utils/mmpose/__init__.py b/main/transformer_utils/mmpose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abcf8693e279f59c8c80f55e1797841e593dbd72 --- /dev/null +++ b/main/transformer_utils/mmpose/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import mmpose.ops +from .version import __version__, short_version + + +def digit_version(version_str): + digit_version = [] + for x in version_str.split('.'): + if x.isdigit(): + digit_version.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + digit_version.append(int(patch_version[0]) - 1) + digit_version.append(int(patch_version[1])) + return digit_version + + +mmcv_minimum_version = '1.3.8' +mmcv_maximum_version = '1.8.0' +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version <= digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' + +__all__ = ['__version__', 'short_version'] diff --git a/main/transformer_utils/mmpose/core/__init__.py b/main/transformer_utils/mmpose/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..87f34c570a66dd58d6fb84f79b45063b89526d58 --- /dev/null +++ b/main/transformer_utils/mmpose/core/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .bbox import * # noqa: F401, F403 +from .camera import * # noqa: F401, F403 +from .evaluation import * # noqa: F401, F403 +from .fp16 import * # noqa: F401, F403 +from .optimizers import * # noqa: F401, F403 +from .post_processing import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 +from .visualization import * # noqa: F401, F403 diff --git a/main/transformer_utils/mmpose/core/bbox/__init__.py b/main/transformer_utils/mmpose/core/bbox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..557993386a6c5de8336a92514072c81b48419ba7 --- /dev/null +++ b/main/transformer_utils/mmpose/core/bbox/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .transforms import (bbox_cs2xywh, bbox_xywh2cs, bbox_xywh2xyxy, + bbox_xyxy2xywh) + +__all__ = ['bbox_xywh2xyxy', 'bbox_xyxy2xywh', 'bbox_xywh2cs', 'bbox_cs2xywh'] diff --git a/main/transformer_utils/mmpose/core/bbox/transforms.py b/main/transformer_utils/mmpose/core/bbox/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..703639443a9f327801b6e6b00ca278b2e22a0ee0 --- /dev/null +++ b/main/transformer_utils/mmpose/core/bbox/transforms.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +def bbox_xyxy2xywh(bbox_xyxy): + """Transform the bbox format from x1y1x2y2 to xywh. + + Args: + bbox_xyxy (np.ndarray): Bounding boxes (with scores), shaped (n, 4) or + (n, 5). (left, top, right, bottom, [score]) + + Returns: + np.ndarray: Bounding boxes (with scores), + shaped (n, 4) or (n, 5). (left, top, width, height, [score]) + """ + bbox_xywh = bbox_xyxy.copy() + bbox_xywh[:, 2] = bbox_xywh[:, 2] - bbox_xywh[:, 0] + bbox_xywh[:, 3] = bbox_xywh[:, 3] - bbox_xywh[:, 1] + + return bbox_xywh + + +def bbox_xywh2xyxy(bbox_xywh): + """Transform the bbox format from xywh to x1y1x2y2. + + Args: + bbox_xywh (ndarray): Bounding boxes (with scores), + shaped (n, 4) or (n, 5). (left, top, width, height, [score]) + Returns: + np.ndarray: Bounding boxes (with scores), shaped (n, 4) or + (n, 5). (left, top, right, bottom, [score]) + """ + bbox_xyxy = bbox_xywh.copy() + bbox_xyxy[:, 2] = bbox_xyxy[:, 2] + bbox_xyxy[:, 0] + bbox_xyxy[:, 3] = bbox_xyxy[:, 3] + bbox_xyxy[:, 1] + + return bbox_xyxy + + +def bbox_xywh2cs(bbox, aspect_ratio, padding=1., pixel_std=200.): + """Transform the bbox format from (x,y,w,h) into (center, scale) + + Args: + bbox (ndarray): Single bbox in (x, y, w, h) + aspect_ratio (float): The expected bbox aspect ratio (w over h) + padding (float): Bbox padding factor that will be multilied to scale. + Default: 1.0 + pixel_std (float): The scale normalization factor. Default: 200.0 + + Returns: + tuple: A tuple containing center and scale. + - np.ndarray[float32](2,): Center of the bbox (x, y). + - np.ndarray[float32](2,): Scale of the bbox w & h. + """ + + x, y, w, h = bbox[:4] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + scale = np.array([w, h], dtype=np.float32) / pixel_std + scale = scale * padding + + return center, scale + + +def bbox_cs2xywh(center, scale, padding=1., pixel_std=200.): + """Transform the bbox format from (center, scale) to (x,y,w,h). Note that + this is not an exact inverse operation of ``bbox_xywh2cs`` because the + normalization of aspect ratio in ``bbox_xywh2cs`` is irreversible. 
+ + Args: + center (ndarray): Single bbox center in (x, y) + scale (ndarray): Single bbox scale in (scale_x, scale_y) + padding (float): Bbox padding factor that will be multilied to scale. + Default: 1.0 + pixel_std (float): The scale normalization factor. Default: 200.0 + + Returns: + ndarray: Single bbox in (x, y, w, h) + """ + + wh = scale / padding * pixel_std + xy = center - 0.5 * wh + return np.r_[xy, wh] diff --git a/main/transformer_utils/mmpose/core/camera/__init__.py b/main/transformer_utils/mmpose/core/camera/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a4a3c5526560996791a85f0d84a72a66286486ca --- /dev/null +++ b/main/transformer_utils/mmpose/core/camera/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .camera_base import CAMERAS +from .single_camera import SimpleCamera +from .single_camera_torch import SimpleCameraTorch + +__all__ = ['CAMERAS', 'SimpleCamera', 'SimpleCameraTorch'] diff --git a/main/transformer_utils/mmpose/core/camera/camera_base.py b/main/transformer_utils/mmpose/core/camera/camera_base.py new file mode 100644 index 0000000000000000000000000000000000000000..28b23e7c6279e3613265a949df91f6ced0413b99 --- /dev/null +++ b/main/transformer_utils/mmpose/core/camera/camera_base.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +from mmcv.utils import Registry + +CAMERAS = Registry('camera') + + +class SingleCameraBase(metaclass=ABCMeta): + """Base class for single camera model. + + Args: + param (dict): Camera parameters + + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_world: Project points from camera coordinates to world + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + @abstractmethod + def __init__(self, param): + """Load camera parameters and check validity.""" + + def world_to_camera(self, X): + """Project points from world coordinates to camera coordinates.""" + raise NotImplementedError + + def camera_to_world(self, X): + """Project points from camera coordinates to world coordinates.""" + raise NotImplementedError + + def camera_to_pixel(self, X): + """Project points from camera coordinates to pixel coordinates.""" + raise NotImplementedError + + def world_to_pixel(self, X): + """Project points from world coordinates to pixel coordinates.""" + _X = self.world_to_camera(X) + return self.camera_to_pixel(_X) diff --git a/main/transformer_utils/mmpose/core/camera/single_camera.py b/main/transformer_utils/mmpose/core/camera/single_camera.py new file mode 100644 index 0000000000000000000000000000000000000000..cabd79941af5c81110876e94ce6103cc02ea5078 --- /dev/null +++ b/main/transformer_utils/mmpose/core/camera/single_camera.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from .camera_base import CAMERAS, SingleCameraBase + + +@CAMERAS.register_module() +class SimpleCamera(SingleCameraBase): + """Camera model to calculate coordinate transformation with given + intrinsic/extrinsic camera parameters. + + Note: + The keypoint coordinate should be an np.ndarray with a shape of + [...,J, C] where J is the keypoint number of an instance, and C is + the coordinate dimension. For example: + + [J, C]: shape of joint coordinates of a person with J joints. + [N, J, C]: shape of a batch of person joint coordinates. 
+ [N, T, J, C]: shape of a batch of pose sequences. + + Args: + param (dict): camera parameters including: + - R: 3x3, camera rotation matrix (camera-to-world) + - T: 3x1, camera translation (camera-to-world) + - K: (optional) 2x3, camera intrinsic matrix + - k: (optional) nx1, camera radial distortion coefficients + - p: (optional) mx1, camera tangential distortion coefficients + - f: (optional) 2x1, camera focal length + - c: (optional) 2x1, camera center + if K is not provided, it will be calculated from f and c. + + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + def __init__(self, param): + + self.param = {} + # extrinsic param + R = np.array(param['R'], dtype=np.float32) + T = np.array(param['T'], dtype=np.float32) + assert R.shape == (3, 3) + assert T.shape == (3, 1) + # The camera matrices are transposed in advance because the joint + # coordinates are stored as row vectors. + self.param['R_c2w'] = R.T + self.param['T_c2w'] = T.T + self.param['R_w2c'] = R + self.param['T_w2c'] = -self.param['T_c2w'] @ self.param['R_w2c'] + + # intrinsic param + if 'K' in param: + K = np.array(param['K'], dtype=np.float32) + assert K.shape == (2, 3) + self.param['K'] = K.T + self.param['f'] = np.array([K[0, 0], K[1, 1]])[:, np.newaxis] + self.param['c'] = np.array([K[0, 2], K[1, 2]])[:, np.newaxis] + elif 'f' in param and 'c' in param: + f = np.array(param['f'], dtype=np.float32) + c = np.array(param['c'], dtype=np.float32) + assert f.shape == (2, 1) + assert c.shape == (2, 1) + self.param['K'] = np.concatenate((np.diagflat(f), c), axis=-1).T + self.param['f'] = f + self.param['c'] = c + else: + raise ValueError('Camera intrinsic parameters are missing. 
' + 'Either "K" or "f"&"c" should be provided.') + + # distortion param + if 'k' in param and 'p' in param: + self.undistortion = True + self.param['k'] = np.array(param['k'], dtype=np.float32).flatten() + self.param['p'] = np.array(param['p'], dtype=np.float32).flatten() + assert self.param['k'].size in {3, 6} + assert self.param['p'].size == 2 + else: + self.undistortion = False + + def world_to_camera(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_w2c'] + self.param['T_w2c'] + + def camera_to_world(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_c2w'] + self.param['T_c2w'] + + def camera_to_pixel(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + + _X = X / X[..., 2:] + + if self.undistortion: + k = self.param['k'] + p = self.param['p'] + _X_2d = _X[..., :2] + r2 = (_X_2d**2).sum(-1) + radial = 1 + sum(ki * r2**(i + 1) for i, ki in enumerate(k[:3])) + if k.size == 6: + radial /= 1 + sum( + (ki * r2**(i + 1) for i, ki in enumerate(k[3:]))) + + tangential = 2 * (p[1] * _X[..., 0] + p[0] * _X[..., 1]) + + _X[..., :2] = _X_2d * (radial + tangential)[..., None] + np.outer( + r2, p[::-1]).reshape(_X_2d.shape) + return _X @ self.param['K'] + + def pixel_to_camera(self, X): + assert isinstance(X, np.ndarray) + assert X.ndim >= 2 and X.shape[-1] == 3 + _X = X.copy() + _X[:, :2] = (X[:, :2] - self.param['c'].T) / self.param['f'].T * X[:, + [2]] + return _X diff --git a/main/transformer_utils/mmpose/core/camera/single_camera_torch.py b/main/transformer_utils/mmpose/core/camera/single_camera_torch.py new file mode 100644 index 0000000000000000000000000000000000000000..22eb72f23d6eecf1b5c5a9b570a4f142fcf6e02a --- /dev/null +++ b/main/transformer_utils/mmpose/core/camera/single_camera_torch.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from .camera_base import CAMERAS, SingleCameraBase + + +@CAMERAS.register_module() +class SimpleCameraTorch(SingleCameraBase): + """Camera model to calculate coordinate transformation with given + intrinsic/extrinsic camera parameters. + + Notes: + The keypoint coordinate should be an np.ndarray with a shape of + [...,J, C] where J is the keypoint number of an instance, and C is + the coordinate dimension. For example: + + [J, C]: shape of joint coordinates of a person with J joints. + [N, J, C]: shape of a batch of person joint coordinates. + [N, T, J, C]: shape of a batch of pose sequences. + + Args: + param (dict): camera parameters including: + - R: 3x3, camera rotation matrix (camera-to-world) + - T: 3x1, camera translation (camera-to-world) + - K: (optional) 2x3, camera intrinsic matrix + - k: (optional) nx1, camera radial distortion coefficients + - p: (optional) mx1, camera tangential distortion coefficients + - f: (optional) 2x1, camera focal length + - c: (optional) 2x1, camera center + if K is not provided, it will be calculated from f and c. 
+ + Methods: + world_to_camera: Project points from world coordinates to camera + coordinates + camera_to_pixel: Project points from camera coordinates to pixel + coordinates + world_to_pixel: Project points from world coordinates to pixel + coordinates + """ + + def __init__(self, param, device): + + self.param = {} + # extrinsic param + R = torch.tensor(param['R'], device=device) + T = torch.tensor(param['T'], device=device) + + assert R.shape == (3, 3) + assert T.shape == (3, 1) + # The camera matrices are transposed in advance because the joint + # coordinates are stored as row vectors. + self.param['R_c2w'] = R.T + self.param['T_c2w'] = T.T + self.param['R_w2c'] = R + self.param['T_w2c'] = -self.param['T_c2w'] @ self.param['R_w2c'] + + # intrinsic param + if 'K' in param: + K = torch.tensor(param['K'], device=device) + assert K.shape == (2, 3) + self.param['K'] = K.T + self.param['f'] = torch.tensor([[K[0, 0]], [K[1, 1]]], + device=device) + self.param['c'] = torch.tensor([[K[0, 2]], [K[1, 2]]], + device=device) + elif 'f' in param and 'c' in param: + f = torch.tensor(param['f'], device=device) + c = torch.tensor(param['c'], device=device) + assert f.shape == (2, 1) + assert c.shape == (2, 1) + self.param['K'] = torch.cat([torch.diagflat(f), c], dim=-1).T + self.param['f'] = f + self.param['c'] = c + else: + raise ValueError('Camera intrinsic parameters are missing. ' + 'Either "K" or "f"&"c" should be provided.') + + # distortion param + if 'k' in param and 'p' in param: + self.undistortion = True + self.param['k'] = torch.tensor(param['k'], device=device).view(-1) + self.param['p'] = torch.tensor(param['p'], device=device).view(-1) + assert len(self.param['k']) in {3, 6} + assert len(self.param['p']) == 2 + else: + self.undistortion = False + + def world_to_camera(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_w2c'] + self.param['T_w2c'] + + def camera_to_world(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + return X @ self.param['R_c2w'] + self.param['T_c2w'] + + def camera_to_pixel(self, X): + assert isinstance(X, torch.Tensor) + assert X.ndim >= 2 and X.shape[-1] == 3 + + _X = X / X[..., 2:] + + if self.undistortion: + k = self.param['k'] + p = self.param['p'] + _X_2d = _X[..., :2] + r2 = (_X_2d**2).sum(-1) + radial = 1 + sum(ki * r2**(i + 1) for i, ki in enumerate(k[:3])) + if k.size == 6: + radial /= 1 + sum( + (ki * r2**(i + 1) for i, ki in enumerate(k[3:]))) + + tangential = 2 * (p[1] * _X[..., 0] + p[0] * _X[..., 1]) + + _X[..., :2] = _X_2d * (radial + tangential)[..., None] + torch.ger( + r2, p.flip([0])).reshape(_X_2d.shape) + return _X @ self.param['K'] diff --git a/main/transformer_utils/mmpose/core/distributed_wrapper.py b/main/transformer_utils/mmpose/core/distributed_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..c67aceec992085e9952ea70c62009e9ec1db30ca --- /dev/null +++ b/main/transformer_utils/mmpose/core/distributed_wrapper.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
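+# Usage sketch (editorial addition, not part of the upstream module): the
+# wrapper defined below is constructed like a regular DDP model and forwards
+# ``train_step``/``val_step`` to the wrapped module.  ``model``, ``data_batch``
+# and ``optimizer`` are assumed to exist in the caller, with ``model``
+# following the usual mmcv ``train_step(data_batch, optimizer)`` convention:
+#
+#     model = DistributedDataParallelWrapper(
+#         model, device_ids=[torch.cuda.current_device()])
+#     outputs = model.train_step(data_batch, optimizer)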
+import torch +import torch.nn as nn +from mmcv.parallel import MODULE_WRAPPERS as MMCV_MODULE_WRAPPERS +from mmcv.parallel import MMDistributedDataParallel +from mmcv.parallel.scatter_gather import scatter_kwargs +from mmcv.utils import Registry +from torch.cuda._utils import _get_device_index + +MODULE_WRAPPERS = Registry('module wrapper', parent=MMCV_MODULE_WRAPPERS) + + +@MODULE_WRAPPERS.register_module() +class DistributedDataParallelWrapper(nn.Module): + """A DistributedDataParallel wrapper for models in 3D mesh estimation task. + + In 3D mesh estimation task, there is a need to wrap different modules in + the models with separate DistributedDataParallel. Otherwise, it will cause + errors for GAN training. + More specific, the GAN model, usually has two sub-modules: + generator and discriminator. If we wrap both of them in one + standard DistributedDataParallel, it will cause errors during training, + because when we update the parameters of the generator (or discriminator), + the parameters of the discriminator (or generator) is not updated, which is + not allowed for DistributedDataParallel. + So we design this wrapper to separately wrap DistributedDataParallel + for generator and discriminator. + + In this wrapper, we perform two operations: + 1. Wrap the modules in the models with separate MMDistributedDataParallel. + Note that only modules with parameters will be wrapped. + 2. Do scatter operation for 'forward', 'train_step' and 'val_step'. + + Note that the arguments of this wrapper is the same as those in + `torch.nn.parallel.distributed.DistributedDataParallel`. + + Args: + module (nn.Module): Module that needs to be wrapped. + device_ids (list[int | `torch.device`]): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + dim (int, optional): Same as that in the official scatter function in + pytorch. Defaults to 0. + broadcast_buffers (bool): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + Defaults to False. + find_unused_parameters (bool, optional): Same as that in + `torch.nn.parallel.distributed.DistributedDataParallel`. + Traverse the autograd graph of all tensors contained in returned + value of the wrapped module’s forward function. Defaults to False. + kwargs (dict): Other arguments used in + `torch.nn.parallel.distributed.DistributedDataParallel`. + """ + + def __init__(self, + module, + device_ids, + dim=0, + broadcast_buffers=False, + find_unused_parameters=False, + **kwargs): + super().__init__() + assert len(device_ids) == 1, ( + 'Currently, DistributedDataParallelWrapper only supports one' + 'single CUDA device for each process.' + f'The length of device_ids must be 1, but got {len(device_ids)}.') + self.module = module + self.dim = dim + self.to_ddp( + device_ids=device_ids, + dim=dim, + broadcast_buffers=broadcast_buffers, + find_unused_parameters=find_unused_parameters, + **kwargs) + self.output_device = _get_device_index(device_ids[0], True) + + def to_ddp(self, device_ids, dim, broadcast_buffers, + find_unused_parameters, **kwargs): + """Wrap models with separate MMDistributedDataParallel. + + It only wraps the modules with parameters. 
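+        Submodules that have no parameters, or whose parameters do not
+        require gradients, are only moved to the current CUDA device and are
+        not wrapped.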
+ """ + for name, module in self.module._modules.items(): + if next(module.parameters(), None) is None: + module = module.cuda() + elif all(not p.requires_grad for p in module.parameters()): + module = module.cuda() + else: + module = MMDistributedDataParallel( + module.cuda(), + device_ids=device_ids, + dim=dim, + broadcast_buffers=broadcast_buffers, + find_unused_parameters=find_unused_parameters, + **kwargs) + self.module._modules[name] = module + + def scatter(self, inputs, kwargs, device_ids): + """Scatter function. + + Args: + inputs (Tensor): Input Tensor. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + device_ids (int): Device id. + """ + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def forward(self, *inputs, **kwargs): + """Forward function. + + Args: + inputs (tuple): Input data. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + return self.module(*inputs[0], **kwargs[0]) + + def train_step(self, *inputs, **kwargs): + """Train step function. + + Args: + inputs (Tensor): Input Tensor. + kwargs (dict): Args for + ``mmcv.parallel.scatter_gather.scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + output = self.module.train_step(*inputs[0], **kwargs[0]) + return output + + def val_step(self, *inputs, **kwargs): + """Validation step function. + + Args: + inputs (tuple): Input data. + kwargs (dict): Args for ``scatter_kwargs``. + """ + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + output = self.module.val_step(*inputs[0], **kwargs[0]) + return output diff --git a/main/transformer_utils/mmpose/core/evaluation/__init__.py b/main/transformer_utils/mmpose/core/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f9378429c8ddaa15f7ac17446bc9d484987df16 --- /dev/null +++ b/main/transformer_utils/mmpose/core/evaluation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bottom_up_eval import (aggregate_scale, aggregate_stage_flip, + flip_feature_maps, get_group_preds, + split_ae_outputs) +from .eval_hooks import DistEvalHook, EvalHook +from .mesh_eval import compute_similarity_transform +from .pose3d_eval import keypoint_3d_auc, keypoint_3d_pck, keypoint_mpjpe +from .top_down_eval import (keypoint_auc, keypoint_epe, keypoint_pck_accuracy, + keypoints_from_heatmaps, keypoints_from_heatmaps3d, + keypoints_from_regression, + multilabel_classification_accuracy, + pose_pck_accuracy, post_dark_udp) + +__all__ = [ + 'EvalHook', 'DistEvalHook', 'pose_pck_accuracy', 'keypoints_from_heatmaps', + 'keypoints_from_regression', 'keypoint_pck_accuracy', 'keypoint_3d_pck', + 'keypoint_3d_auc', 'keypoint_auc', 'keypoint_epe', 'get_group_preds', + 'split_ae_outputs', 'flip_feature_maps', 'aggregate_stage_flip', + 'aggregate_scale', 'compute_similarity_transform', 'post_dark_udp', + 'keypoint_mpjpe', 'keypoints_from_heatmaps3d', + 'multilabel_classification_accuracy' +] diff --git a/main/transformer_utils/mmpose/core/evaluation/bottom_up_eval.py b/main/transformer_utils/mmpose/core/evaluation/bottom_up_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..7b37d7c98e684284e3863922e7c7d2abedce0e24 --- /dev/null +++ b/main/transformer_utils/mmpose/core/evaluation/bottom_up_eval.py @@ -0,0 +1,333 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
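+# Usage sketch (editorial addition, not part of the upstream module): for a
+# hypothetical single-stage associative-embedding model with 17 keypoints,
+# the helpers defined below are typically chained as
+#
+#     heatmaps, tags = split_ae_outputs(
+#         outputs, num_joints=17, with_heatmaps=[True], with_ae=[True],
+#         select_output_index=[0])
+#     heatmaps_flipped = flip_feature_maps(heatmaps, flip_index=flip_index)
+#
+# where ``outputs`` is the list of network output tensors and ``flip_index``
+# is the dataset's left/right keypoint permutation (both assumed here).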
+import numpy as np +import torch + +from mmpose.core.post_processing import (get_warp_matrix, transform_preds, + warp_affine_joints) + + +def split_ae_outputs(outputs, num_joints, with_heatmaps, with_ae, + select_output_index): + """Split multi-stage outputs into heatmaps & tags. + + Args: + outputs (list(Tensor)): Outputs of network + num_joints (int): Number of joints + with_heatmaps (list[bool]): Option to output + heatmaps for different stages. + with_ae (list[bool]): Option to output + ae tags for different stages. + select_output_index (list[int]): Output keep the selected index + + Returns: + tuple: A tuple containing multi-stage outputs. + + - list[Tensor]: multi-stage heatmaps. + - list[Tensor]: multi-stage tags. + """ + + heatmaps = [] + tags = [] + + # aggregate heatmaps from different stages + for i, output in enumerate(outputs): + if i not in select_output_index: + continue + # staring index of the associative embeddings + offset_feat = num_joints if with_heatmaps[i] else 0 + if with_heatmaps[i]: + heatmaps.append(output[:, :num_joints]) + if with_ae[i]: + tags.append(output[:, offset_feat:]) + + return heatmaps, tags + + +def flip_feature_maps(feature_maps, flip_index=None): + """Flip the feature maps and swap the channels. + + Args: + feature_maps (list[Tensor]): Feature maps. + flip_index (list[int] | None): Channel-flip indexes. + If None, do not flip channels. + + Returns: + list[Tensor]: Flipped feature_maps. + """ + flipped_feature_maps = [] + for feature_map in feature_maps: + feature_map = torch.flip(feature_map, [3]) + if flip_index is not None: + flipped_feature_maps.append(feature_map[:, flip_index, :, :]) + else: + flipped_feature_maps.append(feature_map) + + return flipped_feature_maps + + +def _resize_average(feature_maps, align_corners, index=-1, resize_size=None): + """Resize the feature maps and compute the average. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. + If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. + """ + + if feature_maps is None: + return None + feature_maps_avg = 0 + + feature_map_list = _resize_concate( + feature_maps, align_corners, index=index, resize_size=resize_size) + for feature_map in feature_map_list: + feature_maps_avg += feature_map + + feature_maps_avg /= len(feature_map_list) + return [feature_maps_avg] + + +def _resize_unsqueeze_concat(feature_maps, + align_corners, + index=-1, + resize_size=None): + """Resize, unsqueeze and concatenate the feature_maps. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. + If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. 
+ """ + if feature_maps is None: + return None + feature_map_list = _resize_concate( + feature_maps, align_corners, index=index, resize_size=resize_size) + + feat_dim = len(feature_map_list[0].shape) - 1 + output_feature_maps = torch.cat( + [torch.unsqueeze(fmap, dim=feat_dim + 1) for fmap in feature_map_list], + dim=feat_dim + 1) + return [output_feature_maps] + + +def _resize_concate(feature_maps, align_corners, index=-1, resize_size=None): + """Resize and concatenate the feature_maps. + + Args: + feature_maps (list[Tensor]): Feature maps. + align_corners (bool): Align corners when performing interpolation. + index (int): Only used when `resize_size' is None. + If `resize_size' is None, the target size is the size + of the indexed feature maps. + resize_size (list[int, int]): The target size [w, h]. + + Returns: + list[Tensor]: Averaged feature_maps. + """ + if feature_maps is None: + return None + + feature_map_list = [] + + if index < 0: + index += len(feature_maps) + + if resize_size is None: + resize_size = (feature_maps[index].size(2), + feature_maps[index].size(3)) + + for feature_map in feature_maps: + ori_size = (feature_map.size(2), feature_map.size(3)) + if ori_size != resize_size: + feature_map = torch.nn.functional.interpolate( + feature_map, + size=resize_size, + mode='bilinear', + align_corners=align_corners) + + feature_map_list.append(feature_map) + + return feature_map_list + + +def aggregate_stage_flip(feature_maps, + feature_maps_flip, + index=-1, + project2image=True, + size_projected=None, + align_corners=False, + aggregate_stage='concat', + aggregate_flip='average'): + """Inference the model to get multi-stage outputs (heatmaps & tags), and + resize them to base sizes. + + Args: + feature_maps (list[Tensor]): feature_maps can be heatmaps, + tags, and pafs. + feature_maps_flip (list[Tensor] | None): flipped feature_maps. + feature maps can be heatmaps, tags, and pafs. + project2image (bool): Option to resize to base scale. + size_projected (list[int, int]): Base size of heatmaps [w, h]. + align_corners (bool): Align corners when performing interpolation. + aggregate_stage (str): Methods to aggregate multi-stage feature maps. + Options: 'concat', 'average'. Default: 'concat. + + - 'concat': Concatenate the original and the flipped feature maps. + - 'average': Get the average of the original and the flipped + feature maps. + aggregate_flip (str): Methods to aggregate the original and + the flipped feature maps. Options: 'concat', 'average', 'none'. + Default: 'average. + + - 'concat': Concatenate the original and the flipped feature maps. + - 'average': Get the average of the original and the flipped + feature maps.. + - 'none': no flipped feature maps. + + Returns: + list[Tensor]: Aggregated feature maps with shape [NxKxWxH]. 
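+
+    Example (illustrative, editorial addition)::
+
+        # average the original and flipped maps and resize them to the input
+        # image size (w, h); ``heatmaps`` and ``heatmaps_flipped`` are the
+        # outputs of ``split_ae_outputs``/``flip_feature_maps`` above
+        fused = aggregate_stage_flip(
+            heatmaps, heatmaps_flipped, index=-1, project2image=True,
+            size_projected=(w, h), aggregate_stage='concat',
+            aggregate_flip='average')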
+ """ + + if feature_maps_flip is None: + aggregate_flip = 'none' + + output_feature_maps = [] + + if aggregate_stage == 'average': + _aggregate_stage_func = _resize_average + elif aggregate_stage == 'concat': + _aggregate_stage_func = _resize_concate + else: + NotImplementedError() + + if project2image and size_projected: + _origin = _aggregate_stage_func( + feature_maps, + align_corners, + index=index, + resize_size=(size_projected[1], size_projected[0])) + + _flipped = _aggregate_stage_func( + feature_maps_flip, + align_corners, + index=index, + resize_size=(size_projected[1], size_projected[0])) + else: + _origin = _aggregate_stage_func( + feature_maps, align_corners, index=index, resize_size=None) + _flipped = _aggregate_stage_func( + feature_maps_flip, align_corners, index=index, resize_size=None) + + if aggregate_flip == 'average': + assert feature_maps_flip is not None + for _ori, _fli in zip(_origin, _flipped): + output_feature_maps.append((_ori + _fli) / 2.0) + + elif aggregate_flip == 'concat': + assert feature_maps_flip is not None + output_feature_maps.append(*_origin) + output_feature_maps.append(*_flipped) + + elif aggregate_flip == 'none': + if isinstance(_origin, list): + output_feature_maps.append(*_origin) + else: + output_feature_maps.append(_origin) + else: + NotImplementedError() + + return output_feature_maps + + +def aggregate_scale(feature_maps_list, + align_corners=False, + aggregate_scale='average'): + """Aggregate multi-scale outputs. + + Note: + batch size: N + keypoints num : K + heatmap width: W + heatmap height: H + + Args: + feature_maps_list (list[Tensor]): Aggregated feature maps. + project2image (bool): Option to resize to base scale. + align_corners (bool): Align corners when performing interpolation. + aggregate_scale (str): Methods to aggregate multi-scale feature maps. + Options: 'average', 'unsqueeze_concat'. + + - 'average': Get the average of the feature maps. + - 'unsqueeze_concat': Concatenate the feature maps along new axis. + Default: 'average. + + Returns: + Tensor: Aggregated feature maps. + """ + + if aggregate_scale == 'average': + output_feature_maps = _resize_average( + feature_maps_list, align_corners, index=0, resize_size=None) + + elif aggregate_scale == 'unsqueeze_concat': + output_feature_maps = _resize_unsqueeze_concat( + feature_maps_list, align_corners, index=0, resize_size=None) + else: + NotImplementedError() + + return output_feature_maps[0] + + +def get_group_preds(grouped_joints, + center, + scale, + heatmap_size, + use_udp=False): + """Transform the grouped joints back to the image. + + Args: + grouped_joints (list): Grouped person joints. + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + heatmap_size (np.ndarray[2, ]): Size of the destination heatmaps. + use_udp (bool): Unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR'2020). + + Returns: + list: List of the pose result for each person. 
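+
+    Example (illustrative, editorial addition)::
+
+        # map grouped joints from heatmap coordinates back to the image;
+        # ``grouped_joints``, ``center`` and ``scale`` come from the
+        # bottom-up keypoint parser and are assumed here
+        results = get_group_preds(
+            grouped_joints, center, scale, heatmap_size=np.array([128, 128]))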
+ """ + if len(grouped_joints) == 0: + return [] + + if use_udp: + if grouped_joints[0].shape[0] > 0: + heatmap_size_t = np.array(heatmap_size, dtype=np.float32) - 1.0 + trans = get_warp_matrix( + theta=0, + size_input=heatmap_size_t, + size_dst=scale, + size_target=heatmap_size_t) + grouped_joints[0][..., :2] = \ + warp_affine_joints(grouped_joints[0][..., :2], trans) + results = [person for person in grouped_joints[0]] + else: + results = [] + for person in grouped_joints[0]: + joints = transform_preds(person, center, scale, heatmap_size) + results.append(joints) + + return results diff --git a/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py b/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..b35a9c6a990c69b2beac9e73f893f97c237e4783 --- /dev/null +++ b/main/transformer_utils/mmpose/core/evaluation/eval_hooks.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv.runner import DistEvalHook as _DistEvalHook +from mmcv.runner import EvalHook as _EvalHook + +MMPOSE_GREATER_KEYS = [ + 'acc', 'ap', 'ar', 'pck', 'auc', '3dpck', 'p-3dpck', '3dauc', 'p-3dauc', + 'pcp' +] +MMPOSE_LESS_KEYS = ['loss', 'epe', 'nme', 'mpjpe', 'p-mpjpe', 'n-mpjpe'] + + +class EvalHook(_EvalHook): + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=MMPOSE_GREATER_KEYS, + less_keys=MMPOSE_LESS_KEYS, + **eval_kwargs): + + if test_fn is None: + from mmpose.apis import single_gpu_test + test_fn = single_gpu_test + + # to be compatible with the config before v0.16.0 + + # remove "gpu_collect" from eval_kwargs + if 'gpu_collect' in eval_kwargs: + warnings.warn( + '"gpu_collect" will be deprecated in EvalHook.' + 'Please remove it from the config.', DeprecationWarning) + _ = eval_kwargs.pop('gpu_collect') + + # update "save_best" according to "key_indicator" and remove the + # latter from eval_kwargs + if 'key_indicator' in eval_kwargs or isinstance(save_best, bool): + warnings.warn( + '"key_indicator" will be deprecated in EvalHook.' + 'Please use "save_best" to specify the metric key,' + 'e.g., save_best="AP".', DeprecationWarning) + + key_indicator = eval_kwargs.pop('key_indicator', 'AP') + if save_best is True and key_indicator is None: + raise ValueError('key_indicator should not be None, when ' + 'save_best is set to True.') + save_best = key_indicator + + super().__init__(dataloader, start, interval, by_epoch, save_best, + rule, test_fn, greater_keys, less_keys, **eval_kwargs) + + +class DistEvalHook(_DistEvalHook): + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=MMPOSE_GREATER_KEYS, + less_keys=MMPOSE_LESS_KEYS, + broadcast_bn_buffer=True, + tmpdir=None, + gpu_collect=False, + **eval_kwargs): + + if test_fn is None: + from mmpose.apis import multi_gpu_test + test_fn = multi_gpu_test + + # to be compatible with the config before v0.16.0 + + # update "save_best" according to "key_indicator" and remove the + # latter from eval_kwargs + if 'key_indicator' in eval_kwargs or isinstance(save_best, bool): + warnings.warn( + '"key_indicator" will be deprecated in EvalHook.' 
+ 'Please use "save_best" to specify the metric key,' + 'e.g., save_best="AP".', DeprecationWarning) + + key_indicator = eval_kwargs.pop('key_indicator', 'AP') + if save_best is True and key_indicator is None: + raise ValueError('key_indicator should not be None, when ' + 'save_best is set to True.') + save_best = key_indicator + + super().__init__(dataloader, start, interval, by_epoch, save_best, + rule, test_fn, greater_keys, less_keys, + broadcast_bn_buffer, tmpdir, gpu_collect, + **eval_kwargs) diff --git a/main/transformer_utils/mmpose/core/evaluation/mesh_eval.py b/main/transformer_utils/mmpose/core/evaluation/mesh_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..683b4539b29d1829a324de424c6d9f85a7037e5d --- /dev/null +++ b/main/transformer_utils/mmpose/core/evaluation/mesh_eval.py @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/akanazawa/hmr +# Original licence: Copyright (c) 2018 akanazawa, under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np + + +def compute_similarity_transform(source_points, target_points): + """Computes a similarity transform (sR, t) that takes a set of 3D points + source_points (N x 3) closest to a set of 3D points target_points, where R + is an 3x3 rotation matrix, t 3x1 translation, s scale. And return the + transformed 3D points source_points_hat (N x 3). i.e. solves the orthogonal + Procrutes problem. + + Note: + Points number: N + + Args: + source_points (np.ndarray): Source point set with shape [N, 3]. + target_points (np.ndarray): Target point set with shape [N, 3]. + + Returns: + np.ndarray: Transformed source point set with shape [N, 3]. + """ + + assert target_points.shape[0] == source_points.shape[0] + assert target_points.shape[1] == 3 and source_points.shape[1] == 3 + + source_points = source_points.T + target_points = target_points.T + + # 1. Remove mean. + mu1 = source_points.mean(axis=1, keepdims=True) + mu2 = target_points.mean(axis=1, keepdims=True) + X1 = source_points - mu1 + X2 = target_points - mu2 + + # 2. Compute variance of X1 used for scale. + var1 = np.sum(X1**2) + + # 3. The outer product of X1 and X2. + K = X1.dot(X2.T) + + # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are + # singular vectors of K. + U, _, Vh = np.linalg.svd(K) + V = Vh.T + # Construct Z that fixes the orientation of R to get det(R)=1. + Z = np.eye(U.shape[0]) + Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T))) + # Construct R. + R = V.dot(Z.dot(U.T)) + + # 5. Recover scale. + scale = np.trace(R.dot(K)) / var1 + + # 6. Recover translation. + t = mu2 - scale * (R.dot(mu1)) + + # 7. Transform the source points: + source_points_hat = scale * R.dot(source_points) + t + + source_points_hat = source_points_hat.T + + return source_points_hat diff --git a/main/transformer_utils/mmpose/core/evaluation/pose3d_eval.py b/main/transformer_utils/mmpose/core/evaluation/pose3d_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..545778ca7441c2d3e8ec58449c8ca7b162322e9e --- /dev/null +++ b/main/transformer_utils/mmpose/core/evaluation/pose3d_eval.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
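+# Usage sketch (editorial addition, not part of the upstream module): the
+# metrics below compare predicted and ground-truth 3D joints, e.g.
+#
+#     mpjpe = keypoint_mpjpe(pred, gt, mask, alignment='none')
+#     pa_mpjpe = keypoint_mpjpe(pred, gt, mask, alignment='procrustes')
+#
+# where ``pred`` and ``gt`` are float arrays of shape [N, K, 3] in the same
+# unit (e.g. millimetres) and ``mask`` is a boolean [N, K] visibility array
+# (all assumed here).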
+import numpy as np + +from .mesh_eval import compute_similarity_transform + + +def keypoint_mpjpe(pred, gt, mask, alignment='none'): + """Calculate the mean per-joint position error (MPJPE) and the error after + rigid alignment with the ground truth (P-MPJPE). + + Note: + - batch_size: N + - num_keypoints: K + - keypoint_dims: C + + Args: + pred (np.ndarray): Predicted keypoint location with shape [N, K, C]. + gt (np.ndarray): Groundtruth keypoint location with shape [N, K, C]. + mask (np.ndarray): Visibility of the target with shape [N, K]. + False for invisible joints, and True for visible. + Invisible joints will be ignored for accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in + scale, rotation and translation. + Returns: + tuple: A tuple containing joint position errors + + - (float | np.ndarray): mean per-joint position error (mpjpe). + - (float | np.ndarray): mpjpe after rigid alignment with the + ground truth (p-mpjpe). + """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1)[mask].mean() + + return error + + +def keypoint_3d_pck(pred, gt, mask, alignment='none', threshold=0.15): + """Calculate the Percentage of Correct Keypoints (3DPCK) w. or w/o rigid + alignment. + + Paper ref: `Monocular 3D Human Pose Estimation In The Wild Using Improved + CNN Supervision' 3DV'2017. `__ . + + Note: + - batch_size: N + - num_keypoints: K + - keypoint_dims: C + + Args: + pred (np.ndarray[N, K, C]): Predicted keypoint location. + gt (np.ndarray[N, K, C]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in scale, + rotation and translation. + + threshold: If L2 distance between the prediction and the groundtruth + is less then threshold, the predicted result is considered as + correct. Default: 0.15 (m). + + Returns: + pck: percentage of correct keypoints. 
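+
+    Example (illustrative, editorial addition)::
+
+        # 3DPCK at a 150 mm threshold (coordinates in metres) after rigid
+        # alignment; ``pred``, ``gt`` and ``mask`` are assumed to come from
+        # the evaluation loop
+        pck = keypoint_3d_pck(pred, gt, mask, alignment='procrustes',
+                              threshold=0.15)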
+ """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1) + pck = (error < threshold).astype(np.float32)[mask].mean() * 100 + + return pck + + +def keypoint_3d_auc(pred, gt, mask, alignment='none'): + """Calculate the Area Under the Curve (3DAUC) computed for a range of 3DPCK + thresholds. + + Paper ref: `Monocular 3D Human Pose Estimation In The Wild Using Improved + CNN Supervision' 3DV'2017. `__ . + This implementation is derived from mpii_compute_3d_pck.m, which is + provided as part of the MPI-INF-3DHP test data release. + + Note: + batch_size: N + num_keypoints: K + keypoint_dims: C + + Args: + pred (np.ndarray[N, K, C]): Predicted keypoint location. + gt (np.ndarray[N, K, C]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + alignment (str, optional): method to align the prediction with the + groundtruth. Supported options are: + + - ``'none'``: no alignment will be applied + - ``'scale'``: align in the least-square sense in scale + - ``'procrustes'``: align in the least-square sense in scale, + rotation and translation. + + Returns: + auc: AUC computed for a range of 3DPCK thresholds. + """ + assert mask.any() + + if alignment == 'none': + pass + elif alignment == 'procrustes': + pred = np.stack([ + compute_similarity_transform(pred_i, gt_i) + for pred_i, gt_i in zip(pred, gt) + ]) + elif alignment == 'scale': + pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred) + pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt) + scale_factor = pred_dot_gt / pred_dot_pred + pred = pred * scale_factor[:, None, None] + else: + raise ValueError(f'Invalid value for alignment: {alignment}') + + error = np.linalg.norm(pred - gt, ord=2, axis=-1) + + thresholds = np.linspace(0., 0.15, 31) + pck_values = np.zeros(len(thresholds)) + for i in range(len(thresholds)): + pck_values[i] = (error < thresholds[i]).astype(np.float32)[mask].mean() + + auc = pck_values.mean() * 100 + + return auc diff --git a/main/transformer_utils/mmpose/core/evaluation/top_down_eval.py b/main/transformer_utils/mmpose/core/evaluation/top_down_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..ee6a2501cf1eec1b16f7d58bf9fd62da0fa48ccf --- /dev/null +++ b/main/transformer_utils/mmpose/core/evaluation/top_down_eval.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import cv2 +import numpy as np + +from mmpose.core.post_processing import transform_preds + + +def _calc_distances(preds, targets, mask, normalize): + """Calculate the normalized distances between preds and target. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (normally, D=2 or D=3) + + Args: + preds (np.ndarray[N, K, D]): Predicted keypoint location. + targets (np.ndarray[N, K, D]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. 
Invisible joints will be ignored for + accuracy calculation. + normalize (np.ndarray[N, D]): Typical value is heatmap_size + + Returns: + np.ndarray[K, N]: The normalized distances. \ + If target keypoints are missing, the distance is -1. + """ + N, K, _ = preds.shape + # set mask=0 when normalize==0 + _mask = mask.copy() + _mask[np.where((normalize == 0).sum(1))[0], :] = False + distances = np.full((N, K), -1, dtype=np.float32) + # handle invalid values + normalize[np.where(normalize <= 0)] = 1e6 + distances[_mask] = np.linalg.norm( + ((preds - targets) / normalize[:, None, :])[_mask], axis=-1) + return distances.T + + +def _distance_acc(distances, thr=0.5): + """Return the percentage below the distance threshold, while ignoring + distances values with -1. + + Note: + batch_size: N + Args: + distances (np.ndarray[N, ]): The normalized distances. + thr (float): Threshold of the distances. + + Returns: + float: Percentage of distances below the threshold. \ + If all target keypoints are missing, return -1. + """ + distance_valid = distances != -1 + num_distance_valid = distance_valid.sum() + if num_distance_valid > 0: + return (distances[distance_valid] < thr).sum() / num_distance_valid + return -1 + + +def _get_max_preds(heatmaps): + """Get keypoint predictions from score maps. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + + Returns: + tuple: A tuple containing aggregated results. + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + assert isinstance(heatmaps, + np.ndarray), ('heatmaps should be numpy.ndarray') + assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' + + N, K, _, W = heatmaps.shape + heatmaps_reshaped = heatmaps.reshape((N, K, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) + maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + preds[:, :, 0] = preds[:, :, 0] % W + preds[:, :, 1] = preds[:, :, 1] // W + + preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1) + return preds, maxvals + + +def _get_max_preds_3d(heatmaps): + """Get keypoint predictions from 3D score maps. + + Note: + batch size: N + num keypoints: K + heatmap depth size: D + heatmap height: H + heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps. + + Returns: + tuple: A tuple containing aggregated results. + + - preds (np.ndarray[N, K, 3]): Predicted keypoint location. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + assert isinstance(heatmaps, np.ndarray), \ + ('heatmaps should be numpy.ndarray') + assert heatmaps.ndim == 5, 'heatmaps should be 5-ndim' + + N, K, D, H, W = heatmaps.shape + heatmaps_reshaped = heatmaps.reshape((N, K, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1)) + maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1)) + + preds = np.zeros((N, K, 3), dtype=np.float32) + _idx = idx[..., 0] + preds[..., 2] = _idx // (H * W) + preds[..., 1] = (_idx // W) % H + preds[..., 0] = _idx % W + + preds = np.where(maxvals > 0.0, preds, -1) + return preds, maxvals + + +def pose_pck_accuracy(output, target, mask, thr=0.05, normalize=None): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints from heatmaps. 
+ + Note: + PCK metric measures accuracy of the localization of the body joints. + The distances between predicted positions and the ground-truth ones + are typically normalized by the bounding box size. + The threshold (thr) of the normalized distance is commonly set + as 0.05, 0.1 or 0.2 etc. + + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output (np.ndarray[N, K, H, W]): Model output heatmaps. + target (np.ndarray[N, K, H, W]): Groundtruth heatmaps. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + thr (float): Threshold of PCK calculation. Default 0.05. + normalize (np.ndarray[N, 2]): Normalization factor for H&W. + + Returns: + tuple: A tuple containing keypoint accuracy. + + - np.ndarray[K]: Accuracy of each keypoint. + - float: Averaged accuracy across all keypoints. + - int: Number of valid keypoints. + """ + N, K, H, W = output.shape + if K == 0: + return None, 0, 0 + if normalize is None: + normalize = np.tile(np.array([[H, W]]), (N, 1)) + + pred, _ = _get_max_preds(output) + gt, _ = _get_max_preds(target) + return keypoint_pck_accuracy(pred, gt, mask, thr, normalize) + + +def keypoint_pck_accuracy(pred, gt, mask, thr, normalize): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints for coordinates. + + Note: + PCK metric measures accuracy of the localization of the body joints. + The distances between predicted positions and the ground-truth ones + are typically normalized by the bounding box size. + The threshold (thr) of the normalized distance is commonly set + as 0.05, 0.1 or 0.2 etc. + + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + thr (float): Threshold of PCK calculation. + normalize (np.ndarray[N, 2]): Normalization factor for H&W. + + Returns: + tuple: A tuple containing keypoint accuracy. + + - acc (np.ndarray[K]): Accuracy of each keypoint. + - avg_acc (float): Averaged accuracy across all keypoints. + - cnt (int): Number of valid keypoints. + """ + distances = _calc_distances(pred, gt, mask, normalize) + + acc = np.array([_distance_acc(d, thr) for d in distances]) + valid_acc = acc[acc >= 0] + cnt = len(valid_acc) + avg_acc = valid_acc.mean() if cnt > 0 else 0 + return acc, avg_acc, cnt + + +def keypoint_auc(pred, gt, mask, normalize, num_step=20): + """Calculate the pose accuracy of PCK for each individual keypoint and the + averaged accuracy across all keypoints for coordinates. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + normalize (float): Normalization factor. + + Returns: + float: Area under curve. 
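+
+    Example (illustrative, editorial addition)::
+
+        # area under the PCK curve over ``num_step`` thresholds evenly spaced
+        # in [0, 1), normalising distances by a hypothetical head size of
+        # 30 pixels
+        auc = keypoint_auc(pred, gt, mask, normalize=30, num_step=20)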
+ """ + nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1)) + x = [1.0 * i / num_step for i in range(num_step)] + y = [] + for thr in x: + _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor) + y.append(avg_acc) + + auc = 0 + for i in range(num_step): + auc += 1.0 / num_step * y[i] + return auc + + +def keypoint_nme(pred, gt, mask, normalize_factor): + """Calculate the normalized mean error (NME). + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + normalize_factor (np.ndarray[N, 2]): Normalization factor. + + Returns: + float: normalized mean error + """ + distances = _calc_distances(pred, gt, mask, normalize_factor) + distance_valid = distances[distances != -1] + return distance_valid.sum() / max(1, len(distance_valid)) + + +def keypoint_epe(pred, gt, mask): + """Calculate the end-point error. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + pred (np.ndarray[N, K, 2]): Predicted keypoint location. + gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. + mask (np.ndarray[N, K]): Visibility of the target. False for invisible + joints, and True for visible. Invisible joints will be ignored for + accuracy calculation. + + Returns: + float: Average end-point error. + """ + + distances = _calc_distances( + pred, gt, mask, + np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32)) + distance_valid = distances[distances != -1] + return distance_valid.sum() / max(1, len(distance_valid)) + + +def _taylor(heatmap, coord): + """Distribution aware coordinate decoding method. + + Note: + - heatmap height: H + - heatmap width: W + + Args: + heatmap (np.ndarray[H, W]): Heatmap of a particular joint type. + coord (np.ndarray[2,]): Coordinates of the predicted keypoints. + + Returns: + np.ndarray[2,]: Updated coordinates. + """ + H, W = heatmap.shape[:2] + px, py = int(coord[0]), int(coord[1]) + if 1 < px < W - 2 and 1 < py < H - 2: + dx = 0.5 * (heatmap[py][px + 1] - heatmap[py][px - 1]) + dy = 0.5 * (heatmap[py + 1][px] - heatmap[py - 1][px]) + dxx = 0.25 * ( + heatmap[py][px + 2] - 2 * heatmap[py][px] + heatmap[py][px - 2]) + dxy = 0.25 * ( + heatmap[py + 1][px + 1] - heatmap[py - 1][px + 1] - + heatmap[py + 1][px - 1] + heatmap[py - 1][px - 1]) + dyy = 0.25 * ( + heatmap[py + 2 * 1][px] - 2 * heatmap[py][px] + + heatmap[py - 2 * 1][px]) + derivative = np.array([[dx], [dy]]) + hessian = np.array([[dxx, dxy], [dxy, dyy]]) + if dxx * dyy - dxy**2 != 0: + hessianinv = np.linalg.inv(hessian) + offset = -hessianinv @ derivative + offset = np.squeeze(np.array(offset.T), axis=0) + coord += offset + return coord + + +def post_dark_udp(coords, batch_heatmaps, kernel=3): + """DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The + Devil is in the Details: Delving into Unbiased Data Processing for Human + Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + + Note: + - batch size: B + - num keypoints: K + - num persons: N + - height of heatmaps: H + - width of heatmaps: W + + B=1 for bottom_up paradigm where all persons share the same heatmap. + B=N for top_down paradigm where each person has its own heatmaps. + + Args: + coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. 
+ batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps + kernel (int): Gaussian kernel size (K) for modulation. + + Returns: + np.ndarray([N, K, 2]): Refined coordinates. + """ + if not isinstance(batch_heatmaps, np.ndarray): + batch_heatmaps = batch_heatmaps.cpu().numpy() + B, K, H, W = batch_heatmaps.shape + N = coords.shape[0] + assert (B == 1 or B == N) + for heatmaps in batch_heatmaps: + for heatmap in heatmaps: + cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) + np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) + np.log(batch_heatmaps, batch_heatmaps) + + batch_heatmaps_pad = np.pad( + batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), + mode='edge').flatten() + + index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) + index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) + index = index.astype(int).reshape(-1, 1) + i_ = batch_heatmaps_pad[index] + ix1 = batch_heatmaps_pad[index + 1] + iy1 = batch_heatmaps_pad[index + W + 2] + ix1y1 = batch_heatmaps_pad[index + W + 3] + ix1_y1_ = batch_heatmaps_pad[index - W - 3] + ix1_ = batch_heatmaps_pad[index - 1] + iy1_ = batch_heatmaps_pad[index - 2 - W] + + dx = 0.5 * (ix1 - ix1_) + dy = 0.5 * (iy1 - iy1_) + derivative = np.concatenate([dx, dy], axis=1) + derivative = derivative.reshape(N, K, 2, 1) + dxx = ix1 - 2 * i_ + ix1_ + dyy = iy1 - 2 * i_ + iy1_ + dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) + hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) + hessian = hessian.reshape(N, K, 2, 2) + hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) + coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() + return coords + + +def _gaussian_blur(heatmaps, kernel=11): + """Modulate heatmap distribution with Gaussian. + sigma = 0.3*((kernel_size-1)*0.5-1)+0.8 + sigma~=3 if k=17 + sigma=2 if k=11; + sigma~=1.5 if k=7; + sigma~=1 if k=3; + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + kernel (int): Gaussian kernel size (K) for modulation, which should + match the heatmap gaussian sigma when training. + K=17 for sigma=3 and k=11 for sigma=2. + + Returns: + np.ndarray ([N, K, H, W]): Modulated heatmap distribution. + """ + assert kernel % 2 == 1 + + border = (kernel - 1) // 2 + batch_size = heatmaps.shape[0] + num_joints = heatmaps.shape[1] + height = heatmaps.shape[2] + width = heatmaps.shape[3] + for i in range(batch_size): + for j in range(num_joints): + origin_max = np.max(heatmaps[i, j]) + dr = np.zeros((height + 2 * border, width + 2 * border), + dtype=np.float32) + dr[border:-border, border:-border] = heatmaps[i, j].copy() + dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) + heatmaps[i, j] = dr[border:-border, border:-border].copy() + heatmaps[i, j] *= origin_max / np.max(heatmaps[i, j]) + return heatmaps + + +def keypoints_from_regression(regression_preds, center, scale, img_size): + """Get final keypoint predictions from regression vectors and transform + them back to the image. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + regression_preds (np.ndarray[N, K, 2]): model prediction. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + img_size (list(img_width, img_height)): model input image size. + + Returns: + tuple: + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. 
+ - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + N, K, _ = regression_preds.shape + preds, maxvals = regression_preds, np.ones((N, K, 1), dtype=np.float32) + + preds = preds * img_size + + # Transform back to the image + for i in range(N): + preds[i] = transform_preds(preds[i], center[i], scale[i], img_size) + + return preds, maxvals + + +def keypoints_from_heatmaps(heatmaps, + center, + scale, + unbiased=False, + post_process='default', + kernel=11, + valid_radius_factor=0.0546875, + use_udp=False, + target_type='GaussianHeatmap'): + """Get final keypoint predictions from heatmaps and transform them back to + the image. + + Note: + - batch size: N + - num keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + post_process (str/None): Choice of methods to post-process + heatmaps. Currently supported: None, 'default', 'unbiased', + 'megvii'. + unbiased (bool): Option to use unbiased decoding. Mutually + exclusive with megvii. + Note: this arg is deprecated and unbiased=True can be replaced + by post_process='unbiased' + Paper ref: Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + kernel (int): Gaussian kernel size (K) for modulation, which should + match the heatmap gaussian sigma when training. + K=17 for sigma=3 and k=11 for sigma=2. + valid_radius_factor (float): The radius factor of the positive area + in classification heatmap for UDP. + use_udp (bool): Use unbiased data processing. + target_type (str): 'GaussianHeatmap' or 'CombinedTarget'. + GaussianHeatmap: Classification target with gaussian distribution. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Returns: + tuple: A tuple containing keypoint predictions and scores. + + - preds (np.ndarray[N, K, 2]): Predicted keypoint location in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. 
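+
+        Example (a minimal sketch with illustrative shapes and values; the
+        random heatmaps below only demonstrate the call signature)::
+
+            >>> import numpy as np
+            >>> heatmaps = np.random.rand(1, 17, 64, 48).astype(np.float32)
+            >>> center = np.array([[128., 128.]])  # bbox center (x, y)
+            >>> scale = np.array([[1.28, 1.28]])   # bbox size / 200
+            >>> preds, maxvals = keypoints_from_heatmaps(
+            ...     heatmaps, center, scale, post_process='default')
+            >>> preds.shape, maxvals.shape
+            ((1, 17, 2), (1, 17, 1))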
+ """ + # Avoid being affected + heatmaps = heatmaps.copy() + + # detect conflicts + if unbiased: + assert post_process not in [False, None, 'megvii'] + if post_process in ['megvii', 'unbiased']: + assert kernel > 0 + if use_udp: + assert not post_process == 'megvii' + + # normalize configs + if post_process is False: + warnings.warn( + 'post_process=False is deprecated, ' + 'please use post_process=None instead', DeprecationWarning) + post_process = None + elif post_process is True: + if unbiased is True: + warnings.warn( + 'post_process=True, unbiased=True is deprecated,' + " please use post_process='unbiased' instead", + DeprecationWarning) + post_process = 'unbiased' + else: + warnings.warn( + 'post_process=True, unbiased=False is deprecated, ' + "please use post_process='default' instead", + DeprecationWarning) + post_process = 'default' + elif post_process == 'default': + if unbiased is True: + warnings.warn( + 'unbiased=True is deprecated, please use ' + "post_process='unbiased' instead", DeprecationWarning) + post_process = 'unbiased' + + # start processing + if post_process == 'megvii': + heatmaps = _gaussian_blur(heatmaps, kernel=kernel) + + N, K, H, W = heatmaps.shape + if use_udp: + if target_type.lower() == 'GaussianHeatMap'.lower(): + preds, maxvals = _get_max_preds(heatmaps) + preds = post_dark_udp(preds, heatmaps, kernel=kernel) + elif target_type.lower() == 'CombinedTarget'.lower(): + for person_heatmaps in heatmaps: + for i, heatmap in enumerate(person_heatmaps): + kt = 2 * kernel + 1 if i % 3 == 0 else kernel + cv2.GaussianBlur(heatmap, (kt, kt), 0, heatmap) + # valid radius is in direct proportion to the height of heatmap. + valid_radius = valid_radius_factor * H + offset_x = heatmaps[:, 1::3, :].flatten() * valid_radius + offset_y = heatmaps[:, 2::3, :].flatten() * valid_radius + heatmaps = heatmaps[:, ::3, :] + preds, maxvals = _get_max_preds(heatmaps) + index = preds[..., 0] + preds[..., 1] * W + index += W * H * np.arange(0, N * K / 3) + index = index.astype(int).reshape(N, K // 3, 1) + preds += np.concatenate((offset_x[index], offset_y[index]), axis=2) + else: + raise ValueError('target_type should be either ' + "'GaussianHeatmap' or 'CombinedTarget'") + else: + preds, maxvals = _get_max_preds(heatmaps) + if post_process == 'unbiased': # alleviate biased coordinate + # apply Gaussian distribution modulation. + heatmaps = np.log( + np.maximum(_gaussian_blur(heatmaps, kernel), 1e-10)) + for n in range(N): + for k in range(K): + preds[n][k] = _taylor(heatmaps[n][k], preds[n][k]) + elif post_process is not None: + # add +/-0.25 shift to the predicted locations for higher acc. + for n in range(N): + for k in range(K): + heatmap = heatmaps[n][k] + px = int(preds[n][k][0]) + py = int(preds[n][k][1]) + if 1 < px < W - 1 and 1 < py < H - 1: + diff = np.array([ + heatmap[py][px + 1] - heatmap[py][px - 1], + heatmap[py + 1][px] - heatmap[py - 1][px] + ]) + preds[n][k] += np.sign(diff) * .25 + if post_process == 'megvii': + preds[n][k] += 0.5 + + # Transform back to the image + for i in range(N): + preds[i] = transform_preds( + preds[i], center[i], scale[i], [W, H], use_udp=use_udp) + + if post_process == 'megvii': + maxvals = maxvals / 255.0 + 0.5 + + return preds, maxvals + + +def keypoints_from_heatmaps3d(heatmaps, center, scale): + """Get final keypoint predictions from 3d heatmaps and transform them back + to the image. 
+ + Note: + - batch size: N + - num keypoints: K + - heatmap depth size: D + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + + Returns: + tuple: A tuple containing keypoint predictions and scores. + + - preds (np.ndarray[N, K, 3]): Predicted 3d keypoint location \ + in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + N, K, D, H, W = heatmaps.shape + preds, maxvals = _get_max_preds_3d(heatmaps) + # Transform back to the image + for i in range(N): + preds[i, :, :2] = transform_preds(preds[i, :, :2], center[i], scale[i], + [W, H]) + return preds, maxvals + + +def multilabel_classification_accuracy(pred, gt, mask, thr=0.5): + """Get multi-label classification accuracy. + + Note: + - batch size: N + - label number: L + + Args: + pred (np.ndarray[N, L, 2]): model predicted labels. + gt (np.ndarray[N, L, 2]): ground-truth labels. + mask (np.ndarray[N, 1] or np.ndarray[N, L] ): reliability of + ground-truth labels. + + Returns: + float: multi-label classification accuracy. + """ + # we only compute accuracy on the samples with ground-truth of all labels. + valid = (mask > 0).min(axis=1) if mask.ndim == 2 else (mask > 0) + pred, gt = pred[valid], gt[valid] + + if pred.shape[0] == 0: + acc = 0.0 # when no sample is with gt labels, set acc to 0. + else: + # The classification of a sample is regarded as correct + # only if it's correct for all labels. + acc = (((pred - thr) * (gt - thr)) > 0).all(axis=1).mean() + return acc diff --git a/main/transformer_utils/mmpose/core/fp16/__init__.py b/main/transformer_utils/mmpose/core/fp16/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5cb054810870626496ab4145446b17cf2c2e0b5d --- /dev/null +++ b/main/transformer_utils/mmpose/core/fp16/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .decorators import auto_fp16, force_fp32 +from .hooks import Fp16OptimizerHook, wrap_fp16_model +from .utils import cast_tensor_type + +__all__ = [ + 'auto_fp16', 'force_fp32', 'Fp16OptimizerHook', 'wrap_fp16_model', + 'cast_tensor_type' +] diff --git a/main/transformer_utils/mmpose/core/fp16/decorators.py b/main/transformer_utils/mmpose/core/fp16/decorators.py new file mode 100644 index 0000000000000000000000000000000000000000..2d70ddf533c069b26f08ef3a973328790843def5 --- /dev/null +++ b/main/transformer_utils/mmpose/core/fp16/decorators.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import warnings +from inspect import getfullargspec + +import torch + +from .utils import cast_tensor_type + + +def auto_fp16(apply_to=None, out_fp32=False): + """Decorator to enable fp16 training automatically. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If inputs arguments are fp32 tensors, they will + be converted to fp16 automatically. Arguments other than fp32 tensors are + ignored. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp32 (bool): Whether to convert the output back to fp32. 
+ + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp16 + >>> @auto_fp16() + >>> def forward(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp16 + >>> @auto_fp16(apply_to=('pred', )) + >>> def do_something(self, pred, others): + >>> pass + """ + + warnings.warn( + 'auto_fp16 in mmpose will be deprecated in the next release.' + 'Please use mmcv.runner.auto_fp16 instead (mmcv>=1.3.1).', + DeprecationWarning) + + def auto_fp16_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. + if not isinstance(args[0], torch.nn.Module): + raise TypeError('@auto_fp16 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + # NOTE: default args are not taken into consideration + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.float, torch.half)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = {} + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.float, torch.half) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp32: + output = cast_tensor_type(output, torch.half, torch.float) + return output + + return new_func + + return auto_fp16_wrapper + + +def force_fp32(apply_to=None, out_fp16=False): + """Decorator to convert input arguments to fp32 in force. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If there are some inputs that must be processed + in fp32 mode, then this decorator can handle it. If inputs arguments are + fp16 tensors, they will be converted to fp32 automatically. Arguments other + than fp16 tensors are ignored. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp16 (bool): Whether to convert the output back to fp16. + + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp32 + >>> @force_fp32() + >>> def loss(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp32 + >>> @force_fp32(apply_to=('pred', )) + >>> def post_process(self, pred, others): + >>> pass + """ + warnings.warn( + 'force_fp32 in mmpose will be deprecated in the next release.' + 'Please use mmcv.runner.force_fp32 instead (mmcv>=1.3.1).', + DeprecationWarning) + + def force_fp32_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. 
+ if not isinstance(args[0], torch.nn.Module): + raise TypeError('@force_fp32 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.half, torch.float)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = dict() + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.half, torch.float) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp16: + output = cast_tensor_type(output, torch.float, torch.half) + return output + + return new_func + + return force_fp32_wrapper diff --git a/main/transformer_utils/mmpose/core/fp16/hooks.py b/main/transformer_utils/mmpose/core/fp16/hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..74081a9b73b95ebb20cabf07cfaeab86cc874780 --- /dev/null +++ b/main/transformer_utils/mmpose/core/fp16/hooks.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +from mmcv.runner import OptimizerHook +from mmcv.utils import _BatchNorm + +from ..utils.dist_utils import allreduce_grads +from .utils import cast_tensor_type + + +class Fp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook. + + The steps of fp16 optimizer is as follows. + 1. Scale the loss value. + 2. BP in the fp16 model. + 2. Copy gradients from fp16 model to fp32 weights. + 3. Update fp32 weights. + 4. Copy updated parameters from fp32 weights to fp16 model. + + Refer to https://arxiv.org/abs/1710.03740 for more details. + + Args: + loss_scale (float): Scale factor multiplied with loss. + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.loss_scale = loss_scale + self.distributed = distributed + + def before_run(self, runner): + """Preparing steps before Mixed Precision Training. + + 1. Make a master copy of fp32 weights for optimization. + 2. Convert the main model from fp32 to fp16. + + Args: + runner (:obj:`mmcv.Runner`): The underlines training runner. 
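+
+        Example (a minimal construction sketch; the loss_scale and
+        distributed values are illustrative)::
+
+            >>> hook = Fp16OptimizerHook(loss_scale=512., distributed=False)
+            >>> # The hook is then registered on an mmcv runner, which calls
+            >>> # before_run() once before training starts.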
+ """ + # keep a copy of fp32 weights + runner.optimizer.param_groups = copy.deepcopy( + runner.optimizer.param_groups) + # convert model to fp16 + wrap_fp16_model(runner.model) + + @staticmethod + def copy_grads_to_fp32(fp16_net, fp32_weights): + """Copy gradients from fp16 model to fp32 weight copy.""" + for fp32_param, fp16_param in zip(fp32_weights, fp16_net.parameters()): + if fp16_param.grad is not None: + if fp32_param.grad is None: + fp32_param.grad = fp32_param.data.new(fp32_param.size()) + fp32_param.grad.copy_(fp16_param.grad) + + @staticmethod + def copy_params_to_fp16(fp16_net, fp32_weights): + """Copy updated params from fp32 weight copy to fp16 model.""" + for fp16_param, fp32_param in zip(fp16_net.parameters(), fp32_weights): + fp16_param.data.copy_(fp32_param.data) + + def after_train_iter(self, runner): + """Backward optimization steps for Mixed Precision Training. + + 1. Scale the loss by a scale factor. + 2. Backward the loss to obtain the gradients (fp16). + 3. Copy gradients from the model to the fp32 weight copy. + 4. Scale the gradients back and update the fp32 weight copy. + 5. Copy back the params from fp32 weight copy to the fp16 model. + + Args: + runner (:obj:`mmcv.Runner`): The underlines training runner. + """ + # clear grads of last iteration + runner.model.zero_grad() + runner.optimizer.zero_grad() + # scale the loss value + scaled_loss = runner.outputs['loss'] * self.loss_scale + scaled_loss.backward() + # copy fp16 grads in the model to fp32 params in the optimizer + fp32_weights = [] + for param_group in runner.optimizer.param_groups: + fp32_weights += param_group['params'] + self.copy_grads_to_fp32(runner.model, fp32_weights) + # allreduce grads + if self.distributed: + allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb) + # scale the gradients back + for param in fp32_weights: + if param.grad is not None: + param.grad.div_(self.loss_scale) + if self.grad_clip is not None: + self.clip_grads(fp32_weights) + # update fp32 params + runner.optimizer.step() + # copy fp32 params to the fp16 model + self.copy_params_to_fp16(runner.model, fp32_weights) + + +def wrap_fp16_model(model): + """Wrap the FP32 model to FP16. + + 1. Convert FP32 model to FP16. + 2. Remain some necessary layers to be FP32, e.g., normalization layers. + + Args: + model (nn.Module): Model in FP32. + """ + # convert model to fp16 + model.half() + # patch the normalization layers to make it work in fp32 mode + patch_norm_fp32(model) + # set `fp16_enabled` flag + for m in model.modules(): + if hasattr(m, 'fp16_enabled'): + m.fp16_enabled = True + + +def patch_norm_fp32(module): + """Recursively convert normalization layers from FP16 to FP32. + + Args: + module (nn.Module): The modules to be converted in FP16. + + Returns: + nn.Module: The converted module, the normalization layers have been + converted to FP32. + """ + if isinstance(module, (_BatchNorm, nn.GroupNorm)): + module.float() + module.forward = patch_forward_method(module.forward, torch.half, + torch.float) + for child in module.children(): + patch_norm_fp32(child) + return module + + +def patch_forward_method(func, src_type, dst_type, convert_output=True): + """Patch the forward method of a module. + + Args: + func (callable): The original forward method. + src_type (torch.dtype): Type of input arguments to be converted from. + dst_type (torch.dtype): Type of input arguments to be converted to. + convert_output (bool): Whether to convert the output back to src_type. 
+ + Returns: + callable: The patched forward method. + """ + + def new_forward(*args, **kwargs): + output = func(*cast_tensor_type(args, src_type, dst_type), + **cast_tensor_type(kwargs, src_type, dst_type)) + if convert_output: + output = cast_tensor_type(output, dst_type, src_type) + return output + + return new_forward diff --git a/main/transformer_utils/mmpose/core/fp16/utils.py b/main/transformer_utils/mmpose/core/fp16/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f1ec3d328328560c7959ae5e77621feb77692068 --- /dev/null +++ b/main/transformer_utils/mmpose/core/fp16/utils.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import abc + +import numpy as np +import torch + + +def cast_tensor_type(inputs, src_type, dst_type): + """Recursively convert Tensor in inputs from src_type to dst_type. + + Args: + inputs: Inputs that to be casted. + src_type (torch.dtype): Source type. + dst_type (torch.dtype): Destination type. + + Returns: + The same type with inputs, but all contained Tensors have been cast. + """ + if isinstance(inputs, torch.Tensor): + return inputs.to(dst_type) + elif isinstance(inputs, str): + return inputs + elif isinstance(inputs, np.ndarray): + return inputs + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ + k: cast_tensor_type(v, src_type, dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( + cast_tensor_type(item, src_type, dst_type) for item in inputs) + + return inputs diff --git a/main/transformer_utils/mmpose/core/optimizers/__init__.py b/main/transformer_utils/mmpose/core/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..906f67c231d9d33faee6c15f5c9b5582af6fdb19 --- /dev/null +++ b/main/transformer_utils/mmpose/core/optimizers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, + build_optimizer_constructor, build_optimizers) + +__all__ = [ + 'build_optimizers', 'build_optimizer_constructor', 'OPTIMIZERS', + 'OPTIMIZER_BUILDERS' +] diff --git a/main/transformer_utils/mmpose/core/optimizers/builder.py b/main/transformer_utils/mmpose/core/optimizers/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..cd2cf49133c57f28261b555d30a5cee18ae105af --- /dev/null +++ b/main/transformer_utils/mmpose/core/optimizers/builder.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import build_optimizer +from mmcv.runner.optimizer import OPTIMIZER_BUILDERS as MMCV_OPTIMIZER_BUILDERS +from mmcv.utils import Registry, build_from_cfg + +OPTIMIZERS = Registry('optimizers') +OPTIMIZER_BUILDERS = Registry( + 'optimizer builder', parent=MMCV_OPTIMIZER_BUILDERS) + + +def build_optimizer_constructor(cfg): + constructor_type = cfg.get('type') + if constructor_type in OPTIMIZER_BUILDERS: + return build_from_cfg(cfg, OPTIMIZER_BUILDERS) + elif constructor_type in MMCV_OPTIMIZER_BUILDERS: + return build_from_cfg(cfg, MMCV_OPTIMIZER_BUILDERS) + else: + raise KeyError(f'{constructor_type} is not registered ' + 'in the optimizer builder registry.') + + +def build_optimizers(model, cfgs): + """Build multiple optimizers from configs. + + If `cfgs` contains several dicts for optimizers, then a dict for each + constructed optimizers will be returned. + If `cfgs` only contains one optimizer config, the constructed optimizer + itself will be returned. 
+ + For example, + + 1) Multiple optimizer configs: + + .. code-block:: python + + optimizer_cfg = dict( + model1=dict(type='SGD', lr=lr), + model2=dict(type='SGD', lr=lr)) + + The return dict is + ``dict('model1': torch.optim.Optimizer, 'model2': torch.optim.Optimizer)`` + + 2) Single optimizer config: + + .. code-block:: python + + optimizer_cfg = dict(type='SGD', lr=lr) + + The return is ``torch.optim.Optimizer``. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + cfgs (dict): The config dict of the optimizer. + + Returns: + dict[:obj:`torch.optim.Optimizer`] | :obj:`torch.optim.Optimizer`: + The initialized optimizers. + """ + optimizers = {} + if hasattr(model, 'module'): + model = model.module + # determine whether 'cfgs' has several dicts for optimizers + if all(isinstance(v, dict) for v in cfgs.values()): + for key, cfg in cfgs.items(): + cfg_ = cfg.copy() + module = getattr(model, key) + optimizers[key] = build_optimizer(module, cfg_) + return optimizers + + return build_optimizer(model, cfgs) diff --git a/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py b/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab6a82548c046483b7c412cefa0762cdbc531f8 --- /dev/null +++ b/main/transformer_utils/mmpose/core/optimizers/layer_decay_optimizer_constructor.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import warnings + +from mmcv.runner import DefaultOptimizerConstructor, get_dist_info + +from mmpose.utils import get_root_logger +from .builder import OPTIMIZER_BUILDERS + + +def get_layer_id_for_convnext(var_name, max_layer_id): + """Get the layer id to set the different learning rates in ``layer_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_layer_id (int): Maximum number of backbone layers. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.downsample_layers'): + stage_id = int(var_name.split('.')[2]) + if stage_id == 0: + layer_id = 0 + elif stage_id == 1: + layer_id = 2 + elif stage_id == 2: + layer_id = 3 + elif stage_id == 3: + layer_id = max_layer_id + return layer_id + elif var_name.startswith('backbone.stages'): + stage_id = int(var_name.split('.')[2]) + block_id = int(var_name.split('.')[3]) + if stage_id == 0: + layer_id = 1 + elif stage_id == 1: + layer_id = 2 + elif stage_id == 2: + layer_id = 3 + block_id // 3 + elif stage_id == 3: + layer_id = max_layer_id + return layer_id + else: + return max_layer_id + 1 + + +def get_stage_id_for_convnext(var_name, max_stage_id): + """Get the stage id to set the different learning rates in ``stage_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_stage_id (int): Maximum number of backbone layers. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. 
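+
+        Example (the parameter names are illustrative ConvNeXt keys)::
+
+            >>> get_stage_id_for_convnext('backbone.downsample_layers.1.0.weight', 12)
+            0
+            >>> get_stage_id_for_convnext('backbone.stages.2.1.dwconv.weight', 12)
+            3
+            >>> get_stage_id_for_convnext('head.final_layer.weight', 12)
+            11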
+ """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.downsample_layers'): + return 0 + elif var_name.startswith('backbone.stages'): + stage_id = int(var_name.split('.')[2]) + return stage_id + 1 + else: + return max_stage_id - 1 + + +def get_layer_id_for_vit(var_name, max_layer_id): + """Get the layer id to set the different learning rates. + + Args: + var_name (str): The key of the model. + num_max_layer (int): Maximum number of backbone layers. + + Returns: + int: Returns the layer id of the key. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.patch_embed'): + return 0 + elif var_name.startswith('backbone.layers'): + layer_id = int(var_name.split('.')[2]) + return layer_id + 1 + else: + return max_layer_id - 1 + + +@OPTIMIZER_BUILDERS.register_module() +class LearningRateDecayOptimizerConstructor(DefaultOptimizerConstructor): + """Different learning rates are set for different layers of backbone. + + Note: Currently, this optimizer constructor is built for ConvNeXt, + BEiT and MAE. + """ + + def add_params(self, params, module, **kwargs): + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + """ + logger = get_root_logger() + + parameter_groups = {} + logger.info(f'self.paramwise_cfg is {self.paramwise_cfg}') + num_layers = self.paramwise_cfg.get('num_layers') + 2 + decay_rate = self.paramwise_cfg.get('decay_rate') + decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise') + logger.info('Build LearningRateDecayOptimizerConstructor ' + f'{decay_type} {decay_rate} - {num_layers}') + weight_decay = self.base_wd + for name, param in module.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith('.bias') or name in ( + 'pos_embed', 'cls_token'): + group_name = 'no_decay' + this_weight_decay = 0. 
+ else: + group_name = 'decay' + this_weight_decay = weight_decay + if 'layer_wise' in decay_type: + if 'ConvNeXt' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_convnext( + name, self.paramwise_cfg.get('num_layers')) + logger.info(f'set param {name} as id {layer_id}') + elif 'BEiT' in module.backbone.__class__.__name__ or \ + 'MAE' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_vit(name, num_layers) + logger.info(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + elif decay_type == 'stage_wise': + if 'ConvNeXt' in module.backbone.__class__.__name__: + layer_id = get_stage_id_for_convnext(name, num_layers) + logger.info(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + group_name = f'layer_{layer_id}_{group_name}' + + if group_name not in parameter_groups: + scale = decay_rate**(num_layers - layer_id - 1) + + parameter_groups[group_name] = { + 'weight_decay': this_weight_decay, + 'params': [], + 'param_names': [], + 'lr_scale': scale, + 'group_name': group_name, + 'lr': scale * self.base_lr, + } + + parameter_groups[group_name]['params'].append(param) + parameter_groups[group_name]['param_names'].append(name) + rank, _ = get_dist_info() + if rank == 0: + to_display = {} + for key in parameter_groups: + to_display[key] = { + 'param_names': parameter_groups[key]['param_names'], + 'lr_scale': parameter_groups[key]['lr_scale'], + 'lr': parameter_groups[key]['lr'], + 'weight_decay': parameter_groups[key]['weight_decay'], + } + logger.info(f'Param groups = {json.dumps(to_display, indent=2)}') + params.extend(parameter_groups.values()) + + +@OPTIMIZER_BUILDERS.register_module() +class LayerDecayOptimizerConstructor(LearningRateDecayOptimizerConstructor): + """Different learning rates are set for different layers of backbone. + + Note: Currently, this optimizer constructor is built for BEiT, + and it will be deprecated. + Please use ``LearningRateDecayOptimizerConstructor`` instead. + """ + + def __init__(self, optimizer_cfg, paramwise_cfg): + warnings.warn('DeprecationWarning: Original ' + 'LayerDecayOptimizerConstructor of BEiT ' + 'will be deprecated. Please use ' + 'LearningRateDecayOptimizerConstructor instead, ' + 'and set decay_type = layer_wise_vit in paramwise_cfg.') + paramwise_cfg.update({'decay_type': 'layer_wise_vit'}) + warnings.warn('DeprecationWarning: Layer_decay_rate will ' + 'be deleted, please use decay_rate instead.') + paramwise_cfg['decay_rate'] = paramwise_cfg.pop('layer_decay_rate') + super(LayerDecayOptimizerConstructor, + self).__init__(optimizer_cfg, paramwise_cfg) diff --git a/main/transformer_utils/mmpose/core/post_processing/__init__.py b/main/transformer_utils/mmpose/core/post_processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8076b799b9e405e7ac5a883aa3a6d5dcb84060b5 --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +from .nms import oks_iou, oks_nms, soft_oks_nms +from .one_euro_filter import OneEuroFilter +from .post_transforms import (affine_transform, flip_back, fliplr_joints, + fliplr_regression, get_affine_transform, + get_warp_matrix, rotate_point, transform_preds, + warp_affine_joints) +from .smoother import Smoother + +__all__ = [ + 'oks_nms', 'soft_oks_nms', 'affine_transform', 'rotate_point', 'flip_back', + 'fliplr_joints', 'fliplr_regression', 'transform_preds', + 'get_affine_transform', 'get_warp_matrix', 'warp_affine_joints', 'oks_iou', + 'OneEuroFilter', 'Smoother' +] diff --git a/main/transformer_utils/mmpose/core/post_processing/group.py b/main/transformer_utils/mmpose/core/post_processing/group.py new file mode 100644 index 0000000000000000000000000000000000000000..75499cb0bc4eb96f9255e9c02d20cf7a9c95c402 --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/group.py @@ -0,0 +1,418 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/princeton-vl/pose-ae-train/ +# Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License. +# ------------------------------------------------------------------------------ + +import numpy as np +import torch +from munkres import Munkres + +from mmpose.core.evaluation import post_dark_udp + + +def _py_max_match(scores): + """Apply munkres algorithm to get the best match. + + Args: + scores(np.ndarray): cost matrix. + + Returns: + np.ndarray: best match. + """ + m = Munkres() + tmp = m.compute(scores) + tmp = np.array(tmp).astype(int) + return tmp + + +def _match_by_tag(inp, params): + """Match joints by tags. Use Munkres algorithm to calculate the best match + for keypoints grouping. + + Note: + number of keypoints: K + max number of people in an image: M (M=30 by default) + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + inp(tuple): + tag_k (np.ndarray[KxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[KxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[KxM]): top k value of the + feature maps per keypoint. + params(Params): class Params(). + + Returns: + np.ndarray: result of pose groups. 
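+
+        Example (a toy 2x2 cost matrix, only to illustrate the underlying
+        Munkres assignment used by ``_py_max_match``; the values are made up)::
+
+            >>> scores = [[0.1, 0.9], [0.8, 0.2]]
+            >>> sorted(_py_max_match(scores).tolist())
+            [[0, 0], [1, 1]]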
+ """ + assert isinstance(params, _Params), 'params should be class _Params()' + + tag_k, loc_k, val_k = inp + + default_ = np.zeros((params.num_joints, 3 + tag_k.shape[2]), + dtype=np.float32) + + joint_dict = {} + tag_dict = {} + for i in range(params.num_joints): + idx = params.joint_order[i] + + tags = tag_k[idx] + joints = np.concatenate((loc_k[idx], val_k[idx, :, None], tags), 1) + mask = joints[:, 2] > params.detection_threshold + tags = tags[mask] # shape: [M, L] + joints = joints[mask] # shape: [M, 3 + L], 3: x, y, val + + if joints.shape[0] == 0: + continue + + if i == 0 or len(joint_dict) == 0: + for tag, joint in zip(tags, joints): + key = tag[0] + joint_dict.setdefault(key, np.copy(default_))[idx] = joint + tag_dict[key] = [tag] + else: + # shape: [M] + grouped_keys = list(joint_dict.keys()) + if params.ignore_too_much: + grouped_keys = grouped_keys[:params.max_num_people] + # shape: [M, L] + grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys] + + # shape: [M, M, L] + diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :] + # shape: [M, M] + diff_normed = np.linalg.norm(diff, ord=2, axis=2) + diff_saved = np.copy(diff_normed) + + if params.use_detection_val: + diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] + + num_added = diff.shape[0] + num_grouped = diff.shape[1] + + if num_added > num_grouped: + diff_normed = np.concatenate( + (diff_normed, + np.zeros((num_added, num_added - num_grouped), + dtype=np.float32) + 1e10), + axis=1) + + pairs = _py_max_match(diff_normed) + for row, col in pairs: + if (row < num_added and col < num_grouped + and diff_saved[row][col] < params.tag_threshold): + key = grouped_keys[col] + joint_dict[key][idx] = joints[row] + tag_dict[key].append(tags[row]) + else: + key = tags[row][0] + joint_dict.setdefault(key, np.copy(default_))[idx] = \ + joints[row] + tag_dict[key] = [tags[row]] + + joint_dict_keys = list(joint_dict.keys()) + if params.ignore_too_much: + # The new person joints beyond the params.max_num_people will be + # ignored, for the dict is in ordered when python > 3.6 version. + joint_dict_keys = joint_dict_keys[:params.max_num_people] + results = np.array([joint_dict[i] + for i in joint_dict_keys]).astype(np.float32) + return results + + +class _Params: + """A class of parameter. + + Args: + cfg(Config): config. + """ + + def __init__(self, cfg): + self.num_joints = cfg['num_joints'] + self.max_num_people = cfg['max_num_people'] + + self.detection_threshold = cfg['detection_threshold'] + self.tag_threshold = cfg['tag_threshold'] + self.use_detection_val = cfg['use_detection_val'] + self.ignore_too_much = cfg['ignore_too_much'] + + if self.num_joints == 17: + self.joint_order = [ + i - 1 for i in + [1, 2, 3, 4, 5, 6, 7, 12, 13, 8, 9, 10, 11, 14, 15, 16, 17] + ] + else: + self.joint_order = list(np.arange(self.num_joints)) + + +class HeatmapParser: + """The heatmap parser for post processing.""" + + def __init__(self, cfg): + self.params = _Params(cfg) + self.tag_per_joint = cfg['tag_per_joint'] + self.pool = torch.nn.MaxPool2d(cfg['nms_kernel'], 1, + cfg['nms_padding']) + self.use_udp = cfg.get('use_udp', False) + self.score_per_joint = cfg.get('score_per_joint', False) + + def nms(self, heatmaps): + """Non-Maximum Suppression for heatmaps. + + Args: + heatmap(torch.Tensor): Heatmaps before nms. + + Returns: + torch.Tensor: Heatmaps after nms. 
+ """ + + maxm = self.pool(heatmaps) + maxm = torch.eq(maxm, heatmaps).float() + heatmaps = heatmaps * maxm + + return heatmaps + + def match(self, tag_k, loc_k, val_k): + """Group keypoints to human poses in a batch. + + Args: + tag_k (np.ndarray[NxKxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[NxKxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[NxKxM]): top k value of the + feature maps per keypoint. + + Returns: + list + """ + + def _match(x): + return _match_by_tag(x, self.params) + + return list(map(_match, zip(tag_k, loc_k, val_k))) + + def top_k(self, heatmaps, tags): + """Find top_k values in an image. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + max number of people: M + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]) + tags (torch.Tensor[NxKxHxWxL]) + + Returns: + dict: A dict containing top_k values. + + - tag_k (np.ndarray[NxKxMxL]): + tag corresponding to the top k values of + feature map per keypoint. + - loc_k (np.ndarray[NxKxMx2]): + top k location of feature map per keypoint. + - val_k (np.ndarray[NxKxM]): + top k value of feature map per keypoint. + """ + heatmaps = self.nms(heatmaps) + N, K, H, W = heatmaps.size() + heatmaps = heatmaps.view(N, K, -1) + val_k, ind = heatmaps.topk(self.params.max_num_people, dim=2) + + tags = tags.view(tags.size(0), tags.size(1), W * H, -1) + if not self.tag_per_joint: + tags = tags.expand(-1, self.params.num_joints, -1, -1) + + tag_k = torch.stack( + [torch.gather(tags[..., i], 2, ind) for i in range(tags.size(3))], + dim=3) + + x = ind % W + y = ind // W + + ind_k = torch.stack((x, y), dim=3) + + results = { + 'tag_k': tag_k.cpu().numpy(), + 'loc_k': ind_k.cpu().numpy(), + 'val_k': val_k.cpu().numpy() + } + + return results + + @staticmethod + def adjust(results, heatmaps): + """Adjust the coordinates for better accuracy. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + + Args: + results (list(np.ndarray)): Keypoint predictions. + heatmaps (torch.Tensor[NxKxHxW]): Heatmaps. + """ + _, _, H, W = heatmaps.shape + for batch_id, people in enumerate(results): + for people_id, people_i in enumerate(people): + for joint_id, joint in enumerate(people_i): + if joint[2] > 0: + x, y = joint[0:2] + xx, yy = int(x), int(y) + tmp = heatmaps[batch_id][joint_id] + if tmp[min(H - 1, yy + 1), xx] > tmp[max(0, yy - 1), + xx]: + y += 0.25 + else: + y -= 0.25 + + if tmp[yy, min(W - 1, xx + 1)] > tmp[yy, + max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + results[batch_id][people_id, joint_id, + 0:2] = (x + 0.5, y + 0.5) + return results + + @staticmethod + def refine(heatmap, tag, keypoints, use_udp=False): + """Given initial keypoint predictions, we identify missing joints. + + Note: + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmap: np.ndarray(K, H, W). + tag: np.ndarray(K, H, W) | np.ndarray(K, H, W, L) + keypoints: np.ndarray of size (K, 3 + L) + last dim is (x, y, score, tag). + use_udp: bool-unbiased data processing + + Returns: + np.ndarray: The refined keypoints. 
+ """ + + K, H, W = heatmap.shape + if len(tag.shape) == 3: + tag = tag[..., None] + + tags = [] + for i in range(K): + if keypoints[i, 2] > 0: + # save tag value of detected keypoint + x, y = keypoints[i][:2].astype(int) + x = np.clip(x, 0, W - 1) + y = np.clip(y, 0, H - 1) + tags.append(tag[i, y, x]) + + # mean tag of current detected people + prev_tag = np.mean(tags, axis=0) + results = [] + + for _heatmap, _tag in zip(heatmap, tag): + # distance of all tag values with mean tag of + # current detected people + distance_tag = (((_tag - + prev_tag[None, None, :])**2).sum(axis=2)**0.5) + norm_heatmap = _heatmap - np.round(distance_tag) + + # find maximum position + y, x = np.unravel_index(np.argmax(norm_heatmap), _heatmap.shape) + xx = x.copy() + yy = y.copy() + # detection score at maximum position + val = _heatmap[y, x] + if not use_udp: + # offset by 0.5 + x += 0.5 + y += 0.5 + + # add a quarter offset + if _heatmap[yy, min(W - 1, xx + 1)] > _heatmap[yy, max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + + if _heatmap[min(H - 1, yy + 1), xx] > _heatmap[max(0, yy - 1), xx]: + y += 0.25 + else: + y -= 0.25 + + results.append((x, y, val)) + results = np.array(results) + + if results is not None: + for i in range(K): + # add keypoint if it is not detected + if results[i, 2] > 0 and keypoints[i, 2] == 0: + keypoints[i, :3] = results[i, :3] + + return keypoints + + def parse(self, heatmaps, tags, adjust=True, refine=True): + """Group keypoints into poses given heatmap and tag. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps. + tags (torch.Tensor[NxKxHxWxL]): model output tagmaps. + + Returns: + tuple: A tuple containing keypoint grouping results. + + - results (list(np.ndarray)): Pose results. + - scores (list/list(np.ndarray)): Score of people. + """ + results = self.match(**self.top_k(heatmaps, tags)) + + if adjust: + if self.use_udp: + for i in range(len(results)): + if results[i].shape[0] > 0: + results[i][..., :2] = post_dark_udp( + results[i][..., :2].copy(), heatmaps[i:i + 1, :]) + else: + results = self.adjust(results, heatmaps) + + if self.score_per_joint: + scores = [i[:, 2] for i in results[0]] + else: + scores = [i[:, 2].mean() for i in results[0]] + + if refine: + results = results[0] + # for every detected person + for i in range(len(results)): + heatmap_numpy = heatmaps[0].cpu().numpy() + tag_numpy = tags[0].cpu().numpy() + if not self.tag_per_joint: + tag_numpy = np.tile(tag_numpy, + (self.params.num_joints, 1, 1, 1)) + results[i] = self.refine( + heatmap_numpy, tag_numpy, results[i], use_udp=self.use_udp) + results = [results] + + return results, scores diff --git a/main/transformer_utils/mmpose/core/post_processing/nms.py b/main/transformer_utils/mmpose/core/post_processing/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..86a0ab35e0e26d27bb0bb55071018ffc5ac9af1d --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/nms.py @@ -0,0 +1,207 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np + + +def nms(dets, thr): + """Greedily select boxes with high confidence and overlap <= thr. 
+ + Args: + dets: [[x1, y1, x2, y2, score]]. + thr: Retain overlap < thr. + + Returns: + list: Indexes to keep. + """ + if len(dets) == 0: + return [] + + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thr)[0] + order = order[inds + 1] + + return keep + + +def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None): + """Calculate oks ious. + + Args: + g: Ground truth keypoints. + d: Detected keypoints. + a_g: Area of the ground truth object. + a_d: Area of the detected object. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. + + Returns: + list: The oks ious. + """ + if sigmas is None: + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + ious = np.zeros(len(d), dtype=np.float32) + for n_d in range(0, len(d)): + xd = d[n_d, 0::3] + yd = d[n_d, 1::3] + vd = d[n_d, 2::3] + dx = xd - xg + dy = yd - yg + e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 + if vis_thr is not None: + ind = list(vg > vis_thr) and list(vd > vis_thr) + e = e[ind] + ious[n_d] = np.sum(np.exp(-e)) / len(e) if len(e) != 0 else 0.0 + return ious + + +def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False): + """OKS NMS implementations. + + Args: + kpts_db: keypoints. + thr: Retain overlap < thr. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. + score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + inds = np.where(oks_ovr <= thr)[0] + order = order[inds + 1] + + keep = np.array(keep) + + return keep + + +def _rescore(overlap, scores, thr, type='gaussian'): + """Rescoring mechanism gaussian or linear. + + Args: + overlap: calculated ious + scores: target scores. + thr: retain oks overlap < thr. + type: 'gaussian' or 'linear' + + Returns: + np.ndarray: indexes to keep + """ + assert len(overlap) == len(scores) + assert type in ['gaussian', 'linear'] + + if type == 'linear': + inds = np.where(overlap >= thr)[0] + scores[inds] = scores[inds] * (1 - overlap[inds]) + else: + scores = scores * np.exp(-overlap**2 / thr) + + return scores + + +def soft_oks_nms(kpts_db, + thr, + max_dets=20, + sigmas=None, + vis_thr=None, + score_per_joint=False): + """Soft OKS NMS implementations. + + Args: + kpts_db + thr: retain oks overlap < thr. + max_dets: max number of detections to keep. 
+ sigmas: Keypoint labelling uncertainty. + score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + scores = scores[order] + + keep = np.zeros(max_dets, dtype=np.intp) + keep_cnt = 0 + while len(order) > 0 and keep_cnt < max_dets: + i = order[0] + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + order = order[1:] + scores = _rescore(oks_ovr, scores[1:], thr) + + tmp = scores.argsort()[::-1] + order = order[tmp] + scores = scores[tmp] + + keep[keep_cnt] = i + keep_cnt += 1 + + keep = keep[:keep_cnt] + + return keep diff --git a/main/transformer_utils/mmpose/core/post_processing/one_euro_filter.py b/main/transformer_utils/mmpose/core/post_processing/one_euro_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..325466522dbcbd5f2cdf85276a94269466fe741f --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/one_euro_filter.py @@ -0,0 +1,113 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HoBeom/OneEuroFilter-Numpy +# Original licence: Copyright (c) HoBeom Jeon, under the MIT License. +# ------------------------------------------------------------------------------ +import warnings +from time import time + +import numpy as np + + +def smoothing_factor(t_e, cutoff): + r = 2 * np.pi * cutoff * t_e + return r / (r + 1) + + +def exponential_smoothing(a, x, x_prev): + return a * x + (1 - a) * x_prev + + +class OneEuroFilter: + + def __init__(self, + x0, + dx0=0.0, + min_cutoff=1.7, + beta=0.3, + d_cutoff=30.0, + fps=None): + """One Euro Filter for keypoints smoothing. + + Args: + x0 (np.ndarray[K, 2]): Initialize keypoints value + dx0 (float): 0.0 + min_cutoff (float): parameter for one euro filter + beta (float): parameter for one euro filter + d_cutoff (float): Input data FPS + fps (float): Video FPS for video inference + """ + warnings.warn( + 'OneEuroFilter from ' + '`mmpose/core/post_processing/one_euro_filter.py` will ' + 'be deprecated in the future. Please use Smoother' + '(`mmpose/core/post_processing/smoother.py`) with ' + 'OneEuroFilter (`mmpose/core/post_processing/temporal_' + 'filters/one_euro_filter.py`).', DeprecationWarning) + + # The parameters. + self.data_shape = x0.shape + self.min_cutoff = np.full(x0.shape, min_cutoff) + self.beta = np.full(x0.shape, beta) + self.d_cutoff = np.full(x0.shape, d_cutoff) + # Previous values. + self.x_prev = x0.astype(np.float32) + self.dx_prev = np.full(x0.shape, dx0) + self.mask_prev = np.ma.masked_where(x0 <= 0, x0) + self.realtime = True + if fps is None: + # Using in realtime inference + self.t_e = None + self.skip_frame_factor = d_cutoff + self.fps = d_cutoff + else: + # fps using video inference + self.realtime = False + self.fps = float(fps) + self.d_cutoff = np.full(x0.shape, self.fps) + + self.t_prev = time() + + def __call__(self, x, t_e=1.0): + """Compute the filtered signal. + + Hyper-parameters (cutoff, beta) are from `VNect + `__ . 
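+
+        Example (offline smoothing of one frame; the shapes, values and fps
+        are illustrative)::
+
+            >>> kpts0 = np.random.rand(17, 2).astype(np.float32)
+            >>> smooth_filter = OneEuroFilter(kpts0, fps=30)
+            >>> kpts1 = np.random.rand(17, 2).astype(np.float32)
+            >>> smoothed = smooth_filter(kpts1)
+            >>> smoothed.shape
+            (17, 2)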
+ + Realtime Camera fps (d_cutoff) default 30.0 + + Args: + x (np.ndarray[K, 2]): keypoints results in frame + t_e (Optional): video skip frame count for posetrack + evaluation + """ + assert x.shape == self.data_shape + + t = 0 + if self.realtime: + t = time() + t_e = (t - self.t_prev) * self.skip_frame_factor + t_e = np.full(x.shape, t_e) + + # missing keypoints mask + mask = np.ma.masked_where(x <= 0, x) + + # The filtered derivative of the signal. + a_d = smoothing_factor(t_e / self.fps, self.d_cutoff) + dx = (x - self.x_prev) / t_e + dx_hat = exponential_smoothing(a_d, dx, self.dx_prev) + + # The filtered signal. + cutoff = self.min_cutoff + self.beta * np.abs(dx_hat) + a = smoothing_factor(t_e / self.fps, cutoff) + x_hat = exponential_smoothing(a, x, self.x_prev) + + # missing keypoints remove + np.copyto(x_hat, -10, where=mask.mask) + + # Memorize the previous values. + self.x_prev = x_hat + self.dx_prev = dx_hat + self.t_prev = t + self.mask_prev = mask + + return x_hat diff --git a/main/transformer_utils/mmpose/core/post_processing/post_transforms.py b/main/transformer_utils/mmpose/core/post_processing/post_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..93063fb1c1a60519a527037795654b0278a880e4 --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/post_transforms.py @@ -0,0 +1,366 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import math + +import cv2 +import numpy as np +import torch + + +def fliplr_joints(joints_3d, joints_3d_visible, img_width, flip_pairs): + """Flip human joints horizontally. + + Note: + - num_keypoints: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints. + img_width (int): Image width. + flip_pairs (list[tuple]): Pairs of keypoints which are mirrored + (for example, left ear and right ear). + + Returns: + tuple: Flipped human joints. + + - joints_3d_flipped (np.ndarray([K, 3])): Flipped joints. + - joints_3d_visible_flipped (np.ndarray([K, 1])): Joint visibility. + """ + + assert len(joints_3d) == len(joints_3d_visible) + assert img_width > 0 + + joints_3d_flipped = joints_3d.copy() + joints_3d_visible_flipped = joints_3d_visible.copy() + + # Swap left-right parts + for left, right in flip_pairs: + joints_3d_flipped[left, :] = joints_3d[right, :] + joints_3d_flipped[right, :] = joints_3d[left, :] + + joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] + joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] + + # Flip horizontally + joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0] + joints_3d_flipped = joints_3d_flipped * joints_3d_visible_flipped + + return joints_3d_flipped, joints_3d_visible_flipped + + +def fliplr_regression(regression, + flip_pairs, + center_mode='static', + center_x=0.5, + center_index=0): + """Flip human joints horizontally. + + Note: + - batch_size: N + - num_keypoint: K + + Args: + regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K + is the joint number and C is the dimension. Example shapes are: + + - [N, K, C]: a batch of keypoints where N is the batch size. + - [N, T, K, C]: a batch of pose sequences, where T is the frame + number. 
+ flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + center_mode (str): The mode to set the center location on the x-axis + to flip around. Options are: + + - static: use a static x value (see center_x also) + - root: use a root joint (see center_index also) + center_x (float): Set the x-axis location of the flip center. Only used + when center_mode=static. + center_index (int): Set the index of the root joint, whose x location + will be used as the flip center. Only used when center_mode=root. + + Returns: + np.ndarray([..., K, C]): Flipped joints. + """ + assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}' + + allowed_center_mode = {'static', 'root'} + assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \ + f'{center_mode}, allowed choices are {allowed_center_mode}' + + if center_mode == 'static': + x_c = center_x + elif center_mode == 'root': + assert regression.shape[-2] > center_index + x_c = regression[..., center_index:center_index + 1, 0] + + regression_flipped = regression.copy() + # Swap left-right parts + for left, right in flip_pairs: + regression_flipped[..., left, :] = regression[..., right, :] + regression_flipped[..., right, :] = regression[..., left, :] + + # Flip horizontally + regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0] + return regression_flipped + + +def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'): + """Flip the flipped heatmaps back to the original form. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained + from the flipped images. + flip_pairs (list[tuple()): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + target_type (str): GaussianHeatmap or CombinedTarget + + Returns: + np.ndarray: heatmaps that flipped back to the original image + """ + assert output_flipped.ndim == 4, \ + 'output_flipped should be [batch_size, num_keypoints, height, width]' + shape_ori = output_flipped.shape + channels = 1 + if target_type.lower() == 'CombinedTarget'.lower(): + channels = 3 + output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] + output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, + shape_ori[2], shape_ori[3]) + output_flipped_back = output_flipped.copy() + + # Swap left-right parts + for left, right in flip_pairs: + output_flipped_back[:, left, ...] = output_flipped[:, right, ...] + output_flipped_back[:, right, ...] = output_flipped[:, left, ...] + output_flipped_back = output_flipped_back.reshape(shape_ori) + # Flip horizontally + output_flipped_back = output_flipped_back[..., ::-1] + return output_flipped_back + + +def transform_preds(coords, center, scale, output_size, use_udp=False): + """Get final keypoint predictions from heatmaps and apply scaling and + translation to map them back to the image. + + Note: + num_keypoints: K + + Args: + coords (np.ndarray[K, ndims]): + + * If ndims=2, corrds are predicted keypoint location. + * If ndims=4, corrds are composed of (x, y, scores, tags) + * If ndims=5, corrds are composed of (x, y, scores, tags, + flipped_tags) + + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. 
+ use_udp (bool): Use unbiased data processing + + Returns: + np.ndarray: Predicted coordinates in the images. + """ + assert coords.shape[1] in (2, 4, 5) + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + + # Recover the scale which is normalized by a factor of 200. + scale = scale * 200.0 + + if use_udp: + scale_x = scale[0] / (output_size[0] - 1.0) + scale_y = scale[1] / (output_size[1] - 1.0) + else: + scale_x = scale[0] / output_size[0] + scale_y = scale[1] / output_size[1] + + target_coords = np.ones_like(coords) + target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 + target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 + + return target_coords + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. + + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + + # pixel_std is 200. + scale_tmp = scale * 200.0 + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, trans_mat): + """Apply an affine transformation to the points. + + Args: + pt (np.ndarray): a 2 dimensional point to be transformed + trans_mat (np.ndarray): 2x3 matrix of an affine transform + + Returns: + np.ndarray: Transformed points. + """ + assert len(pt) == 2 + new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.]) + + return new_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. 
+ """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def get_warp_matrix(theta, size_input, size_dst, size_target): + """Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + np.ndarray: A matrix for transformation. + """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = math.cos(theta) * scale_x + matrix[0, 1] = -math.sin(theta) * scale_x + matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) + + 0.5 * size_input[1] * math.sin(theta) + + 0.5 * size_target[0]) + matrix[1, 0] = math.sin(theta) * scale_y + matrix[1, 1] = math.cos(theta) * scale_y + matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) - + 0.5 * size_input[1] * math.cos(theta) + + 0.5 * size_target[1]) + return matrix + + +def warp_affine_joints(joints, mat): + """Apply affine transformation defined by the transform matrix on the + joints. + + Args: + joints (np.ndarray[..., 2]): Origin coordinate of joints. + mat (np.ndarray[3, 2]): The affine matrix. + + Returns: + np.ndarray[..., 2]: Result coordinate of joints. + """ + joints = np.array(joints) + shape = joints.shape + joints = joints.reshape(-1, 2) + return np.dot( + np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), + mat.T).reshape(shape) + + +def affine_transform_torch(pts, t): + npts = pts.shape[0] + pts_homo = torch.cat([pts, torch.ones(npts, 1, device=pts.device)], dim=1) + out = torch.mm(t, torch.t(pts_homo)) + return torch.t(out[:2, :]) diff --git a/main/transformer_utils/mmpose/core/post_processing/smoother.py b/main/transformer_utils/mmpose/core/post_processing/smoother.py new file mode 100644 index 0000000000000000000000000000000000000000..6b57768c03b48ff84877acbceb6e27b82832c04d --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/smoother.py @@ -0,0 +1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from typing import Dict, Union + +import numpy as np +from mmcv import Config, is_seq_of + +from mmpose.core.post_processing.temporal_filters import build_filter + + +class Smoother(): + """Smoother to apply temporal smoothing on pose estimation results with a + filter. + + Note: + T: The temporal length of the pose sequence + K: The keypoint number of each target + C: The keypoint coordinate dimension + + Args: + filter_cfg (dict | str): The filter config. See example config files in + `configs/_base_/filters/` for details. Alternatively a config file + path can be accepted and the config will be loaded. + keypoint_dim (int): The keypoint coordinate dimension, which is + also indicated as C. Default: 2 + keypoint_key (str): The dict key of the keypoints in the pose results. 
+ Default: 'keypoints' + Example: + >>> import numpy as np + >>> # Build dummy pose result + >>> results = [] + >>> for t in range(10): + >>> results_t = [] + >>> for track_id in range(2): + >>> result = { + >>> 'track_id': track_id, + >>> 'keypoints': np.random.rand(17, 3) + >>> } + >>> results_t.append(result) + >>> results.append(results_t) + >>> # Example 1: Smooth multi-frame pose results offline. + >>> filter_cfg = dict(type='GaussianFilter', window_size=3) + >>> smoother = Smoother(filter_cfg, keypoint_dim=2) + >>> smoothed_results = smoother.smooth(results) + >>> # Example 2: Smooth pose results online frame-by-frame + >>> filter_cfg = dict(type='GaussianFilter', window_size=3) + >>> smoother = Smoother(filter_cfg, keypoint_dim=2) + >>> for result_t in results: + >>> smoothed_result_t = smoother.smooth(result_t) + """ + + def __init__(self, + filter_cfg: Union[Dict, str], + keypoint_dim: int = 2, + keypoint_key: str = 'keypoints'): + if isinstance(filter_cfg, str): + filter_cfg = Config.fromfile(filter_cfg).filter_cfg + self.filter_cfg = filter_cfg + self._filter = build_filter(filter_cfg) + self.keypoint_dim = keypoint_dim + self.key = keypoint_key + self.padding_size = self._filter.window_size - 1 + self.history = {} + + def _get_filter(self): + fltr = self._filter + if not fltr.shareable: + # If the filter is not shareable, build a new filter for the next + # requires + self._filter = build_filter(self.filter_cfg) + return fltr + + def _collate_pose(self, results): + """Collate the pose results to pose sequences. + + Args: + results (list[list[dict]]): The pose results of multiple frames. + + Returns: + dict[str, np.ndarray]: A dict of collated pose sequences, where + the key is the track_id (in untracked scenario, the target index + will be used as the track_id), and the value is the pose sequence + in an array of shape [T, K, C] + """ + + if self._has_track_id(results): + # If the results have track_id, use it as the target indicator + results = [{res['track_id']: res + for res in results_t} for results_t in results] + track_ids = results[0].keys() + + for t, results_t in enumerate(results[1:]): + if results_t.keys() != track_ids: + raise ValueError(f'Inconsistent track ids in frame {t+1}') + + collated = { + id: np.stack([ + results_t[id][self.key][:, :self.keypoint_dim] + for results_t in results + ]) + for id in track_ids + } + else: + # If the results don't have track_id, use the target index + # as the target indicator + n_target = len(results[0]) + for t, results_t in enumerate(results[1:]): + if len(results_t) != n_target: + raise ValueError( + f'Inconsistent target number in frame {t+1}: ' + f'{len(results_t)} vs {n_target}') + + collated = { + id: np.stack([ + results_t[id][self.key][:, :self.keypoint_dim] + for results_t in results + ]) + for id in range(n_target) + } + + return collated + + def _scatter_pose(self, results, poses): + """Scatter the smoothed pose sequences and use them to update the pose + results. 
+ + Args: + results (list[list[dict]]): The original pose results + poses (dict[str, np.ndarray]): The smoothed pose sequences + + Returns: + list[list[dict]]: The updated pose results + """ + updated_results = [] + for t, results_t in enumerate(results): + updated_results_t = [] + if self._has_track_id(results): + id2result = ((result['track_id'], result) + for result in results_t) + else: + id2result = enumerate(results_t) + + for track_id, result in id2result: + result = copy.deepcopy(result) + result[self.key][:, :self.keypoint_dim] = poses[track_id][t] + updated_results_t.append(result) + + updated_results.append(updated_results_t) + return updated_results + + @staticmethod + def _has_track_id(results): + """Check if the pose results contain track_id.""" + return 'track_id' in results[0][0] + + def smooth(self, results): + """Apply temporal smoothing on pose estimation sequences. + + Args: + results (list[dict] | list[list[dict]]): The pose results of a + single frame (non-nested list) or multiple frames (nested + list). The result of each target is a dict, which should + contains: + + - track_id (optional, Any): The track ID of the target + - keypoints (np.ndarray): The keypoint coordinates in [K, C] + + Returns: + (list[dict] | list[list[dict]]): Temporal smoothed pose results, + which has the same data structure as the input's. + """ + + # Check if input is empty + if not (results) or not (results[0]): + warnings.warn('Smoother received empty result.') + return results + + # Check input is single frame or sequence + if is_seq_of(results, dict): + single_frame = True + results = [results] + else: + assert is_seq_of(results, list) + single_frame = False + + # Get temporal length of input + T = len(results) + + # Collate the input results to pose sequences + poses = self._collate_pose(results) + + # Smooth the pose sequence of each target + smoothed_poses = {} + update_history = {} + for track_id, pose in poses.items(): + if track_id in self.history: + # For tracked target, get its filter and pose history + pose_history, pose_filter = self.history[track_id] + if self.padding_size > 0: + # Pad the pose sequence with pose history + pose = np.concatenate((pose_history, pose), axis=0) + else: + # For new target, build a new filter + pose_filter = self._get_filter() + + # Update the history information + if self.padding_size > 0: + pose_history = pose[-self.padding_size:].copy() + else: + pose_history = None + update_history[track_id] = (pose_history, pose_filter) + + # Smooth the pose sequence with the filter + smoothed_pose = pose_filter(pose) + smoothed_poses[track_id] = smoothed_pose[-T:] + + self.history = update_history + + # Scatter the pose sequences back to the format of results + smoothed_results = self._scatter_pose(results, smoothed_poses) + + # If the input is single frame, remove the nested list to keep the + # output structure consistent with the input's + if single_frame: + smoothed_results = smoothed_results[0] + return smoothed_results diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/__init__.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8aea62513b14fdb6ac740c06e82683a1e27363db --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
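+# Illustrative usage sketch (values are assumptions, not recommended defaults).
+# This package exposes a small registry of temporal filters; a filter is built
+# from a plain config dict via ``build_filter`` and applied to a pose sequence
+# of shape [T, K, C]:
+#
+#     import numpy as np
+#     from mmpose.core.post_processing.temporal_filters import build_filter
+#
+#     filter_cfg = dict(type='GaussianFilter', window_size=11, sigma=4.0)
+#     smooth_fn = build_filter(filter_cfg)   # -> GaussianFilter instance
+#     pose_seq = np.random.rand(30, 17, 2)   # [T, K, C] keypoint sequence
+#     smoothed = smooth_fn(pose_seq)         # same [T, K, C] shape
+#
+# Each filter is also registered under a short alias ('gaussian', 'oneeuro',
+# 'savgol', 'smoothnet'), which can be used as ``type`` interchangeably with
+# the class name.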
+from .builder import build_filter +from .gaussian_filter import GaussianFilter +from .one_euro_filter import OneEuroFilter +from .savizky_golay_filter import SavizkyGolayFilter +from .smoothnet_filter import SmoothNetFilter + +__all__ = [ + 'build_filter', 'GaussianFilter', 'OneEuroFilter', 'SavizkyGolayFilter', + 'SmoothNetFilter' +] diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..adb914c5222db967c9cdb56fa9f469ff47792f79 --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/builder.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import Registry + +FILTERS = Registry('filters') + + +def build_filter(cfg): + """Build filters function.""" + return FILTERS.build(cfg) diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/filter.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/filter.py new file mode 100644 index 0000000000000000000000000000000000000000..6c6ce0127092235c370f8e398751884f09a18bf5 --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/filter.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + + +class TemporalFilter(metaclass=ABCMeta): + """Base class of temporal filter. + + A subclass should implement the method __call__(). + + Parameters: + window_size (int): the size of the sliding window. + """ + + # If the filter can be shared by multiple humans or targets + _shareable: bool = True + + def __init__(self, window_size=1): + self._window_size = window_size + + @property + def window_size(self): + return self._window_size + + @property + def shareable(self): + return self._shareable + + @abstractmethod + def __call__(self, x): + """Apply filter to a pose sequence. + + Note: + T: The temporal length of the pose sequence + K: The keypoint number of each target + C: The keypoint coordinate dimension + + Args: + x (np.ndarray): input pose sequence in shape [T, K, C] + + Returns: + np.ndarray: Smoothed pose sequence in shape [T, K, C] + """ diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/gaussian_filter.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/gaussian_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..b737cdb15aeb9985c0666afeb26e919893343262 --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/gaussian_filter.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from scipy.ndimage.filters import gaussian_filter1d +from scipy.signal import medfilt + +from .builder import FILTERS +from .filter import TemporalFilter + + +@FILTERS.register_module(name=['GaussianFilter', 'gaussian']) +class GaussianFilter(TemporalFilter): + """Apply median filter and then gaussian filter. + + Adapted from: + https://github.com/akanazawa/human_dynamics/blob/mas + ter/src/util/smooth_bbox.py. + + Args: + window_size (int): The size of the filter window (i.e., the number + of coefficients). window_length must be a positive odd integer. + Default: 11 + sigma (float): Sigma for gaussian smoothing. 
Default: 4.0 + """ + + def __init__(self, window_size: int = 11, sigma: float = 4.0): + super().__init__(window_size) + assert window_size % 2 == 1, ( + 'The window size of GaussianFilter should' + f'be odd, but got {window_size}') + self.sigma = sigma + + def __call__(self, x: np.ndarray): + + assert x.ndim == 3, ('Input should be an array with shape [T, K, C]' + f', but got invalid shape {x.shape}') + + T = x.shape[0] + if T < self.window_size: + pad_width = [(self.window_size - T, 0), (0, 0), (0, 0)] + x = np.pad(x, pad_width, mode='edge') + smoothed = medfilt(x, (self.window_size, 1, 1)) + + smoothed = gaussian_filter1d(smoothed, self.sigma, axis=0) + return smoothed[-T:] diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/one_euro_filter.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/one_euro_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..b954a97fd79543f243a087510a20c4e0037b9ef5 --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/one_euro_filter.py @@ -0,0 +1,113 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HoBeom/OneEuroFilter-Numpy +# Original licence: Copyright (c) HoBeom Jeon, under the MIT License. +# ------------------------------------------------------------------------------ +import math + +import numpy as np + +from .builder import FILTERS +from .filter import TemporalFilter + + +def smoothing_factor(t_e, cutoff): + r = 2 * math.pi * cutoff * t_e + return r / (r + 1) + + +def exponential_smoothing(a, x, x_prev): + return a * x + (1 - a) * x_prev + + +class OneEuro: + + def __init__(self, t0, x0, dx0, min_cutoff, beta, d_cutoff=1.0): + super(OneEuro, self).__init__() + """Initialize the one euro filter.""" + # The parameters. + self.min_cutoff = float(min_cutoff) + self.beta = float(beta) + self.d_cutoff = float(d_cutoff) + # Previous values. + self.x_prev = x0 + self.dx_prev = dx0 + self.t_prev = t0 + + def __call__(self, x, t=None): + """Compute the filtered signal.""" + + if t is None: + # Assume input is feed frame by frame if not specified + t = self.t_prev + 1 + + t_e = t - self.t_prev + + # The filtered derivative of the signal. + a_d = smoothing_factor(t_e, self.d_cutoff) # [k, c] + dx = (x - self.x_prev) / t_e + dx_hat = exponential_smoothing(a_d, dx, self.dx_prev) + + # The filtered signal. + cutoff = self.min_cutoff + self.beta * np.abs(dx_hat) + a = smoothing_factor(t_e, cutoff) + x_hat = exponential_smoothing(a, x, self.x_prev) + # Memorize the previous values. + self.x_prev = x_hat + self.dx_prev = dx_hat + self.t_prev = t + return x_hat + + +@FILTERS.register_module(name=['OneEuroFilter', 'oneeuro']) +class OneEuroFilter(TemporalFilter): + """Oneeuro filter, source code: https://github.com/mkocabas/VIBE/blob/c0 + c3f77d587351c806e901221a9dc05d1ffade4b/lib/utils/smooth_pose.py. + + Args: + min_cutoff (float, optional): Decreasing the minimum cutoff frequency + decreases slow speed jitter + beta (float, optional): Increasing the speed coefficient(beta) + decreases speed lag. 
+ """ + + # Not shareable because the filter holds status of a specific target + _shareable: bool = False + + def __init__(self, min_cutoff=0.004, beta=0.7): + # OneEuroFilter has Markov Property and maintains status variables + # within the class, thus has a windows_size of 1 + super().__init__(window_size=1) + self.min_cutoff = min_cutoff + self.beta = beta + self._one_euro = None + + def __call__(self, x: np.ndarray): + assert x.ndim == 3, ('Input should be an array with shape [T, K, C]' + f', but got invalid shape {x.shape}') + + pred_pose_hat = x.copy() + + if self._one_euro is None: + # The filter is invoked for the first time + # Initialize the filter + self._one_euro = OneEuro( + np.zeros_like(x[0]), + x[0], + dx0=0.0, + min_cutoff=self.min_cutoff, + beta=self.beta, + ) + t0 = 1 + else: + # The filter has been invoked + t0 = 0 + + for t, pose in enumerate(x): + if t < t0: + # If the filter is invoked for the first time + # set pred_pose_hat[0] = x[0] + continue + pose = self._one_euro(pose) + pred_pose_hat[t] = pose + + return pred_pose_hat diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/savizky_golay_filter.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/savizky_golay_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..18e0528f6cec71f19fe1c4a1f26560c1438bd1ce --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/savizky_golay_filter.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from scipy.signal import savgol_filter + +from .builder import FILTERS +from .filter import TemporalFilter + + +@FILTERS.register_module(name=['SavizkyGolayFilter', 'savgol']) +class SavizkyGolayFilter(TemporalFilter): + """Savizky-Golay filter. + + Adapted from: + https://docs.scipy.org/doc/scipy/reference/generated/ + scipy.signal.savgol_filter.html. + + Args: + window_size (int): The size of the filter window (i.e., the number + of coefficients). window_length must be a positive odd integer. + Default: 11 + polyorder (int): The order of the polynomial used to fit the samples. + polyorder must be less than window_size. + """ + + def __init__(self, window_size: int = 11, polyorder: int = 2): + super().__init__(window_size) + + # 1-D Savitzky-Golay filter + assert polyorder > 0, ( + f'Got invalid parameter polyorder={polyorder}. Polyorder ' + 'should be positive.') + assert polyorder < window_size, ( + f'Got invalid parameters polyorder={polyorder} and ' + f'window_size={window_size}. Polyorder should be less than ' + 'window_size.') + self.polyorder = polyorder + + def __call__(self, x: np.ndarray): + + assert x.ndim == 3, ('Input should be an array with shape [T, K, C]' + f', but got invalid shape {x.shape}') + + T = x.shape[0] + if T < self.window_size: + pad_width = [(self.window_size - T, 0), (0, 0), (0, 0)] + x = np.pad(x, pad_width, mode='edge') + + smoothed = savgol_filter(x, self.window_size, self.polyorder, axis=0) + + return smoothed[-T:] diff --git a/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f8df520ad9457722f738c33b79d69d3a99fb9e --- /dev/null +++ b/main/transformer_utils/mmpose/core/post_processing/temporal_filters/smoothnet_filter.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
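+# Illustrative usage sketch; the window/output sizes below are assumptions and
+# must match whatever pretrained SmoothNet checkpoint is actually used (pass
+# its path via ``checkpoint``; with ``checkpoint=None`` the filter runs with
+# randomly initialised weights, which is only useful for shape checks):
+#
+#     import numpy as np
+#     from mmpose.core.post_processing.temporal_filters import build_filter
+#
+#     filter_cfg = dict(
+#         type='SmoothNetFilter',
+#         window_size=32,
+#         output_size=32,
+#         checkpoint=None,   # set to a matching pretrained .pth for real use
+#         device='cpu')
+#     smooth_fn = build_filter(filter_cfg)
+#     pose_seq = np.random.rand(100, 17, 3).astype(np.float32)   # [T, K, C]
+#     smoothed = smooth_fn(pose_seq)   # [T, K, C]; returned as-is if T < window_size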
+from typing import Optional + +import numpy as np +import torch +from mmcv.runner import load_checkpoint +from torch import Tensor, nn + +from .builder import FILTERS +from .filter import TemporalFilter + + +class SmoothNetResBlock(nn.Module): + """Residual block module used in SmoothNet. + + Args: + in_channels (int): Input channel number. + hidden_channels (int): The hidden feature channel number. + dropout (float): Dropout probability. Default: 0.5 + + Shape: + Input: (*, in_channels) + Output: (*, in_channels) + """ + + def __init__(self, in_channels, hidden_channels, dropout=0.5): + super().__init__() + self.linear1 = nn.Linear(in_channels, hidden_channels) + self.linear2 = nn.Linear(hidden_channels, in_channels) + self.lrelu = nn.LeakyReLU(0.2, inplace=True) + self.dropout = nn.Dropout(p=dropout, inplace=True) + + def forward(self, x): + identity = x + x = self.linear1(x) + x = self.dropout(x) + x = self.lrelu(x) + x = self.linear2(x) + x = self.dropout(x) + x = self.lrelu(x) + + out = x + identity + return out + + +class SmoothNet(nn.Module): + """SmoothNet is a plug-and-play temporal-only network to refine human + poses. It works for 2d/3d/6d pose smoothing. + + "SmoothNet: A Plug-and-Play Network for Refining Human Poses in Videos", + arXiv'2021. More details can be found in the `paper + `__ . + + Note: + N: The batch size + T: The temporal length of the pose sequence + C: The total pose dimension (e.g. keypoint_number * keypoint_dim) + + Args: + window_size (int): The size of the input window. + output_size (int): The size of the output window. + hidden_size (int): The hidden feature dimension in the encoder, + the decoder and between residual blocks. Default: 512 + res_hidden_size (int): The hidden feature dimension inside the + residual blocks. Default: 256 + num_blocks (int): The number of residual blocks. Default: 3 + dropout (float): Dropout probability. Default: 0.5 + + Shape: + Input: (N, C, T) the original pose sequence + Output: (N, C, T) the smoothed pose sequence + """ + + def __init__(self, + window_size: int, + output_size: int, + hidden_size: int = 512, + res_hidden_size: int = 256, + num_blocks: int = 3, + dropout: float = 0.5): + super().__init__() + self.window_size = window_size + self.output_size = output_size + self.hidden_size = hidden_size + self.res_hidden_size = res_hidden_size + self.num_blocks = num_blocks + self.dropout = dropout + + assert output_size <= window_size, ( + 'The output size should be less than or equal to the window size.', + f' Got output_size=={output_size} and window_size=={window_size}') + + # Build encoder layers + self.encoder = nn.Sequential( + nn.Linear(window_size, hidden_size), + nn.LeakyReLU(0.1, inplace=True)) + + # Build residual blocks + res_blocks = [] + for _ in range(num_blocks): + res_blocks.append( + SmoothNetResBlock( + in_channels=hidden_size, + hidden_channels=res_hidden_size, + dropout=dropout)) + self.res_blocks = nn.Sequential(*res_blocks) + + # Build decoder layers + self.decoder = nn.Linear(hidden_size, output_size) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + N, C, T = x.shape + num_windows = T - self.window_size + 1 + + assert T >= self.window_size, ( + 'Input sequence length must be no less than the window size. 
', + f'Got x.shape[2]=={T} and window_size=={self.window_size}') + + # Unfold x to obtain input sliding windows + # [N, C, num_windows, window_size] + x = x.unfold(2, self.window_size, 1) + + # Forward layers + x = self.encoder(x) + x = self.res_blocks(x) + x = self.decoder(x) # [N, C, num_windows, output_size] + + # Accumulate output ensembles + out = x.new_zeros(N, C, T) + count = x.new_zeros(T) + + for t in range(num_windows): + out[..., t:t + self.output_size] += x[:, :, t] + count[t:t + self.output_size] += 1.0 + + return out.div(count) + + +@FILTERS.register_module(name=['SmoothNetFilter', 'SmoothNet', 'smoothnet']) +class SmoothNetFilter(TemporalFilter): + """Apply SmoothNet filter. + + "SmoothNet: A Plug-and-Play Network for Refining Human Poses in Videos", + arXiv'2021. More details can be found in the `paper + `__ . + + Args: + window_size (int): The size of the filter window. It's also the + window_size of SmoothNet model. + output_size (int): The output window size of SmoothNet model. + checkpoint (str): The checkpoint file of the pretrained SmoothNet + model. Please note that `checkpoint` should be matched with + `window_size` and `output_size`. + hidden_size (int): SmoothNet argument. See :class:`SmoothNet` for + details. Default: 512 + hidden_res_size (int): SmoothNet argument. See :class:`SmoothNet` + for details. Default: 256 + num_blocks (int): SmoothNet argument. See :class:`SmoothNet` for + details. Default: 3 + device (str): Device for model inference. Default: 'cpu' + root_index (int, optional): If not None, relative keypoint coordinates + will be calculated as the SmoothNet input, by centering the + keypoints around the root point. The model output will be + converted back to absolute coordinates. Default: None + """ + + def __init__( + self, + window_size: int, + output_size: int, + checkpoint: Optional[str] = None, + hidden_size: int = 512, + res_hidden_size: int = 256, + num_blocks: int = 3, + device: str = 'cpu', + root_index: Optional[int] = None, + ): + super().__init__(window_size) + self.device = device + self.root_index = root_index + self.smoothnet = SmoothNet(window_size, output_size, hidden_size, + res_hidden_size, num_blocks) + if checkpoint: + load_checkpoint(self.smoothnet, checkpoint) + self.smoothnet.to(device) + self.smoothnet.eval() + + for p in self.smoothnet.parameters(): + p.requires_grad_(False) + + def __call__(self, x: np.ndarray): + assert x.ndim == 3, ('Input should be an array with shape [T, K, C]' + f', but got invalid shape {x.shape}') + + root_index = self.root_index + if root_index is not None: + x_root = x[:, root_index:root_index + 1] + x = np.delete(x, root_index, axis=1) + x = x - x_root + + T, K, C = x.shape + + if T < self.window_size: + # Skip smoothing if the input length is less than the window size + smoothed = x + else: + dtype = x.dtype + + # Convert to tensor and forward the model + with torch.no_grad(): + x = torch.tensor(x, dtype=torch.float32, device=self.device) + x = x.view(1, T, K * C).permute(0, 2, 1) # to [1, KC, T] + smoothed = self.smoothnet(x) # in shape [1, KC, T] + + # Convert model output back to input shape and format + smoothed = smoothed.permute(0, 2, 1).view(T, K, C) # to [T, K, C] + smoothed = smoothed.cpu().numpy().astype(dtype) # to numpy.ndarray + + if root_index is not None: + smoothed += x_root + smoothed = np.concatenate( + (smoothed[:, :root_index], x_root, smoothed[:, root_index:]), + axis=1) + + return smoothed diff --git a/main/transformer_utils/mmpose/core/utils/__init__.py 
b/main/transformer_utils/mmpose/core/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..512e7680bcce478ca00f79e536ee54cd02de93df --- /dev/null +++ b/main/transformer_utils/mmpose/core/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .dist_utils import allreduce_grads, sync_random_seed +from .model_util_hooks import ModelSetEpochHook +from .regularizations import WeightNormClipHook + +__all__ = [ + 'allreduce_grads', 'WeightNormClipHook', 'sync_random_seed', + 'ModelSetEpochHook' +] diff --git a/main/transformer_utils/mmpose/core/utils/dist_utils.py b/main/transformer_utils/mmpose/core/utils/dist_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b81f925ad7aa51ce800e27bead8eb8ba021c2592 --- /dev/null +++ b/main/transformer_utils/mmpose/core/utils/dist_utils.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict + +import numpy as np +import torch +import torch.distributed as dist +from mmcv.runner import get_dist_info +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + """Allreduce parameters as a whole.""" + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Default: True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Default: -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +def sync_random_seed(seed=None, device='cuda'): + """Make sure different ranks share the same seed. + + All workers must call + this function, otherwise it will deadlock. This method is generally used in + `DistributedSampler`, because the seed should be identical across all + processes in the distributed group. + In distributed sampling, different ranks should sample non-overlapped + data in the dataset. Therefore, this function is used to make sure that + each rank shuffles the data indices in the same order based + on the same seed. Then different ranks could use different indices + to select non-overlapped data from the same data list. + Args: + seed (int, Optional): The seed. Default to None. + device (str): The device where the seed will be put on. + Default to 'cuda'. + Returns: + int: Seed to be used. 
+ """ + if seed is None: + seed = np.random.randint(2**31) + assert isinstance(seed, int) + + rank, world_size = get_dist_info() + + if world_size == 1: + return seed + + if rank == 0: + random_num = torch.tensor(seed, dtype=torch.int32, device=device) + else: + random_num = torch.tensor(0, dtype=torch.int32, device=device) + dist.broadcast(random_num, src=0) + return random_num.item() diff --git a/main/transformer_utils/mmpose/core/utils/model_util_hooks.py b/main/transformer_utils/mmpose/core/utils/model_util_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..d308a8a57a04f1a2acaa841ac2e8ad42439bb633 --- /dev/null +++ b/main/transformer_utils/mmpose/core/utils/model_util_hooks.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import HOOKS, Hook + + +@HOOKS.register_module() +class ModelSetEpochHook(Hook): + """The hook that tells model the current epoch in training.""" + + def __init__(self): + pass + + def before_epoch(self, runner): + runner.model.module.set_train_epoch(runner.epoch + 1) diff --git a/main/transformer_utils/mmpose/core/utils/regularizations.py b/main/transformer_utils/mmpose/core/utils/regularizations.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c7449038066016f6efb60e126111ace962fe98 --- /dev/null +++ b/main/transformer_utils/mmpose/core/utils/regularizations.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod, abstractproperty + +import torch + + +class PytorchModuleHook(metaclass=ABCMeta): + """Base class for PyTorch module hook registers. + + An instance of a subclass of PytorchModuleHook can be used to + register hook to a pytorch module using the `register` method like: + hook_register.register(module) + + Subclasses should add/overwrite the following methods: + - __init__ + - hook + - hook_type + """ + + @abstractmethod + def hook(self, *args, **kwargs): + """Hook function.""" + + @abstractproperty + def hook_type(self) -> str: + """Hook type Subclasses should overwrite this function to return a + string value in. + + {`forward`, `forward_pre`, `backward`} + """ + + def register(self, module): + """Register the hook function to the module. + + Args: + module (pytorch module): the module to register the hook. + + Returns: + handle (torch.utils.hooks.RemovableHandle): a handle to remove + the hook by calling handle.remove() + """ + assert isinstance(module, torch.nn.Module) + + if self.hook_type == 'forward': + h = module.register_forward_hook(self.hook) + elif self.hook_type == 'forward_pre': + h = module.register_forward_pre_hook(self.hook) + elif self.hook_type == 'backward': + h = module.register_backward_hook(self.hook) + else: + raise ValueError(f'Invalid hook type {self.hook}') + + return h + + +class WeightNormClipHook(PytorchModuleHook): + """Apply weight norm clip regularization. + + The module's parameter will be clip to a given maximum norm before each + forward pass. + + Args: + max_norm (float): The maximum norm of the parameter. + module_param_names (str|list): The parameter name (or name list) to + apply weight norm clip. 
+ """ + + def __init__(self, max_norm=1.0, module_param_names='weight'): + self.module_param_names = module_param_names if isinstance( + module_param_names, list) else [module_param_names] + self.max_norm = max_norm + + @property + def hook_type(self): + return 'forward_pre' + + def hook(self, module, _input): + for name in self.module_param_names: + assert name in module._parameters, f'{name} is not a parameter' \ + f' of the module {type(module)}' + param = module._parameters[name] + + with torch.no_grad(): + m = param.norm().item() + if m > self.max_norm: + param.mul_(self.max_norm / (m + 1e-6)) diff --git a/main/transformer_utils/mmpose/core/visualization/__init__.py b/main/transformer_utils/mmpose/core/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c87fc29145b1eff15713ca79bc36708a4836ecf8 --- /dev/null +++ b/main/transformer_utils/mmpose/core/visualization/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .image import (imshow_bboxes, imshow_keypoints, imshow_keypoints_3d, + imshow_mesh_3d, imshow_multiview_keypoints_3d) + +__all__ = [ + 'imshow_keypoints', 'imshow_keypoints_3d', 'imshow_bboxes', + 'imshow_mesh_3d', 'imshow_multiview_keypoints_3d' +] diff --git a/main/transformer_utils/mmpose/core/visualization/image.py b/main/transformer_utils/mmpose/core/visualization/image.py new file mode 100644 index 0000000000000000000000000000000000000000..d244b2b12cff970c810ae0798164e835dd6226e4 --- /dev/null +++ b/main/transformer_utils/mmpose/core/visualization/image.py @@ -0,0 +1,522 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import os +import warnings + +import cv2 +import mmcv +import numpy as np +from matplotlib import pyplot as plt +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.color import color_val + +try: + import trimesh + has_trimesh = True +except (ImportError, ModuleNotFoundError): + has_trimesh = False + +try: + os.environ['PYOPENGL_PLATFORM'] = 'osmesa' + import pyrender + has_pyrender = True +except (ImportError, ModuleNotFoundError): + has_pyrender = False + + +def imshow_bboxes(img, + bboxes, + labels=None, + colors='green', + text_color='white', + thickness=1, + font_scale=0.5, + show=True, + win_name='', + wait_time=0, + out_file=None): + """Draw bboxes with labels (optional) on an image. This is a wrapper of + mmcv.imshow_bboxes. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (ndarray): ndarray of shape (k, 4), each row is a bbox in + format [x1, y1, x2, y2]. + labels (str or list[str], optional): labels of each bbox. + colors (list[str or tuple or :obj:`Color`]): A list of colors. + text_color (str or tuple or :obj:`Color`): Color of texts. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + show (bool): Whether to show the image. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + out_file (str, optional): The filename to write the image. + + Returns: + ndarray: The image with bboxes drawn on it. 
+ """ + + # adapt to mmcv.imshow_bboxes input format + bboxes = np.split( + bboxes, bboxes.shape[0], axis=0) if bboxes.shape[0] > 0 else [] + if not isinstance(colors, list): + colors = [colors for _ in range(len(bboxes))] + colors = [mmcv.color_val(c) for c in colors] + assert len(bboxes) == len(colors) + + img = mmcv.imshow_bboxes( + img, + bboxes, + colors, + top_k=-1, + thickness=thickness, + show=False, + out_file=None) + + if labels is not None: + if not isinstance(labels, list): + labels = [labels for _ in range(len(bboxes))] + assert len(labels) == len(bboxes) + + for bbox, label, color in zip(bboxes, labels, colors): + if label is None: + continue + bbox_int = bbox[0, :4].astype(np.int32) + # roughly estimate the proper font size + text_size, text_baseline = cv2.getTextSize(label, + cv2.FONT_HERSHEY_DUPLEX, + font_scale, thickness) + text_x1 = bbox_int[0] + text_y1 = max(0, bbox_int[1] - text_size[1] - text_baseline) + text_x2 = bbox_int[0] + text_size[0] + text_y2 = text_y1 + text_size[1] + text_baseline + cv2.rectangle(img, (text_x1, text_y1), (text_x2, text_y2), color, + cv2.FILLED) + cv2.putText(img, label, (text_x1, text_y2 - text_baseline), + cv2.FONT_HERSHEY_DUPLEX, font_scale, + mmcv.color_val(text_color), thickness) + + if show: + mmcv.imshow(img, win_name, wait_time) + if out_file is not None: + mmcv.imwrite(img, out_file) + return img + + +@deprecated_api_warning({'pose_limb_color': 'pose_link_color'}) +def imshow_keypoints(img, + pose_result, + skeleton=None, + kpt_score_thr=0.3, + pose_kpt_color=None, + pose_link_color=None, + radius=4, + thickness=1, + show_keypoint_weight=False): + """Draw keypoints and links on an image. + + Args: + img (str or Tensor): The image to draw poses on. If an image array + is given, id will be modified in-place. + pose_result (list[kpts]): The poses to draw. Each element kpts is + a set of K keypoints as an Kx3 numpy.ndarray, where each + keypoint is represented as x, y, score. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None, + the keypoint will not be drawn. + pose_link_color (np.array[Mx3]): Color of M links. If None, the + links will not be drawn. + thickness (int): Thickness of lines. 
+ """ + + img = mmcv.imread(img) + img_h, img_w, _ = img.shape + + for kpts in pose_result: + + kpts = np.array(kpts, copy=False) + + # draw each point on image + if pose_kpt_color is not None: + assert len(pose_kpt_color) == len(kpts) + + for kid, kpt in enumerate(kpts): + x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2] + + if kpt_score < kpt_score_thr or pose_kpt_color[kid] is None: + # skip the point that should not be drawn + continue + + color = tuple(int(c) for c in pose_kpt_color[kid]) + if show_keypoint_weight: + img_copy = img.copy() + cv2.circle(img_copy, (int(x_coord), int(y_coord)), radius, + color, -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + else: + cv2.circle(img, (int(x_coord), int(y_coord)), radius, + color, -1) + + # draw links + if skeleton is not None and pose_link_color is not None: + assert len(pose_link_color) == len(skeleton) + + for sk_id, sk in enumerate(skeleton): + pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1])) + pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1])) + + if (pos1[0] <= 0 or pos1[0] >= img_w or pos1[1] <= 0 + or pos1[1] >= img_h or pos2[0] <= 0 or pos2[0] >= img_w + or pos2[1] <= 0 or pos2[1] >= img_h + or kpts[sk[0], 2] < kpt_score_thr + or kpts[sk[1], 2] < kpt_score_thr + or pose_link_color[sk_id] is None): + # skip the link that should not be drawn + continue + color = tuple(int(c) for c in pose_link_color[sk_id]) + if show_keypoint_weight: + img_copy = img.copy() + X = (pos1[0], pos2[0]) + Y = (pos1[1], pos2[1]) + mX = np.mean(X) + mY = np.mean(Y) + length = ((Y[0] - Y[1])**2 + (X[0] - X[1])**2)**0.5 + angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1])) + stickwidth = 2 + polygon = cv2.ellipse2Poly( + (int(mX), int(mY)), (int(length / 2), int(stickwidth)), + int(angle), 0, 360, 1) + cv2.fillConvexPoly(img_copy, polygon, color) + transparency = max( + 0, min(1, 0.5 * (kpts[sk[0], 2] + kpts[sk[1], 2]))) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + else: + cv2.line(img, pos1, pos2, color, thickness=thickness) + + return img + + +def imshow_keypoints_3d( + pose_result, + img=None, + skeleton=None, + pose_kpt_color=None, + pose_link_color=None, + vis_height=400, + kpt_score_thr=0.3, + num_instances=-1, + *, + axis_azimuth=70, + axis_limit=1.7, + axis_dist=10.0, + axis_elev=15.0, +): + """Draw 3D keypoints and links in 3D coordinates. + + Args: + pose_result (list[dict]): 3D pose results containing: + - "keypoints_3d" ([K,4]): 3D keypoints + - "title" (str): Optional. A string to specify the title of the + visualization of this pose result + img (str|np.ndarray): Opptional. The image or image path to show input + image and/or 2D pose. Note that the image should be given in BGR + channel order. + skeleton (list of [idx_i,idx_j]): Skeleton described by a list of + links, each is a pair of joint indices. + pose_kpt_color (np.ndarray[Nx3]`): Color of N keypoints. If None, do + not nddraw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. If None, do not + draw links. + vis_height (int): The image height of the visualization. The width + will be N*vis_height depending on the number of visualized + items. + kpt_score_thr (float): Minimum score of keypoints to be shown. + Default: 0.3. + num_instances (int): Number of instances to be shown in 3D. If smaller + than 0, all the instances in the pose_result will be shown. 
+ Otherwise, pad or truncate the pose_result to a length of + num_instances. + axis_azimuth (float): axis azimuth angle for 3D visualizations. + axis_dist (float): axis distance for 3D visualizations. + axis_elev (float): axis elevation view angle for 3D visualizations. + axis_limit (float): The axis limit to visualize 3d pose. The xyz + range will be set as: + - x: [x_c - axis_limit/2, x_c + axis_limit/2] + - y: [y_c - axis_limit/2, y_c + axis_limit/2] + - z: [0, axis_limit] + Where x_c, y_c is the mean value of x and y coordinates + figsize: (float): figure size in inch. + """ + + show_img = img is not None + if num_instances < 0: + num_instances = len(pose_result) + else: + if len(pose_result) > num_instances: + pose_result = pose_result[:num_instances] + elif len(pose_result) < num_instances: + pose_result += [dict()] * (num_instances - len(pose_result)) + num_axis = num_instances + 1 if show_img else num_instances + + plt.ioff() + fig = plt.figure(figsize=(vis_height * num_axis * 0.01, vis_height * 0.01)) + + if show_img: + img = mmcv.imread(img, channel_order='bgr') + img = mmcv.bgr2rgb(img) + img = mmcv.imrescale(img, scale=vis_height / img.shape[0]) + + ax_img = fig.add_subplot(1, num_axis, 1) + ax_img.get_xaxis().set_visible(False) + ax_img.get_yaxis().set_visible(False) + ax_img.set_axis_off() + ax_img.set_title('Input') + ax_img.imshow(img, aspect='equal') + + for idx, res in enumerate(pose_result): + dummy = len(res) == 0 + kpts = np.zeros((1, 3)) if dummy else res['keypoints_3d'] + if kpts.shape[1] == 3: + kpts = np.concatenate([kpts, np.ones((kpts.shape[0], 1))], axis=1) + valid = kpts[:, 3] >= kpt_score_thr + + ax_idx = idx + 2 if show_img else idx + 1 + ax = fig.add_subplot(1, num_axis, ax_idx, projection='3d') + ax.view_init( + elev=axis_elev, + azim=axis_azimuth, + ) + x_c = np.mean(kpts[valid, 0]) if sum(valid) > 0 else 0 + y_c = np.mean(kpts[valid, 1]) if sum(valid) > 0 else 0 + ax.set_xlim3d([x_c - axis_limit / 2, x_c + axis_limit / 2]) + ax.set_ylim3d([y_c - axis_limit / 2, y_c + axis_limit / 2]) + ax.set_zlim3d([0, axis_limit]) + ax.set_aspect('auto') + ax.set_xticks([]) + ax.set_yticks([]) + ax.set_zticks([]) + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_zticklabels([]) + ax.dist = axis_dist + + if not dummy and pose_kpt_color is not None: + pose_kpt_color = np.array(pose_kpt_color) + assert len(pose_kpt_color) == len(kpts) + x_3d, y_3d, z_3d = np.split(kpts[:, :3], [1, 2], axis=1) + # matplotlib uses RGB color in [0, 1] value range + _color = pose_kpt_color[..., ::-1] / 255. + ax.scatter( + x_3d[valid], + y_3d[valid], + z_3d[valid], + marker='o', + color=_color[valid], + ) + + if not dummy and skeleton is not None and pose_link_color is not None: + pose_link_color = np.array(pose_link_color) + assert len(pose_link_color) == len(skeleton) + for link, link_color in zip(skeleton, pose_link_color): + link_indices = [_i for _i in link] + xs_3d = kpts[link_indices, 0] + ys_3d = kpts[link_indices, 1] + zs_3d = kpts[link_indices, 2] + kpt_score = kpts[link_indices, 3] + if kpt_score.min() > kpt_score_thr: + # matplotlib uses RGB color in [0, 1] value range + _color = link_color[::-1] / 255. 
+ ax.plot(xs_3d, ys_3d, zs_3d, color=_color, zdir='z') + + if 'title' in res: + ax.set_title(res['title']) + + # convert figure to numpy array + fig.tight_layout() + fig.canvas.draw() + img_w, img_h = fig.canvas.get_width_height() + img_vis = np.frombuffer( + fig.canvas.tostring_rgb(), dtype=np.uint8).reshape(img_h, img_w, -1) + img_vis = mmcv.rgb2bgr(img_vis) + + plt.close(fig) + + return img_vis + + +def imshow_mesh_3d(img, + vertices, + faces, + camera_center, + focal_length, + colors=(76, 76, 204)): + """Render 3D meshes on background image. + + Args: + img(np.ndarray): Background image. + vertices (list of np.ndarray): Vetrex coordinates in camera space. + faces (list of np.ndarray): Faces of meshes. + camera_center ([2]): Center pixel. + focal_length ([2]): Focal length of camera. + colors (list[str or tuple or Color]): A list of mesh colors. + """ + + H, W, C = img.shape + + if not has_pyrender: + warnings.warn('pyrender package is not installed.') + return img + + if not has_trimesh: + warnings.warn('trimesh package is not installed.') + return img + + try: + renderer = pyrender.OffscreenRenderer( + viewport_width=W, viewport_height=H) + except (ImportError, RuntimeError): + warnings.warn('pyrender package is not installed correctly.') + return img + + if not isinstance(colors, list): + colors = [colors for _ in range(len(vertices))] + colors = [color_val(c) for c in colors] + + depth_map = np.ones([H, W]) * np.inf + output_img = img + for idx in range(len(vertices)): + color = colors[idx] + color = [c / 255.0 for c in color] + color.append(1.0) + vert = vertices[idx] + face = faces[idx] + + material = pyrender.MetallicRoughnessMaterial( + metallicFactor=0.2, alphaMode='OPAQUE', baseColorFactor=color) + + mesh = trimesh.Trimesh(vert, face) + rot = trimesh.transformations.rotation_matrix( + np.radians(180), [1, 0, 0]) + mesh.apply_transform(rot) + mesh = pyrender.Mesh.from_trimesh(mesh, material=material) + + scene = pyrender.Scene(ambient_light=(0.5, 0.5, 0.5)) + scene.add(mesh, 'mesh') + + camera_pose = np.eye(4) + camera = pyrender.IntrinsicsCamera( + fx=focal_length[0], + fy=focal_length[1], + cx=camera_center[0], + cy=camera_center[1], + zfar=1e5) + scene.add(camera, pose=camera_pose) + + light = pyrender.DirectionalLight(color=[1.0, 1.0, 1.0], intensity=1) + light_pose = np.eye(4) + + light_pose[:3, 3] = np.array([0, -1, 1]) + scene.add(light, pose=light_pose) + + light_pose[:3, 3] = np.array([0, 1, 1]) + scene.add(light, pose=light_pose) + + light_pose[:3, 3] = np.array([1, 1, 2]) + scene.add(light, pose=light_pose) + + color, rend_depth = renderer.render( + scene, flags=pyrender.RenderFlags.RGBA) + + valid_mask = (rend_depth < depth_map) * (rend_depth > 0) + depth_map[valid_mask] = rend_depth[valid_mask] + valid_mask = valid_mask[:, :, None] + output_img = ( + valid_mask * color[:, :, :3] + (1 - valid_mask) * output_img) + + return output_img + + +def imshow_multiview_keypoints_3d( + pose_result, + skeleton=None, + pose_kpt_color=None, + pose_link_color=None, + space_size=[8000, 8000, 2000], + space_center=[0, -500, 800], + kpt_score_thr=0.0, +): + """Draw 3D keypoints and links in 3D coordinates. + + Args: + pose_result (list[kpts]): The poses to draw. Each element kpts is + a set of K keypoints as an Kx4 numpy.ndarray, where each + keypoint is represented as x, y, z, score. + skeleton (list of [idx_i,idx_j]): Skeleton described by a list of + links, each is a pair of joint indices. + pose_kpt_color (np.ndarray[Nx3]`): Color of N keypoints. 
If None, do + not nddraw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. If None, do not + draw links. + space_size: (list). Default: [8000, 8000, 2000]. + space_center: (list). Default: [0, -500, 800]. + kpt_score_thr (float): Minimum score of keypoints to be shown. + Default: 0.0. + """ + fig = plt.figure() + ax = plt.axes(projection='3d') + ax.set_xlim3d(space_center[0] - space_size[0] * 0.5, + space_center[0] + space_size[0] * 0.5) + ax.set_ylim3d(space_center[1] - space_size[1] * 0.5, + space_center[1] + space_size[1] * 0.5) + ax.set_zlim3d(space_center[2] - space_size[2] * 0.5, + space_center[2] + space_size[2] * 0.5) + pose_kpt_color = np.array(pose_kpt_color) + pose_kpt_color = pose_kpt_color[..., ::-1] / 255. + + for kpts in pose_result: + # draw each point on image + xs, ys, zs, scores = kpts.T + valid = scores > kpt_score_thr + ax.scatter( + xs[valid], + ys[valid], + zs[valid], + marker='o', + color=pose_kpt_color[valid]) + + for link, link_color in zip(skeleton, pose_link_color): + link_indices = [_i for _i in link] + xs_3d = kpts[link_indices, 0] + ys_3d = kpts[link_indices, 1] + zs_3d = kpts[link_indices, 2] + kpt_score = kpts[link_indices, 3] + if kpt_score.min() > kpt_score_thr: + # matplotlib uses RGB color in [0, 1] value range + _color = np.array(link_color[::-1]) / 255. + ax.plot(xs_3d, ys_3d, zs_3d, color=_color) + + # convert figure to numpy array + fig.tight_layout() + fig.canvas.draw() + img_w, img_h = fig.canvas.get_width_height() + img_vis = np.frombuffer( + fig.canvas.tostring_rgb(), dtype=np.uint8).reshape(img_h, img_w, -1) + img_vis = mmcv.rgb2bgr(img_vis) + + plt.close(fig) + + return img_vis diff --git a/main/transformer_utils/mmpose/deprecated.py b/main/transformer_utils/mmpose/deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..b930901722ab8fe57455f8eaf9e7c1c728b4b4f8 --- /dev/null +++ b/main/transformer_utils/mmpose/deprecated.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +from .datasets.builder import DATASETS +from .datasets.datasets.base import Kpt2dSviewRgbImgTopDownDataset +from .models.builder import HEADS, POSENETS +from .models.detectors import AssociativeEmbedding +from .models.heads import (AEHigherResolutionHead, AESimpleHead, + DeepposeRegressionHead, HMRMeshHead, + TopdownHeatmapMSMUHead, + TopdownHeatmapMultiStageHead, + TopdownHeatmapSimpleHead) + + +@DATASETS.register_module() +class TopDownFreiHandDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownFreiHandDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownFreiHandDataset has been renamed into FreiHandDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@DATASETS.register_module() +class TopDownOneHand10KDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownOneHand10KDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownOneHand10KDataset has been renamed into OneHand10KDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@DATASETS.register_module() +class TopDownPanopticDataset(Kpt2dSviewRgbImgTopDownDataset): + """Deprecated TopDownPanopticDataset.""" + + def __init__(self, *args, **kwargs): + raise (ImportError( + 'TopDownPanopticDataset has been renamed into PanopticDataset,' + 'check https://github.com/open-mmlab/mmpose/pull/202 for details.') + ) + + def _get_db(self): + return [] + + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + return None + + +@HEADS.register_module() +class BottomUpHigherResolutionHead(AEHigherResolutionHead): + """Bottom-up head for Higher Resolution. + + BottomUpHigherResolutionHead has been renamed into AEHigherResolutionHead, + check https://github.com/open- mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUpHigherResolutionHead has been renamed into ' + 'AEHigherResolutionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class BottomUpSimpleHead(AESimpleHead): + """Bottom-up simple head. + + BottomUpSimpleHead has been renamed into AESimpleHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUpHigherResolutionHead has been renamed into ' + 'AEHigherResolutionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownSimpleHead(TopdownHeatmapSimpleHead): + """Top-down heatmap simple head. + + TopDownSimpleHead has been renamed into TopdownHeatmapSimpleHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownSimpleHead has been renamed into ' + 'TopdownHeatmapSimpleHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownMultiStageHead(TopdownHeatmapMultiStageHead): + """Top-down heatmap multi-stage head. 
+ + TopDownMultiStageHead has been renamed into TopdownHeatmapMultiStageHead, + check https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownMultiStageHead has been renamed into ' + 'TopdownHeatmapMultiStageHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class TopDownMSMUHead(TopdownHeatmapMSMUHead): + """Heads for multi-stage multi-unit heads. + + TopDownMSMUHead has been renamed into TopdownHeatmapMSMUHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'TopDownMSMUHead has been renamed into ' + 'TopdownHeatmapMSMUHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class MeshHMRHead(HMRMeshHead): + """SMPL parameters regressor head. + + MeshHMRHead has been renamed into HMRMeshHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'MeshHMRHead has been renamed into ' + 'HMRMeshHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@HEADS.register_module() +class FcHead(DeepposeRegressionHead): + """FcHead (deprecated). + + FcHead has been renamed into DeepposeRegressionHead, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'FcHead has been renamed into ' + 'DeepposeRegressionHead, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) + + +@POSENETS.register_module() +class BottomUp(AssociativeEmbedding): + """Associative Embedding. + + BottomUp has been renamed into AssociativeEmbedding, check + https://github.com/open-mmlab/mmpose/pull/656 for details. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'BottomUp has been renamed into ' + 'AssociativeEmbedding, check ' + 'https://github.com/open-mmlab/mmpose/pull/656 for details.', + DeprecationWarning) diff --git a/main/transformer_utils/mmpose/models/__init__.py b/main/transformer_utils/mmpose/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..641d115a693abff882fa7604811430f8e6b605ab --- /dev/null +++ b/main/transformer_utils/mmpose/models/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
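The builder functions re-exported below (build_backbone, build_head, and friends) resolve config dicts through mmcv-style registries such as BACKBONES and HEADS, which the @...register_module() decorators throughout these files populate at import time. A rough, self-contained sketch of that lookup-and-instantiate flow; SimpleRegistry and TinyBackbone are illustrative stand-ins, not the mmcv implementation:

# Conceptual sketch of how a registry turns a config dict into a module.
class SimpleRegistry:
    def __init__(self, name):
        self.name = name
        self._modules = {}

    def register_module(self):
        def _decorator(cls):
            # map the class name to the class, like @BACKBONES.register_module()
            self._modules[cls.__name__] = cls
            return cls
        return _decorator

    def build(self, cfg):
        # pop 'type' to select the class, pass the rest as constructor kwargs
        cfg = dict(cfg)
        cls = self._modules[cfg.pop('type')]
        return cls(**cfg)


BACKBONES_DEMO = SimpleRegistry('backbone')


@BACKBONES_DEMO.register_module()
class TinyBackbone:
    def __init__(self, depth=18):
        self.depth = depth


# mirrors how configs drive build_backbone(dict(type=..., ...)) in mmpose
net = BACKBONES_DEMO.build(dict(type='TinyBackbone', depth=34))
assert net.depth == 34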
+from .builder import (BACKBONES, HEADS, LOSSES, MESH_MODELS, NECKS, POSENETS, + build_backbone, build_head, build_loss, build_mesh_model, + build_neck, build_posenet) +from .detectors import * # noqa +from .heads import * # noqa +from .losses import * # noqa +from .necks import * # noqa +from .utils import * # noqa + + +__all__ = [ + 'HEADS', 'NECKS', 'LOSSES', 'POSENETS', 'MESH_MODELS', + 'build_head', 'build_loss', 'build_posenet', + 'build_neck', 'build_mesh_model' +] diff --git a/main/transformer_utils/mmpose/models/backbones/__init__.py b/main/transformer_utils/mmpose/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..06717917a2dbd08800587d3ffa193149e42a653c --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .alexnet import AlexNet +from .cpm import CPM +from .hourglass import HourglassNet +from .hourglass_ae import HourglassAENet +from .hrformer import HRFormer +from .hrnet import HRNet +from .i3d import I3D +from .litehrnet import LiteHRNet +from .mobilenet_v2 import MobileNetV2 +from .mobilenet_v3 import MobileNetV3 +from .mspn import MSPN +from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2 +from .regnet import RegNet +from .resnest import ResNeSt +from .resnet import ResNet, ResNetV1d +from .resnext import ResNeXt +from .rsn import RSN +from .scnet import SCNet +from .seresnet import SEResNet +from .seresnext import SEResNeXt +from .shufflenet_v1 import ShuffleNetV1 +from .shufflenet_v2 import ShuffleNetV2 +from .swin import SwinTransformer +from .tcformer import TCFormer +from .tcn import TCN +from .v2v_net import V2VNet +from .vgg import VGG +from .vipnas_mbv3 import ViPNAS_MobileNetV3 +from .vipnas_resnet import ViPNAS_ResNet +from .hrt import HRT +from .vit import ViT + +__all__ = [ + 'AlexNet', 'HourglassNet', 'HourglassAENet', 'HRNet', 'MobileNetV2', + 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', + 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', + 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', + 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', + 'PyramidVisionTransformerV2', 'SwinTransformer', 'I3D', 'TCFormer', 'ViT' +] diff --git a/main/transformer_utils/mmpose/models/backbones/alexnet.py b/main/transformer_utils/mmpose/models/backbones/alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..a8efd74d118f5abe4d9c880ebe80ce7cbd58c6b2 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/alexnet.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +@BACKBONES.register_module() +class AlexNet(BaseBackbone): + """`AlexNet `__ backbone. + + The input for AlexNet is a 224x224 RGB image. + + Args: + num_classes (int): number of classes for classification. + The default value is -1, which uses the backbone as + a feature extractor without the top classifier. 
+ """ + + def __init__(self, num_classes=-1): + super().__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def forward(self, x): + + x = self.features(x) + if self.num_classes > 0: + x = x.view(x.size(0), 256 * 6 * 6) + x = self.classifier(x) + + return x diff --git a/main/transformer_utils/mmpose/models/backbones/base_backbone.py b/main/transformer_utils/mmpose/models/backbones/base_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..8787d944d2233955a96d0446d9ead9f8fd8a6a9c --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/base_backbone.py @@ -0,0 +1,83 @@ +# # Copyright (c) OpenMMLab. All rights reserved. +# import logging +# from abc import ABCMeta, abstractmethod +# +# import torch.nn as nn +# +# from .utils import load_checkpoint +# +# +# class BaseBackbone(nn.Module, metaclass=ABCMeta): +# """Base backbone. +# +# This class defines the basic functions of a backbone. Any backbone that +# inherits this class should at least define its own `forward` function. +# """ +# +# def init_weights(self, pretrained=None): +# """Init backbone weights. +# +# Args: +# pretrained (str | None): If pretrained is a string, then it +# initializes backbone weights by loading the pretrained +# checkpoint. If pretrained is None, then it follows default +# initializer or customized initializer in subclasses. +# """ +# if isinstance(pretrained, str): +# logger = logging.getLogger() +# load_checkpoint(self, pretrained, strict=False, logger=logger) +# elif pretrained is None: +# # use default initializer or customized initializer in subclasses +# pass +# else: +# raise TypeError('pretrained must be a str or None.' +# f' But received {type(pretrained)}.') +# +# @abstractmethod +# def forward(self, x): +# """Forward function. +# +# Args: +# x (Tensor | tuple[Tensor]): x could be a torch.Tensor or a tuple of +# torch.Tensor, containing input data for forward computation. +# """ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from abc import ABCMeta, abstractmethod + +import torch.nn as nn + +from .utils import load_checkpoint +# from mmcv_custom.checkpoint import load_checkpoint + +class BaseBackbone(nn.Module, metaclass=ABCMeta): + """Base backbone. + This class defines the basic functions of a backbone. Any backbone that + inherits this class should at least define its own `forward` function. + """ + + def init_weights(self, pretrained=None, patch_padding='pad'): + """Init backbone weights. + Args: + pretrained (str | None): If pretrained is a string, then it + initializes backbone weights by loading the pretrained + checkpoint. If pretrained is None, then it follows default + initializer or customized initializer in subclasses. 
+ """ + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger, patch_padding=patch_padding) + elif pretrained is None: + # use default initializer or customized initializer in subclasses + pass + else: + raise TypeError('pretrained must be a str or None.' + f' But received {type(pretrained)}.') + + @abstractmethod + def forward(self, x): + """Forward function. + Args: + x (Tensor | tuple[Tensor]): x could be a torch.Tensor or a tuple of + torch.Tensor, containing input data for forward computation. + """ \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/backbones/cpm.py b/main/transformer_utils/mmpose/models/backbones/cpm.py new file mode 100644 index 0000000000000000000000000000000000000000..458245d755f930f4ff625a754aadbab5c13494a6 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/cpm.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint + + +class CpmBlock(nn.Module): + """CpmBlock for Convolutional Pose Machine. + + Args: + in_channels (int): Input channels of this block. + channels (list): Output channels of each conv module. + kernels (list): Kernel sizes of each conv module. + """ + + def __init__(self, + in_channels, + channels=(128, 128, 128), + kernels=(11, 11, 11), + norm_cfg=None): + super().__init__() + + assert len(channels) == len(kernels) + layers = [] + for i in range(len(channels)): + if i == 0: + input_channels = in_channels + else: + input_channels = channels[i - 1] + layers.append( + ConvModule( + input_channels, + channels[i], + kernels[i], + padding=(kernels[i] - 1) // 2, + norm_cfg=norm_cfg)) + self.model = nn.Sequential(*layers) + + def forward(self, x): + """Model forward function.""" + out = self.model(x) + return out + + +@BACKBONES.register_module() +class CPM(BaseBackbone): + """CPM backbone. + + Convolutional Pose Machines. + More details can be found in the `paper + `__ . + + Args: + in_channels (int): The input channels of the CPM. + out_channels (int): The output channels of the CPM. + feat_channels (int): Feature channel of each CPM stage. + middle_channels (int): Feature channel of conv after the middle stage. + num_stages (int): Number of stages. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import CPM + >>> import torch + >>> self = CPM(3, 17) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 368, 368) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + (1, 17, 46, 46) + """ + + def __init__(self, + in_channels, + out_channels, + feat_channels=128, + middle_channels=32, + num_stages=6, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == 3 + + self.num_stages = num_stages + assert self.num_stages >= 1 + + self.stem = nn.Sequential( + ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 32, 5, padding=2, norm_cfg=norm_cfg), + ConvModule(32, 512, 9, padding=4, norm_cfg=norm_cfg), + ConvModule(512, 512, 1, padding=0, norm_cfg=norm_cfg), + ConvModule(512, out_channels, 1, padding=0, act_cfg=None)) + + self.middle = nn.Sequential( + ConvModule(in_channels, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1), + ConvModule(128, 128, 9, padding=4, norm_cfg=norm_cfg), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + + self.cpm_stages = nn.ModuleList([ + CpmBlock( + middle_channels + out_channels, + channels=[feat_channels, feat_channels, feat_channels], + kernels=[11, 11, 11], + norm_cfg=norm_cfg) for _ in range(num_stages - 1) + ]) + + self.middle_conv = nn.ModuleList([ + nn.Sequential( + ConvModule( + 128, middle_channels, 5, padding=2, norm_cfg=norm_cfg)) + for _ in range(num_stages - 1) + ]) + + self.out_convs = nn.ModuleList([ + nn.Sequential( + ConvModule( + feat_channels, + feat_channels, + 1, + padding=0, + norm_cfg=norm_cfg), + ConvModule(feat_channels, out_channels, 1, act_cfg=None)) + for _ in range(num_stages - 1) + ]) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + stage1_out = self.stem(x) + middle_out = self.middle(x) + out_feats = [] + + out_feats.append(stage1_out) + + for ind in range(self.num_stages - 1): + single_stage = self.cpm_stages[ind] + out_conv = self.out_convs[ind] + + inp_feat = torch.cat( + [out_feats[-1], self.middle_conv[ind](middle_out)], 1) + cpm_feat = single_stage(inp_feat) + out_feat = out_conv(cpm_feat) + out_feats.append(out_feat) + + return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hourglass.py b/main/transformer_utils/mmpose/models/backbones/hourglass.py new file mode 100644 index 0000000000000000000000000000000000000000..bf75fad9895ebfd3f3c2a6bffedb3d7e4cc77cba --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/hourglass.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
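In CPM.forward above, the stem produces the stage-1 heatmaps while a separate middle branch extracts shared features; every later stage then refines the previous stage's heatmaps concatenated with those features. A small shape sketch under the docstring's assumptions (17 keypoints, a 368x368 input, middle_channels=32):

import torch

heatmaps = torch.rand(1, 17, 46, 46)   # previous-stage heatmaps (stem branch)
mid_feats = torch.rand(1, 32, 46, 46)  # middle branch output after middle_conv
# matches CpmBlock's expected input of middle_channels + out_channels channels
stage_input = torch.cat([heatmaps, mid_feats], dim=1)
print(stage_input.shape)               # torch.Size([1, 49, 46, 46])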
+import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .resnet import BasicBlock, ResLayer +from .utils import load_checkpoint + + +class HourglassModule(nn.Module): + """Hourglass Module for HourglassNet backbone. + + Generate module recursively and use BasicBlock as the base unit. + + Args: + depth (int): Depth of current HourglassModule. + stage_channels (list[int]): Feature channels of sub-modules in current + and follow-up HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in current and + follow-up HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + depth, + stage_channels, + stage_blocks, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.depth = depth + + cur_block = stage_blocks[0] + next_block = stage_blocks[1] + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ResLayer( + BasicBlock, cur_block, cur_channel, cur_channel, norm_cfg=norm_cfg) + + self.low1 = ResLayer( + BasicBlock, + cur_block, + cur_channel, + next_channel, + stride=2, + norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassModule(depth - 1, stage_channels[1:], + stage_blocks[1:]) + else: + self.low2 = ResLayer( + BasicBlock, + next_block, + next_channel, + next_channel, + norm_cfg=norm_cfg) + + self.low3 = ResLayer( + BasicBlock, + cur_block, + next_channel, + cur_channel, + norm_cfg=norm_cfg, + downsample_first=False) + + self.up2 = nn.Upsample(scale_factor=2) + + def forward(self, x): + """Model forward function.""" + up1 = self.up1(x) + low1 = self.low1(x) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +@BACKBONES.register_module() +class HourglassNet(BaseBackbone): + """HourglassNet backbone. + + Stacked Hourglass Networks for Human Pose Estimation. + More details can be found in the `paper + `__ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (list[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channel (int): Feature channel of conv after a HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import HourglassNet + >>> import torch + >>> self = HourglassNet() + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 256, 128, 128) + (1, 256, 128, 128) + """ + + def __init__(self, + downsample_times=5, + num_stacks=2, + stage_channels=(256, 256, 384, 384, 384, 512), + stage_blocks=(2, 2, 2, 2, 2, 4), + feat_channel=256, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.num_stacks = num_stacks + assert self.num_stacks >= 1 + assert len(stage_channels) == len(stage_blocks) + assert len(stage_channels) > downsample_times + + cur_channel = stage_channels[0] + + self.stem = nn.Sequential( + ConvModule(3, 128, 7, padding=3, stride=2, norm_cfg=norm_cfg), + ResLayer(BasicBlock, 1, 128, 256, stride=2, norm_cfg=norm_cfg)) + + self.hourglass_modules = nn.ModuleList([ + HourglassModule(downsample_times, stage_channels, stage_blocks) + for _ in range(num_stacks) + ]) + + self.inters = ResLayer( + BasicBlock, + num_stacks - 1, + cur_channel, + cur_channel, + norm_cfg=norm_cfg) + + self.conv1x1s = nn.ModuleList([ + ConvModule( + cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.out_convs = nn.ModuleList([ + ConvModule( + cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg) + for _ in range(num_stacks) + ]) + + self.remap_convs = nn.ModuleList([ + ConvModule( + feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.relu = nn.ReLU(inplace=True) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + inter_feat = self.stem(x) + out_feats = [] + + for ind in range(self.num_stacks): + single_hourglass = self.hourglass_modules[ind] + out_conv = self.out_convs[ind] + + hourglass_feat = single_hourglass(inter_feat) + out_feat = out_conv(hourglass_feat) + out_feats.append(out_feat) + + if ind < self.num_stacks - 1: + inter_feat = self.conv1x1s[ind]( + inter_feat) + self.remap_convs[ind]( + out_feat) + inter_feat = self.inters[ind](self.relu(inter_feat)) + + return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py b/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py new file mode 100644 index 0000000000000000000000000000000000000000..5a700e5cb2157fd1dc16771145f065e991b270ea --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/hourglass_ae.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, MaxPool2d, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint + + +class HourglassAEModule(nn.Module): + """Modified Hourglass Module for HourglassNet_AE backbone. + + Generate module recursively and use BasicBlock as the base unit. + + Args: + depth (int): Depth of current HourglassModule. 
+ stage_channels (list[int]): Feature channels of sub-modules in current + and follow-up HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + """ + + def __init__(self, + depth, + stage_channels, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.depth = depth + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ConvModule( + cur_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.pool1 = MaxPool2d(2, 2) + + self.low1 = ConvModule( + cur_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassAEModule(depth - 1, stage_channels[1:]) + else: + self.low2 = ConvModule( + next_channel, next_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.low3 = ConvModule( + next_channel, cur_channel, 3, padding=1, norm_cfg=norm_cfg) + + self.up2 = nn.UpsamplingNearest2d(scale_factor=2) + + def forward(self, x): + """Model forward function.""" + up1 = self.up1(x) + pool1 = self.pool1(x) + low1 = self.low1(pool1) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +@BACKBONES.register_module() +class HourglassAENet(BaseBackbone): + """Hourglass-AE Network proposed by Newell et al. + + Associative Embedding: End-to-End Learning for Joint + Detection and Grouping. + + More details can be found in the `paper + `__ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (list[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channels (int): Feature channel of conv after a HourglassModule. + norm_cfg (dict): Dictionary to construct and config norm layer. + + Example: + >>> from mmpose.models import HourglassAENet + >>> import torch + >>> self = HourglassAENet() + >>> self.eval() + >>> inputs = torch.rand(1, 3, 512, 512) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... 
print(tuple(level_output.shape)) + (1, 34, 128, 128) + """ + + def __init__(self, + downsample_times=4, + num_stacks=1, + out_channels=34, + stage_channels=(256, 384, 512, 640, 768), + feat_channels=256, + norm_cfg=dict(type='BN', requires_grad=True)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + self.num_stacks = num_stacks + assert self.num_stacks >= 1 + assert len(stage_channels) > downsample_times + + cur_channels = stage_channels[0] + + self.stem = nn.Sequential( + ConvModule(3, 64, 7, padding=3, stride=2, norm_cfg=norm_cfg), + ConvModule(64, 128, 3, padding=1, norm_cfg=norm_cfg), + MaxPool2d(2, 2), + ConvModule(128, 128, 3, padding=1, norm_cfg=norm_cfg), + ConvModule(128, feat_channels, 3, padding=1, norm_cfg=norm_cfg), + ) + + self.hourglass_modules = nn.ModuleList([ + nn.Sequential( + HourglassAEModule( + downsample_times, stage_channels, norm_cfg=norm_cfg), + ConvModule( + feat_channels, + feat_channels, + 3, + padding=1, + norm_cfg=norm_cfg), + ConvModule( + feat_channels, + feat_channels, + 3, + padding=1, + norm_cfg=norm_cfg)) for _ in range(num_stacks) + ]) + + self.out_convs = nn.ModuleList([ + ConvModule( + cur_channels, + out_channels, + 1, + padding=0, + norm_cfg=None, + act_cfg=None) for _ in range(num_stacks) + ]) + + self.remap_out_convs = nn.ModuleList([ + ConvModule( + out_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) for _ in range(num_stacks - 1) + ]) + + self.remap_feature_convs = nn.ModuleList([ + ConvModule( + feat_channels, + feat_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) for _ in range(num_stacks - 1) + ]) + + self.relu = nn.ReLU(inplace=True) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Model forward function.""" + inter_feat = self.stem(x) + out_feats = [] + + for ind in range(self.num_stacks): + single_hourglass = self.hourglass_modules[ind] + out_conv = self.out_convs[ind] + + hourglass_feat = single_hourglass(inter_feat) + out_feat = out_conv(hourglass_feat) + out_feats.append(out_feat) + + if ind < self.num_stacks - 1: + inter_feat = inter_feat + self.remap_out_convs[ind]( + out_feat) + self.remap_feature_convs[ind]( + hourglass_feat) + + return out_feats diff --git a/main/transformer_utils/mmpose/models/backbones/hrformer.py b/main/transformer_utils/mmpose/models/backbones/hrformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b843300a9fdb85908678c5a3fd45ce19e97ce2fe --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/hrformer.py @@ -0,0 +1,746 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
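HRFormer, defined next, replaces HRNet's convolutional blocks with local-window self-attention: LocalWindowSelfAttention center-pads the feature map to a multiple of the window size, partitions it into non-overlapping windows, attends within each window, and then reverses the partition. The snippet below sketches just that partition step with assumed shapes (B=2, H=13, W=9, C=32, 7x7 windows):

import math
import torch
from torch.nn.functional import pad

B, H, W, C, Wh, Ww = 2, 13, 9, 32, 7, 7
x = torch.rand(B, H, W, C)

# center-pad H and W up to multiples of the window size
pad_h = math.ceil(H / Wh) * Wh - H
pad_w = math.ceil(W / Ww) * Ww - W
x = pad(x, (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))

# split into non-overlapping Wh x Ww windows: (B * num_windows, Wh*Ww, C)
windows = x.view(B, math.ceil(H / Wh), Wh, math.ceil(W / Ww), Ww, C)
windows = windows.permute(0, 1, 3, 2, 4, 5).reshape(-1, Wh * Ww, C)
print(windows.shape)  # torch.Size([8, 49, 32])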
+ +import math + +import torch +import torch.nn as nn +# from timm.models.layers import to_2tuple, trunc_normal_ +from mmcv.cnn import (build_activation_layer, build_conv_layer, + build_norm_layer, trunc_normal_init) +from mmcv.cnn.bricks.transformer import build_dropout +from mmcv.runner import BaseModule +from torch.nn.functional import pad + +from ..builder import BACKBONES +from .hrnet import Bottleneck, HRModule, HRNet + + +def nlc_to_nchw(x, hw_shape): + """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor. + + Args: + x (Tensor): The input tensor of shape [N, L, C] before conversion. + hw_shape (Sequence[int]): The height and width of output feature map. + + Returns: + Tensor: The output tensor of shape [N, C, H, W] after conversion. + """ + H, W = hw_shape + assert len(x.shape) == 3 + B, L, C = x.shape + assert L == H * W, 'The seq_len doesn\'t match H, W' + return x.transpose(1, 2).reshape(B, C, H, W) + + +def nchw_to_nlc(x): + """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor. + + Args: + x (Tensor): The input tensor of shape [N, C, H, W] before conversion. + + Returns: + Tensor: The output tensor of shape [N, L, C] after conversion. + """ + assert len(x.shape) == 4 + return x.flatten(2).transpose(1, 2).contiguous() + + +def build_drop_path(drop_path_rate): + """Build drop path layer.""" + return build_dropout(dict(type='DropPath', drop_prob=drop_path_rate)) + + +class WindowMSA(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + with_rpe (bool, optional): If True, use relative position bias. + Default: True. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + with_rpe=True, + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + + self.with_rpe = with_rpe + if self.with_rpe: + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros( + (2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def init_weights(self): + trunc_normal_init(self.relative_position_bias_table, std=0.02) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (B*num_windows, N, C) + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. + """ + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.with_rpe: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +class LocalWindowSelfAttention(BaseModule): + r""" Local-window Self Attention (LSA) module with relative position bias. + + This module is the short-range self-attention module in the + Interlaced Sparse Self-Attention `_. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int] | int): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + with_rpe (bool, optional): If True, use relative position bias. 
+ Default: True. + with_pad_mask (bool, optional): If True, mask out the padded tokens in + the attention process. Default: False. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + with_rpe=True, + with_pad_mask=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + if isinstance(window_size, int): + window_size = (window_size, window_size) + self.window_size = window_size + self.with_pad_mask = with_pad_mask + self.attn = WindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=window_size, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate, + with_rpe=with_rpe, + init_cfg=init_cfg) + + def forward(self, x, H, W, **kwargs): + """Forward function.""" + B, N, C = x.shape + x = x.view(B, H, W, C) + Wh, Ww = self.window_size + + # center-pad the feature on H and W axes + pad_h = math.ceil(H / Wh) * Wh - H + pad_w = math.ceil(W / Ww) * Ww - W + x = pad(x, (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2)) + + # permute + x = x.view(B, math.ceil(H / Wh), Wh, math.ceil(W / Ww), Ww, C) + x = x.permute(0, 1, 3, 2, 4, 5) + x = x.reshape(-1, Wh * Ww, C) # (B*num_window, Wh*Ww, C) + + # attention + if self.with_pad_mask and pad_h > 0 and pad_w > 0: + pad_mask = x.new_zeros(1, H, W, 1) + pad_mask = pad( + pad_mask, [ + 0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ], + value=-float('inf')) + pad_mask = pad_mask.view(1, math.ceil(H / Wh), Wh, + math.ceil(W / Ww), Ww, 1) + pad_mask = pad_mask.permute(1, 3, 0, 2, 4, 5) + pad_mask = pad_mask.reshape(-1, Wh * Ww) + pad_mask = pad_mask[:, None, :].expand([-1, Wh * Ww, -1]) + out = self.attn(x, pad_mask, **kwargs) + else: + out = self.attn(x, **kwargs) + + # reverse permutation + out = out.reshape(B, math.ceil(H / Wh), math.ceil(W / Ww), Wh, Ww, C) + out = out.permute(0, 1, 3, 2, 4, 5) + out = out.reshape(B, H + pad_h, W + pad_w, C) + + # de-pad + out = out[:, pad_h // 2:H + pad_h // 2, pad_w // 2:W + pad_w // 2] + return out.reshape(B, N, C) + + +class CrossFFN(BaseModule): + r"""FFN with Depthwise Conv of HRFormer. + + Args: + in_features (int): The feature dimension. + hidden_features (int, optional): The hidden dimension of FFNs. + Defaults: The same as in_features. + act_cfg (dict, optional): Config of activation layer. + Default: dict(type='GELU'). + dw_act_cfg (dict, optional): Config of activation layer appended + right after DW Conv. Default: dict(type='GELU'). + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='SyncBN'). + init_cfg (dict | list | None, optional): The init config. + Default: None. 
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_cfg=dict(type='GELU'),
+                 dw_act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='SyncBN'),
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1)
+        self.act1 = build_activation_layer(act_cfg)
+        self.norm1 = build_norm_layer(norm_cfg, hidden_features)[1]
+        self.dw3x3 = nn.Conv2d(
+            hidden_features,
+            hidden_features,
+            kernel_size=3,
+            stride=1,
+            groups=hidden_features,
+            padding=1)
+        self.act2 = build_activation_layer(dw_act_cfg)
+        self.norm2 = build_norm_layer(norm_cfg, hidden_features)[1]
+        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1)
+        self.act3 = build_activation_layer(act_cfg)
+        self.norm3 = build_norm_layer(norm_cfg, out_features)[1]
+
+        # put the modules together
+        self.layers = [
+            self.fc1, self.norm1, self.act1, self.dw3x3, self.norm2, self.act2,
+            self.fc2, self.norm3, self.act3
+        ]
+
+    def forward(self, x, H, W):
+        """Forward function."""
+        x = nlc_to_nchw(x, (H, W))
+        for layer in self.layers:
+            x = layer(x)
+        x = nchw_to_nlc(x)
+        return x
+
+
+class HRFormerBlock(BaseModule):
+    """High-Resolution Block for HRFormer.
+
+    Args:
+        in_features (int): The input dimension.
+        out_features (int): The output dimension.
+        num_heads (int): The number of heads within each LSA.
+        window_size (int, optional): The window size for the LSA.
+            Default: 7
+        mlp_ratio (int, optional): The expansion ratio of the FFN.
+            Default: 4
+        act_cfg (dict, optional): Config of activation layer.
+            Default: dict(type='GELU').
+        norm_cfg (dict, optional): Config of norm layer.
+            Default: dict(type='SyncBN').
+        transformer_norm_cfg (dict, optional): Config of transformer norm
+            layer. Default: dict(type='LN', eps=1e-6).
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
+ """ + + expansion = 1 + + def __init__(self, + in_features, + out_features, + num_heads, + window_size=7, + mlp_ratio=4.0, + drop_path=0.0, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN'), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + init_cfg=None, + **kwargs): + super(HRFormerBlock, self).__init__(init_cfg=init_cfg) + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + + self.norm1 = build_norm_layer(transformer_norm_cfg, in_features)[1] + self.attn = LocalWindowSelfAttention( + in_features, + num_heads=num_heads, + window_size=window_size, + init_cfg=None, + **kwargs) + + self.norm2 = build_norm_layer(transformer_norm_cfg, out_features)[1] + self.ffn = CrossFFN( + in_features=in_features, + hidden_features=int(in_features * mlp_ratio), + out_features=out_features, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dw_act_cfg=act_cfg, + init_cfg=None) + + self.drop_path = build_drop_path( + drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x): + """Forward function.""" + B, C, H, W = x.size() + # Attention + x = x.view(B, C, -1).permute(0, 2, 1) + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + # FFN + x = x + self.drop_path(self.ffn(self.norm2(x), H, W)) + x = x.permute(0, 2, 1).view(B, C, H, W) + return x + + def extra_repr(self): + """(Optional) Set the extra information about this module.""" + return 'num_heads={}, window_size={}, mlp_ratio={}'.format( + self.num_heads, self.window_size, self.mlp_ratio) + + +class HRFomerModule(HRModule): + """High-Resolution Module for HRFormer. + + Args: + num_branches (int): The number of branches in the HRFormerModule. + block (nn.Module): The building block of HRFormer. + The block should be the HRFormerBlock. + num_blocks (tuple): The number of blocks in each branch. + The length must be equal to num_branches. + num_inchannels (tuple): The number of input channels in each branch. + The length must be equal to num_branches. + num_channels (tuple): The number of channels in each branch. + The length must be equal to num_branches. + num_heads (tuple): The number of heads within the LSAs. + num_window_sizes (tuple): The window size for the LSAs. + num_mlp_ratios (tuple): The expansion ratio for the FFNs. + drop_path (int, optional): The drop path rate of HRFomer. + Default: 0.0 + multiscale_output (bool, optional): Whether to output multi-level + features produced by multiple branches. If False, only the first + level feature will be output. Default: True. + conv_cfg (dict, optional): Config of the conv layers. + Default: None. + norm_cfg (dict, optional): Config of the norm layers appended + right after conv. Default: dict(type='SyncBN', requires_grad=True) + transformer_norm_cfg (dict, optional): Config of the norm layers. + Default: dict(type='LN', eps=1e-6) + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False + upsample_cfg(dict, optional): The config of upsample layers in fuse + layers. 
Default: dict(mode='bilinear', align_corners=False) + """ + + def __init__(self, + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + multiscale_output=True, + drop_paths=0.0, + with_rpe=True, + with_pad_mask=False, + conv_cfg=None, + norm_cfg=dict(type='SyncBN', requires_grad=True), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + with_cp=False, + upsample_cfg=dict(mode='bilinear', align_corners=False)): + + self.transformer_norm_cfg = transformer_norm_cfg + self.drop_paths = drop_paths + self.num_heads = num_heads + self.num_window_sizes = num_window_sizes + self.num_mlp_ratios = num_mlp_ratios + self.with_rpe = with_rpe + self.with_pad_mask = with_pad_mask + + super().__init__(num_branches, block, num_blocks, num_inchannels, + num_channels, multiscale_output, with_cp, conv_cfg, + norm_cfg, upsample_cfg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Build one branch.""" + # HRFormerBlock does not support down sample layer yet. + assert stride == 1 and self.in_channels[branch_index] == num_channels[ + branch_index] + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + num_heads=self.num_heads[branch_index], + window_size=self.num_window_sizes[branch_index], + mlp_ratio=self.num_mlp_ratios[branch_index], + drop_path=self.drop_paths[0], + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + init_cfg=None, + with_rpe=self.with_rpe, + with_pad_mask=self.with_pad_mask)) + + self.in_channels[ + branch_index] = self.in_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + num_heads=self.num_heads[branch_index], + window_size=self.num_window_sizes[branch_index], + mlp_ratio=self.num_mlp_ratios[branch_index], + drop_path=self.drop_paths[i], + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + init_cfg=None, + with_rpe=self.with_rpe, + with_pad_mask=self.with_pad_mask)) + return nn.Sequential(*layers) + + def _make_fuse_layers(self): + """Build fuse layers.""" + if self.num_branches == 1: + return None + num_branches = self.num_branches + num_inchannels = self.in_channels + fuse_layers = [] + for i in range(num_branches if self.multiscale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_inchannels[i], + kernel_size=1, + stride=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_inchannels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), + mode=self.upsample_cfg['mode'], + align_corners=self. 
+ upsample_cfg['align_corners']))) + elif j == i: + fuse_layer.append(None) + else: + conv3x3s = [] + for k in range(i - j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_inchannels[i] + with_out_act = False + else: + num_outchannels_conv3x3 = num_inchannels[j] + with_out_act = True + sub_modules = [ + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_inchannels[j], + kernel_size=3, + stride=2, + padding=1, + groups=num_inchannels[j], + bias=False, + ), + build_norm_layer(self.norm_cfg, + num_inchannels[j])[1], + build_conv_layer( + self.conv_cfg, + num_inchannels[j], + num_outchannels_conv3x3, + kernel_size=1, + stride=1, + bias=False, + ), + build_norm_layer(self.norm_cfg, + num_outchannels_conv3x3)[1] + ] + if with_out_act: + sub_modules.append(nn.ReLU(False)) + conv3x3s.append(nn.Sequential(*sub_modules)) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_inchannels(self): + """Return the number of input channels.""" + return self.in_channels + + +@BACKBONES.register_module() +class HRFormer(HRNet): + """HRFormer backbone. + + This backbone is the implementation of `HRFormer: High-Resolution + Transformer for Dense Prediction `_. + + Args: + extra (dict): Detailed configuration for each stage of HRNet. + There must be 4 stages, the configuration for each stage must have + 5 keys: + + - num_modules (int): The number of HRModule in this stage. + - num_branches (int): The number of branches in the HRModule. + - block (str): The type of block. + - num_blocks (tuple): The number of blocks in each branch. + The length must be equal to num_branches. + - num_channels (tuple): The number of channels in each branch. + The length must be equal to num_branches. + in_channels (int): Number of input image channels. Normally 3. + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): Config of norm layer. + Use `SyncBN` by default. + transformer_norm_cfg (dict): Config of transformer norm layer. + Use `LN` by default. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. 
+ Example: + >>> from mmpose.models import HRFormer + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(2, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='HRFORMER', + >>> window_sizes=(7, 7), + >>> num_heads=(1, 2), + >>> mlp_ratios=(4, 4), + >>> num_blocks=(2, 2), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='HRFORMER', + >>> window_sizes=(7, 7, 7), + >>> num_heads=(1, 2, 4), + >>> mlp_ratios=(4, 4, 4), + >>> num_blocks=(2, 2, 2), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=2, + >>> num_branches=4, + >>> block='HRFORMER', + >>> window_sizes=(7, 7, 7, 7), + >>> num_heads=(1, 2, 4, 8), + >>> mlp_ratios=(4, 4, 4, 4), + >>> num_blocks=(2, 2, 2, 2), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRFormer(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + (1, 64, 4, 4) + (1, 128, 2, 2) + (1, 256, 1, 1) + """ + + blocks_dict = {'BOTTLENECK': Bottleneck, 'HRFORMERBLOCK': HRFormerBlock} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + transformer_norm_cfg=dict(type='LN', eps=1e-6), + norm_eval=False, + with_cp=False, + zero_init_residual=False, + frozen_stages=-1): + + # stochastic depth + depths = [ + extra[stage]['num_blocks'][0] * extra[stage]['num_modules'] + for stage in ['stage2', 'stage3', 'stage4'] + ] + depth_s2, depth_s3, _ = depths + drop_path_rate = extra['drop_path_rate'] + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] + extra['stage2']['drop_path_rates'] = dpr[0:depth_s2] + extra['stage3']['drop_path_rates'] = dpr[depth_s2:depth_s2 + depth_s3] + extra['stage4']['drop_path_rates'] = dpr[depth_s2 + depth_s3:] + + # HRFormer use bilinear upsample as default + upsample_cfg = extra.get('upsample', { + 'mode': 'bilinear', + 'align_corners': False + }) + extra['upsample'] = upsample_cfg + self.transformer_norm_cfg = transformer_norm_cfg + self.with_rpe = extra.get('with_rpe', True) + self.with_pad_mask = extra.get('with_pad_mask', False) + + super().__init__(extra, in_channels, conv_cfg, norm_cfg, norm_eval, + with_cp, zero_init_residual, frozen_stages) + + def _make_stage(self, + layer_config, + num_inchannels, + multiscale_output=True): + """Make each stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + num_heads = layer_config['num_heads'] + num_window_sizes = layer_config['window_sizes'] + num_mlp_ratios = layer_config['mlp_ratios'] + drop_path_rates = layer_config['drop_path_rates'] + + modules = [] + for i in range(num_modules): + # multiscale_output is only used at the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + modules.append( + HRFomerModule( + num_branches, + block, + num_blocks, + num_inchannels, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + reset_multiscale_output, + drop_paths=drop_path_rates[num_blocks[0] * + i:num_blocks[0] * (i + 1)], + with_rpe=self.with_rpe, + 
with_pad_mask=self.with_pad_mask, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + transformer_norm_cfg=self.transformer_norm_cfg, + with_cp=self.with_cp, + upsample_cfg=self.upsample_cfg)) + num_inchannels = modules[-1].get_num_inchannels() + + return nn.Sequential(*modules), num_inchannels diff --git a/main/transformer_utils/mmpose/models/backbones/hrnet.py b/main/transformer_utils/mmpose/models/backbones/hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..87dc8cef555b5e8d78fcc69293047b0cbe2ea8a6 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/hrnet.py @@ -0,0 +1,604 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .resnet import BasicBlock, Bottleneck, get_expansion +from .utils import load_checkpoint + + +class HRModule(nn.Module): + """High-Resolution Module for HRNet. + + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=False, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + upsample_cfg=dict(mode='nearest', align_corners=None)): + + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.upsample_cfg = upsample_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=True) + + @staticmethod + def _check_branches(num_branches, num_blocks, in_channels, num_channels): + """Check input to avoid ValueError.""" + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_BLOCKS({len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_CHANNELS({len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Make one branch.""" + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * get_expansion(block): + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * get_expansion(block), + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer( + self.norm_cfg, + num_channels[branch_index] * get_expansion(block))[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index] * get_expansion(block), + stride=stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * get_expansion(block) + for _ in range(1, 
num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index] * get_expansion(block), + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + """Make branches.""" + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Make fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), + mode=self.upsample_cfg['mode'], + align_corners=self. + upsample_cfg['align_corners']))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=True))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class HRNet(nn.Module): + """HRNet backbone. + + `High-Resolution Representations for Labeling Pixels and Regions + `__ + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. 
+ + Example: + >>> from mmpose.models import HRNet + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(4, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='BASIC', + >>> num_blocks=(4, 4), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=3, + >>> num_branches=4, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4, 4), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=False, + with_cp=False, + zero_init_residual=False, + frozen_stages=-1): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + self.frozen_stages = frozen_stages + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + 64, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + self.upsample_cfg = self.extra.get('upsample', { + 'mode': 'nearest', + 'align_corners': None + }) + + # stage 1 + self.stage1_cfg = self.extra['stage1'] + num_channels = self.stage1_cfg['num_channels'][0] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * get_expansion(block) + self.layer1 = self._make_layer(block, 64, stage1_out_channels, + num_blocks) + + # stage 2 + self.stage2_cfg = self.extra['stage2'] + num_channels = self.stage2_cfg['num_channels'] + block_type = self.stage2_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition1 = self._make_transition_layer([stage1_out_channels], + num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + # stage 3 + self.stage3_cfg = self.extra['stage3'] + num_channels = self.stage3_cfg['num_channels'] + block_type = self.stage3_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition2 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + # stage 4 + self.stage4_cfg = self.extra['stage4'] + num_channels = self.stage4_cfg['num_channels'] + block_type = self.stage4_cfg['block'] + + block = self.blocks_dict[block_type] + 
num_channels = [ + channel * get_expansion(block) for channel in num_channels + ] + self.transition3 = self._make_transition_layer(pre_stage_channels, + num_channels) + + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, + num_channels, + multiscale_output=self.stage4_cfg.get('multiscale_output', False)) + + self._freeze_stages() + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, in_channels, out_channels, blocks, stride=1): + """Make layer.""" + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1]) + + layers = [] + layers.append( + block( + in_channels, + out_channels, + stride=stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + for _ in range(1, blocks): + layers.append( + block( + out_channels, + out_channels, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + """Make stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + upsample_cfg=self.upsample_cfg)) + + in_channels = hr_modules[-1].in_channels + + return nn.Sequential(*hr_modules), in_channels + + def 
_freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.norm1.eval() + self.norm2.eval() + + for m in [self.conv1, self.norm1, self.conv2, self.norm2]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + if i == 1: + m = getattr(self, 'layer1') + else: + m = getattr(self, f'stage{i}') + + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if i < 4: + m = getattr(self, f'transition{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/hrt.py b/main/transformer_utils/mmpose/models/backbones/hrt.py new file mode 100644 index 0000000000000000000000000000000000000000..67be3d4429d03360698701b7cd6e67e7c7a0b4ad --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/hrt.py @@ -0,0 +1,676 @@ +# -------------------------------------------------------- +# High Resolution Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Rao Fu, RainbowSecret +# -------------------------------------------------------- + +import pdb +import torch +import torch.nn as nn +from mmcv.cnn import ( + build_conv_layer, + build_norm_layer, + constant_init, + kaiming_init, + normal_init, +) +# from mmcv.runner import load_checkpoint +from .hrt_checkpoint import load_checkpoint +from mmcv.runner.checkpoint import load_state_dict +from mmcv.utils.parrots_wrapper import _BatchNorm + +from mmpose.models.utils.ops import resize +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .modules.bottleneck_block import Bottleneck +from .modules.transformer_block 
import GeneralTransformerBlock + + +class HighResolutionTransformerModule(nn.Module): + def __init__( + self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type="BN", requires_grad=True), + num_heads=None, + num_window_sizes=None, + num_mlp_ratios=None, + drop_paths=0.0, + ): + super(HighResolutionTransformerModule, self).__init__() + self._check_branches(num_branches, num_blocks, in_channels, num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches( + num_branches, + blocks, + num_blocks, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + drop_paths, + ) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=True) + + # MHSA parameters + self.num_heads = num_heads + self.num_window_sizes = num_window_sizes + self.num_mlp_ratios = num_mlp_ratios + + def _check_branches(self, num_branches, num_blocks, in_channels, num_channels): + logger = get_root_logger() + if num_branches != len(num_blocks): + error_msg = "NUM_BRANCHES({}) <> NUM_BLOCKS({})".format( + num_branches, len(num_blocks) + ) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = "NUM_BRANCHES({}) <> NUM_CHANNELS({})".format( + num_branches, len(num_channels) + ) + logger.error(error_msg) + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = "NUM_BRANCHES({}) <> IN_CHANNELS({})".format( + num_branches, len(in_channels) + ) + logger.error(error_msg) + raise ValueError(error_msg) + + def _make_one_branch( + self, + branch_index, + block, + num_blocks, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + drop_paths, + stride=1, + ): + """Make one branch.""" + downsample = None + if ( + stride != 1 + or self.in_channels[branch_index] + != num_channels[branch_index] * block.expansion + ): + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + ), + build_norm_layer( + self.norm_cfg, num_channels[branch_index] * block.expansion + )[1], + ) + + layers = [] + + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + num_heads=num_heads[branch_index], + window_size=num_window_sizes[branch_index], + mlp_ratio=num_mlp_ratios[branch_index], + drop_path=drop_paths[0], + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + ) + ) + self.in_channels[branch_index] = num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + num_heads=num_heads[branch_index], + window_size=num_window_sizes[branch_index], + mlp_ratio=num_mlp_ratios[branch_index], + drop_path=drop_paths[i], + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + ) + ) + + return nn.Sequential(*layers) + + def _make_branches( + self, + num_branches, + block, + num_blocks, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + drop_paths, + ): + """Make branches.""" + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch( + i, + block, + num_blocks, + num_channels, + num_heads, + num_window_sizes, + num_mlp_ratios, + 
drop_paths, + ) + ) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Build fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False, + ), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2 ** (j - i), + mode="bilinear", + align_corners=False, + ), + ) + ) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=in_channels[j], + bias=False, + ), + build_norm_layer(self.norm_cfg, in_channels[j])[1], + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + bias=False, + ), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + ) + ) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=in_channels[j], + bias=False, + ), + build_norm_layer(self.norm_cfg, in_channels[j])[1], + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=1, + stride=1, + bias=False, + ), + build_norm_layer(self.norm_cfg, in_channels[j])[1], + nn.ReLU(inplace=True), + ) + ) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y += x[j] + elif j > i: + y = y + resize( + self.fuse_layers[i][j](x[j]), + size=x[i].shape[2:], + mode="bilinear", + align_corners=False, + ) + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class HRT(nn.Module): + """HRT backbone. 
+ High Resolution Transformer Backbone + """ + + blocks_dict = { + "BOTTLENECK": Bottleneck, + "TRANSFORMER_BLOCK": GeneralTransformerBlock, + } + + def __init__( + self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type="BN", requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=False, + ): + super(HRT, self).__init__() + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False, + ) + self.add_module(self.norm1_name, norm1) + + self.conv2 = build_conv_layer( + self.conv_cfg, 64, 64, kernel_size=3, stride=2, padding=1, bias=False + ) + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + # generat drop path rate list + depth_s2 = ( + self.extra["stage2"]["num_blocks"][0] * self.extra["stage2"]["num_modules"] + ) + depth_s3 = ( + self.extra["stage3"]["num_blocks"][0] * self.extra["stage3"]["num_modules"] + ) + depth_s4 = ( + self.extra["stage4"]["num_blocks"][0] * self.extra["stage4"]["num_modules"] + ) + depths = [depth_s2, depth_s3, depth_s4] + drop_path_rate = self.extra["drop_path_rate"] + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] + + logger = get_root_logger() + logger.info(dpr) + + # stage 1 + self.stage1_cfg = self.extra["stage1"] + num_channels = self.stage1_cfg["num_channels"][0] + block_type = self.stage1_cfg["block"] + num_blocks = self.stage1_cfg["num_blocks"][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * block.expansion + self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) + + # stage 2 + self.stage2_cfg = self.extra["stage2"] + num_channels = self.stage2_cfg["num_channels"] + block_type = self.stage2_cfg["block"] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition1 = self._make_transition_layer( + [stage1_out_channels], num_channels + ) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels, drop_paths=dpr[0:depth_s2] + ) + + # stage 3 + self.stage3_cfg = self.extra["stage3"] + num_channels = self.stage3_cfg["num_channels"] + block_type = self.stage3_cfg["block"] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, + num_channels, + drop_paths=dpr[depth_s2 : depth_s2 + depth_s3], + ) + + # stage 4 + self.stage4_cfg = self.extra["stage4"] + num_channels = self.stage4_cfg["num_channels"] + block_type = self.stage4_cfg["block"] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, + num_channels, + multiscale_output=self.stage4_cfg.get("multiscale_output", True), + drop_paths=dpr[depth_s2 + depth_s3 :], + ) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + 
return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False, + ), + build_norm_layer(self.norm_cfg, num_channels_cur_layer[i])[ + 1 + ], + nn.ReLU(inplace=True), + ) + ) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = ( + num_channels_cur_layer[i] + if j == i - num_branches_pre + else in_channels + ) + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False, + ), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True), + ) + ) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer( + self, + block, + inplanes, + planes, + blocks, + stride=1, + num_heads=1, + window_size=7, + mlp_ratio=4.0, + ): + """Make each layer.""" + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + ), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1], + ) + + layers = [] + if isinstance(block, GeneralTransformerBlock): + layers.append( + block( + inplanes, + planes, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + ) + ) + else: + layers.append( + block( + inplanes, + planes, + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + ) + ) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + ) + ) + + return nn.Sequential(*layers) + + def _make_stage( + self, layer_config, in_channels, multiscale_output=True, drop_paths=0.0 + ): + """Make each stage.""" + num_modules = layer_config["num_modules"] + num_branches = layer_config["num_branches"] + num_blocks = layer_config["num_blocks"] + num_channels = layer_config["num_channels"] + block = self.blocks_dict[layer_config["block"]] + + num_heads = layer_config["num_heads"] + num_window_sizes = layer_config["num_window_sizes"] + num_mlp_ratios = layer_config["num_mlp_ratios"] + + hr_modules = [] + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HighResolutionTransformerModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + 
num_heads=num_heads, + num_window_sizes=num_window_sizes, + num_mlp_ratios=num_mlp_ratios, + drop_paths=drop_paths[num_blocks[0] * i : num_blocks[0] * (i + 1)], + ) + ) + + return nn.Sequential(*hr_modules), in_channels + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + ckpt = load_checkpoint(self, pretrained, strict=False) + if "model" in ckpt: + msg = self.load_state_dict(ckpt["model"], strict=False) + logger.info(msg) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + """mmseg: kaiming_init(m)""" + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError("pretrained must be a str or None") + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg["num_branches"]): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg["num_branches"]): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg["num_branches"]): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode.""" + super(HRT, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py b/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e27749d45ad2e1b24e50de8b85af90b4464e91ba --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/hrt_checkpoint.py @@ -0,0 +1,500 @@ +# Copyright (c) Open-MMLab. All rights reserved. +import io +import os +import os.path as osp +import pkgutil +import time +import warnings +from collections import OrderedDict +from importlib import import_module +from tempfile import TemporaryDirectory + +import torch +import torchvision +from torch.optim import Optimizer +from torch.utils import model_zoo +from torch.nn import functional as F + +import mmcv +from mmcv.fileio import FileClient +from mmcv.fileio import load as load_file +from mmcv.parallel import is_module_wrapper +from mmcv.utils import mkdir_or_exist +from mmcv.runner import get_dist_info + +ENV_MMCV_HOME = 'MMCV_HOME' +ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' +DEFAULT_CACHE_DIR = '~/.cache' + + +def _get_mmcv_home(): + mmcv_home = os.path.expanduser( + os.getenv( + ENV_MMCV_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) + + mkdir_or_exist(mmcv_home) + return mmcv_home + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict to a module. 
+ + This method is modified from :meth:`torch.nn.Module.load_state_dict`. + Default value for ``strict`` is set to ``False`` and the message for + param mismatch will be shown even if strict is False. + + Args: + module (Module): Module that receives the state_dict. + state_dict (OrderedDict): Weights. + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. + logger (:obj:`logging.Logger`, optional): Logger to log the error + message. If not specified, print function will be used. + """ + unexpected_keys = [] + all_missing_keys = [] + err_msg = [] + + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + # use _load_from_state_dict to enable checkpoint version control + def load(module, prefix=''): + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_module_wrapper(module): + module = module.module + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, True, + all_missing_keys, unexpected_keys, + err_msg) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(module) + load = None # break load->load reference cycle + + # ignore "num_batches_tracked" of BN layers + missing_keys = [ + key for key in all_missing_keys if 'num_batches_tracked' not in key + ] + + if unexpected_keys: + err_msg.append('unexpected key in source ' + f'state_dict: {", ".join(unexpected_keys)}\n') + if missing_keys: + err_msg.append( + f'missing keys in source state_dict: {", ".join(missing_keys)}\n') + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert( + 0, 'The model and loaded state dict do not match exactly\n') + err_msg = '\n'.join(err_msg) + if strict: + raise RuntimeError(err_msg) + elif logger is not None: + logger.warning(err_msg) + else: + print(err_msg) + + +def load_url_dist(url, model_dir=None): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + if rank == 0: + checkpoint = model_zoo.load_url(url, model_dir=model_dir) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + checkpoint = model_zoo.load_url(url, model_dir=model_dir) + return checkpoint + + +def load_pavimodel_dist(model_path, map_location=None): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + try: + from pavi import modelcloud + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + if rank == 0: + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load(downloaded_file, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load( + downloaded_file, map_location=map_location) + return checkpoint + + +def 
load_fileclient_dist(filename, backend, map_location): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + allowed_backends = ['ceph'] + if backend not in allowed_backends: + raise ValueError(f'Load from Backend {backend} is not supported.') + if rank == 0: + fileclient = FileClient(backend=backend) + buffer = io.BytesIO(fileclient.get(filename)) + checkpoint = torch.load(buffer, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + fileclient = FileClient(backend=backend) + buffer = io.BytesIO(fileclient.get(filename)) + checkpoint = torch.load(buffer, map_location=map_location) + return checkpoint + + +def get_torchvision_models(): + model_urls = dict() + for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module(f'torchvision.models.{name}') + if hasattr(_zoo, 'model_urls'): + _urls = getattr(_zoo, 'model_urls') + model_urls.update(_urls) + return model_urls + + +def get_external_models(): + mmcv_home = _get_mmcv_home() + default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') + default_urls = load_file(default_json_path) + assert isinstance(default_urls, dict) + external_json_path = osp.join(mmcv_home, 'open_mmlab.json') + if osp.exists(external_json_path): + external_urls = load_file(external_json_path) + assert isinstance(external_urls, dict) + default_urls.update(external_urls) + + return default_urls + + +def get_mmcls_models(): + mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') + mmcls_urls = load_file(mmcls_json_path) + + return mmcls_urls + + +def get_deprecated_model_names(): + deprecate_json_path = osp.join(mmcv.__path__[0], + 'model_zoo/deprecated.json') + deprecate_urls = load_file(deprecate_json_path) + assert isinstance(deprecate_urls, dict) + + return deprecate_urls + + +def _process_mmcls_checkpoint(checkpoint): + state_dict = checkpoint['state_dict'] + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k.startswith('backbone.'): + new_state_dict[k[9:]] = v + new_checkpoint = dict(state_dict=new_state_dict) + + return new_checkpoint + + +def _load_checkpoint(filename, map_location=None): + """Load checkpoint from somewhere (modelzoo, file, url). + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str | None): Same as :func:`torch.load`. Default: None. + + Returns: + dict | OrderedDict: The loaded checkpoint. It can be either an + OrderedDict storing model weights or a dict containing other + information, which depends on the checkpoint. 
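+
+    Example (illustrative sketch, not a tested doctest; the local path is
+    hypothetical):
+        >>> # modelzoo:// (deprecated), torchvision://, open-mmlab://,
+        >>> # mmcls://, http(s)://, pavi:// and s3:// are dispatched in the
+        >>> # body below; anything else is treated as a local file path
+        >>> ckpt = _load_checkpoint('torchvision://resnet50')
+        >>> ckpt = _load_checkpoint('work_dirs/epoch_10.pth',
+        ...                         map_location='cpu')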
+ """ + if filename.startswith('modelzoo://'): + warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' + 'use "torchvision://" instead') + model_urls = get_torchvision_models() + model_name = filename[11:] + checkpoint = load_url_dist(model_urls[model_name]) + elif filename.startswith('torchvision://'): + model_urls = get_torchvision_models() + model_name = filename[14:] + checkpoint = load_url_dist(model_urls[model_name]) + elif filename.startswith('open-mmlab://'): + model_urls = get_external_models() + model_name = filename[13:] + deprecated_urls = get_deprecated_model_names() + if model_name in deprecated_urls: + warnings.warn(f'open-mmlab://{model_name} is deprecated in favor ' + f'of open-mmlab://{deprecated_urls[model_name]}') + model_name = deprecated_urls[model_name] + model_url = model_urls[model_name] + # check if is url + if model_url.startswith(('http://', 'https://')): + checkpoint = load_url_dist(model_url) + else: + filename = osp.join(_get_mmcv_home(), model_url) + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + elif filename.startswith('mmcls://'): + model_urls = get_mmcls_models() + model_name = filename[8:] + checkpoint = load_url_dist(model_urls[model_name]) + checkpoint = _process_mmcls_checkpoint(checkpoint) + elif filename.startswith(('http://', 'https://')): + checkpoint = load_url_dist(filename) + elif filename.startswith('pavi://'): + model_path = filename[7:] + checkpoint = load_pavimodel_dist(model_path, map_location=map_location) + elif filename.startswith('s3://'): + checkpoint = load_fileclient_dist( + filename, backend='ceph', map_location=map_location) + else: + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +def load_checkpoint(model, + filename, + map_location='cpu', + strict=False, + logger=None): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + + Returns: + dict or OrderedDict: The loaded checkpoint. 
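+
+    Example (illustrative sketch, not a tested doctest; the backbone and the
+    checkpoint path are hypothetical):
+        >>> model = HRT(extra)                      # any nn.Module works
+        >>> ckpt = load_checkpoint(model, 'work_dirs/hrt_base.pth',
+        ...                        map_location='cpu', strict=False)
+        >>> # 'state_dict'/'model' sub-dicts, 'module.' and MoBY 'encoder.'
+        >>> # prefixes are handled below before the weights are copied in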
+ """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith('module.'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # for MoBY, load model of online branch + if sorted(list(state_dict.keys()))[0].startswith('encoder'): + state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')} + + # reshape absolute position embedding + if state_dict.get('absolute_pos_embed') is not None: + absolute_pos_embed = state_dict['absolute_pos_embed'] + N1, L, C1 = absolute_pos_embed.size() + N2, C2, H, W = model.absolute_pos_embed.size() + if N1 != N2 or C1 != C2 or L != H*W: + logger.warning("Error in loading absolute_pos_embed, pass") + else: + state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2) + + # interpolate position bias table if needed + # relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k] + # for table_key in relative_position_bias_table_keys: + # table_pretrained = state_dict[table_key] + # table_current = model.state_dict()[table_key] + # L1, nH1 = table_pretrained.size() + # L2, nH2 = table_current.size() + # if nH1 != nH2: + # logger.warning(f"Error in loading {table_key}, pass") + # else: + # if L1 != L2: + # S1 = int(L1 ** 0.5) + # S2 = int(L2 ** 0.5) + # table_pretrained_resized = F.interpolate( + # table_pretrained.permute(1, 0).view(1, nH1, S1, S1), + # size=(S2, S2), mode='bicubic') + # state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0) + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + + +def weights_to_cpu(state_dict): + """Copy a model state_dict to cpu. + + Args: + state_dict (OrderedDict): Model weights on GPU. + + Returns: + OrderedDict: Model weights on GPU. + """ + state_dict_cpu = OrderedDict() + for key, val in state_dict.items(): + state_dict_cpu[key] = val.cpu() + return state_dict_cpu + + +def _save_to_state_dict(module, destination, prefix, keep_vars): + """Saves module state to `destination` dictionary. + + This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. + + Args: + module (nn.Module): The module to generate state_dict. + destination (dict): A dict where state will be stored. + prefix (str): The prefix for parameters and buffers used in this + module. + """ + for name, param in module._parameters.items(): + if param is not None: + destination[prefix + name] = param if keep_vars else param.detach() + for name, buf in module._buffers.items(): + # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d + if buf is not None: + destination[prefix + name] = buf if keep_vars else buf.detach() + + +def get_state_dict(module, destination=None, prefix='', keep_vars=False): + """Returns a dictionary containing a whole state of the module. + + Both parameters and persistent buffers (e.g. running averages) are + included. Keys are corresponding parameter and buffer names. 
+ + This method is modified from :meth:`torch.nn.Module.state_dict` to + recursively check parallel module in case that the model has a complicated + structure, e.g., nn.Module(nn.Module(DDP)). + + Args: + module (nn.Module): The module to generate state_dict. + destination (OrderedDict): Returned dict for the state of the + module. + prefix (str): Prefix of the key. + keep_vars (bool): Whether to keep the variable property of the + parameters. Default: False. + + Returns: + dict: A dictionary containing a whole state of the module. + """ + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_module_wrapper(module): + module = module.module + + # below is the same as torch.nn.Module.state_dict() + if destination is None: + destination = OrderedDict() + destination._metadata = OrderedDict() + destination._metadata[prefix[:-1]] = local_metadata = dict( + version=module._version) + _save_to_state_dict(module, destination, prefix, keep_vars) + for name, child in module._modules.items(): + if child is not None: + get_state_dict( + child, destination, prefix + name + '.', keep_vars=keep_vars) + for hook in module._state_dict_hooks.values(): + hook_result = hook(module, destination, prefix, local_metadata) + if hook_result is not None: + destination = hook_result + return destination + + +def save_checkpoint(model, filename, optimizer=None, meta=None): + """Save checkpoint to file. + + The checkpoint will have 3 fields: ``meta``, ``state_dict`` and + ``optimizer``. By default ``meta`` will contain version and time info. + + Args: + model (Module): Module whose params are to be saved. + filename (str): Checkpoint filename. + optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. + meta (dict, optional): Metadata to be saved in checkpoint. 
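+
+    Example (illustrative sketch, not a tested doctest; the file name is
+    arbitrary):
+        >>> save_checkpoint(model, 'work_dirs/epoch_10.pth',
+        ...                 optimizer=optimizer,
+        ...                 meta=dict(epoch=10, iter=5000))
+        >>> # the parent directory is created on the fly and the optimizer
+        >>> # state is stored under the 'optimizer' key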
+ """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError(f'meta must be a dict or None, but got {type(meta)}') + meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) + + if is_module_wrapper(model): + model = model.module + + if hasattr(model, 'CLASSES') and model.CLASSES is not None: + # save class name to the meta + meta.update(CLASSES=model.CLASSES) + + checkpoint = { + 'meta': meta, + 'state_dict': weights_to_cpu(get_state_dict(model)) + } + # save optimizer state dict in the checkpoint + if isinstance(optimizer, Optimizer): + checkpoint['optimizer'] = optimizer.state_dict() + elif isinstance(optimizer, dict): + checkpoint['optimizer'] = {} + for name, optim in optimizer.items(): + checkpoint['optimizer'][name] = optim.state_dict() + + if filename.startswith('pavi://'): + try: + from pavi import modelcloud + from pavi.exception import NodeNotFoundError + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + model_path = filename[7:] + root = modelcloud.Folder() + model_dir, model_name = osp.split(model_path) + try: + model = modelcloud.get(model_dir) + except NodeNotFoundError: + model = root.create_training_model(model_dir) + with TemporaryDirectory() as tmp_dir: + checkpoint_file = osp.join(tmp_dir, model_name) + with open(checkpoint_file, 'wb') as f: + torch.save(checkpoint, f) + f.flush() + model.create_file(checkpoint_file, name=model_name) + else: + mmcv.mkdir_or_exist(osp.dirname(filename)) + # immediately flush buffer + with open(filename, 'wb') as f: + torch.save(checkpoint, f) + f.flush() \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/backbones/i3d.py b/main/transformer_utils/mmpose/models/backbones/i3d.py new file mode 100644 index 0000000000000000000000000000000000000000..64f330abac1facc16db743ef3ffbcd23248d6865 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/i3d.py @@ -0,0 +1,215 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Code is modified from `Third-party pytorch implementation of i3d +# `. + +import torch +import torch.nn as nn + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class Conv3dBlock(nn.Module): + """Basic 3d convolution block for I3D. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (float): The multiplier of in_channels and out_channels. + Default: 1. + kernel_size (tuple[int]): kernel size of the 3d convolution layer. + Default: (1, 1, 1). + stride (tuple[int]): stride of the block. Default: (1, 1, 1) + padding (tuple[int]): padding of the input tensor. Default: (0, 0, 0) + use_bias (bool): whether to enable bias in 3d convolution layer. + Default: False + use_bn (bool): whether to use Batch Normalization after 3d convolution + layer. Default: True + use_relu (bool): whether to use ReLU after Batch Normalization layer. 
+ Default: True + """ + + def __init__(self, + in_channels, + out_channels, + expansion=1.0, + kernel_size=(1, 1, 1), + stride=(1, 1, 1), + padding=(0, 0, 0), + use_bias=False, + use_bn=True, + use_relu=True): + super().__init__() + + in_channels = int(in_channels * expansion) + out_channels = int(out_channels * expansion) + + self.conv3d = nn.Conv3d( + in_channels, + out_channels, + kernel_size, + padding=padding, + stride=stride, + bias=use_bias) + + self.use_bn = use_bn + self.use_relu = use_relu + + if self.use_bn: + self.batch3d = nn.BatchNorm3d(out_channels) + + if self.use_relu: + self.activation = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward function.""" + out = self.conv3d(x) + if self.use_bn: + out = self.batch3d(out) + if self.use_relu: + out = self.activation(out) + return out + + +class Mixed(nn.Module): + """Inception block for I3D. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (float): The multiplier of in_channels and out_channels. + Default: 1. + """ + + def __init__(self, in_channels, out_channels, expansion=1.0): + super(Mixed, self).__init__() + # Branch 0 + self.branch_0 = Conv3dBlock( + in_channels, out_channels[0], expansion, kernel_size=(1, 1, 1)) + + # Branch 1 + branch_1_conv1 = Conv3dBlock( + in_channels, out_channels[1], expansion, kernel_size=(1, 1, 1)) + branch_1_conv2 = Conv3dBlock( + out_channels[1], + out_channels[2], + expansion, + kernel_size=(3, 3, 3), + padding=(1, 1, 1)) + self.branch_1 = nn.Sequential(branch_1_conv1, branch_1_conv2) + + # Branch 2 + branch_2_conv1 = Conv3dBlock( + in_channels, out_channels[3], expansion, kernel_size=(1, 1, 1)) + branch_2_conv2 = Conv3dBlock( + out_channels[3], + out_channels[4], + expansion, + kernel_size=(3, 3, 3), + padding=(1, 1, 1)) + self.branch_2 = nn.Sequential(branch_2_conv1, branch_2_conv2) + + # Branch3 + branch_3_pool = nn.MaxPool3d( + kernel_size=(3, 3, 3), + stride=(1, 1, 1), + padding=(1, 1, 1), + ceil_mode=True) + branch_3_conv2 = Conv3dBlock( + in_channels, out_channels[5], expansion, kernel_size=(1, 1, 1)) + self.branch_3 = nn.Sequential(branch_3_pool, branch_3_conv2) + + def forward(self, x): + """Forward function.""" + out_0 = self.branch_0(x) + out_1 = self.branch_1(x) + out_2 = self.branch_2(x) + out_3 = self.branch_3(x) + out = torch.cat((out_0, out_1, out_2, out_3), 1) + return out + + +@BACKBONES.register_module() +class I3D(BaseBackbone): + """I3D backbone. + + Please refer to the `paper `__ for + details. + + Args: + in_channels (int): Input channels of the backbone, which is decided + on the input modality. + expansion (float): The multiplier of in_channels and out_channels. + Default: 1. 
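+
+    Example (illustrative sketch; the clip size is arbitrary and the output
+    shape assumes ``expansion=1.0``):
+        >>> import torch
+        >>> model = I3D(in_channels=3, expansion=1.0)
+        >>> clip = torch.rand(1, 3, 16, 224, 224)  # (N, C, T, H, W)
+        >>> feat = model(clip)
+        >>> # expected to be a single 5-D feature map, roughly
+        >>> # (1, 1024, 2, 7, 7) for this input size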
+ """ + + def __init__(self, in_channels=3, expansion=1.0): + super(I3D, self).__init__() + + # expansion must be an integer multiple of 1/8 + expansion = round(8 * expansion) / 8.0 + + # xut Layer + self.conv3d_1a_7x7 = Conv3dBlock( + out_channels=64, + in_channels=in_channels / expansion, + expansion=expansion, + kernel_size=(7, 7, 7), + stride=(2, 2, 2), + padding=(2, 3, 3)) + self.maxPool3d_2a_3x3 = nn.MaxPool3d( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) + + # Layer 2 + self.conv3d_2b_1x1 = Conv3dBlock( + out_channels=64, + in_channels=64, + expansion=expansion, + kernel_size=(1, 1, 1)) + self.conv3d_2c_3x3 = Conv3dBlock( + out_channels=192, + in_channels=64, + expansion=expansion, + kernel_size=(3, 3, 3), + padding=(1, 1, 1)) + self.maxPool3d_3a_3x3 = nn.MaxPool3d( + kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)) + + # Mixed_3b + self.mixed_3b = Mixed(192, [64, 96, 128, 16, 32, 32], expansion) + self.mixed_3c = Mixed(256, [128, 128, 192, 32, 96, 64], expansion) + self.maxPool3d_4a_3x3 = nn.MaxPool3d( + kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1)) + + # Mixed 4 + self.mixed_4b = Mixed(480, [192, 96, 208, 16, 48, 64], expansion) + self.mixed_4c = Mixed(512, [160, 112, 224, 24, 64, 64], expansion) + self.mixed_4d = Mixed(512, [128, 128, 256, 24, 64, 64], expansion) + self.mixed_4e = Mixed(512, [112, 144, 288, 32, 64, 64], expansion) + self.mixed_4f = Mixed(528, [256, 160, 320, 32, 128, 128], expansion) + + self.maxPool3d_5a_2x2 = nn.MaxPool3d( + kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 0, 0)) + + # Mixed 5 + self.mixed_5b = Mixed(832, [256, 160, 320, 32, 128, 128], expansion) + self.mixed_5c = Mixed(832, [384, 192, 384, 48, 128, 128], expansion) + + def forward(self, x): + out = self.conv3d_1a_7x7(x) + out = self.maxPool3d_2a_3x3(out) + out = self.conv3d_2b_1x1(out) + out = self.conv3d_2c_3x3(out) + out = self.maxPool3d_3a_3x3(out) + out = self.mixed_3b(out) + out = self.mixed_3c(out) + out = self.maxPool3d_4a_3x3(out) + out = self.mixed_4b(out) + out = self.mixed_4c(out) + out = self.mixed_4d(out) + out = self.mixed_4e(out) + out = self.mixed_4f(out) + out = self.maxPool3d_5a_2x2(out) + out = self.mixed_5b(out) + out = self.mixed_5c(out) + return out diff --git a/main/transformer_utils/mmpose/models/backbones/litehrnet.py b/main/transformer_utils/mmpose/models/backbones/litehrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..954368841eb631e3dc6c77e9810f6980f3739bf3 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/litehrnet.py @@ -0,0 +1,984 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HRNet/Lite-HRNet +# Original licence: Apache License 2.0. +# ------------------------------------------------------------------------------ + +import mmcv +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, + build_conv_layer, build_norm_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .utils import channel_shuffle, load_checkpoint + + +class SpatialWeighting(nn.Module): + """Spatial weighting module. + + Args: + channels (int): The channels of the module. + ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. 
+ norm_cfg (dict): Config dict for normalization layer. + Default: None. + act_cfg (dict): Config dict for activation layer. + Default: (dict(type='ReLU'), dict(type='Sigmoid')). + The last ConvModule uses Sigmoid by default. + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + norm_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out + + +class CrossResolutionWeighting(nn.Module): + """Cross-resolution channel weighting module. + + Args: + channels (int): The channels of the module. + ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + act_cfg (dict): Config dict for activation layer. + Default: (dict(type='ReLU'), dict(type='Sigmoid')). + The last ConvModule uses Sigmoid by default. + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + norm_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.channels = channels + total_channel = sum(channels) + self.conv1 = ConvModule( + in_channels=total_channel, + out_channels=int(total_channel / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(total_channel / ratio), + out_channels=total_channel, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + mini_size = x[-1].size()[-2:] + out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] + out = torch.cat(out, dim=1) + out = self.conv1(out) + out = self.conv2(out) + out = torch.split(out, self.channels, dim=1) + out = [ + s * F.interpolate(a, size=s.size()[-2:], mode='nearest') + for s, a in zip(x, out) + ] + return out + + +class ConditionalChannelWeighting(nn.Module): + """Conditional channel weighting block. + + Args: + in_channels (int): The input channels of the block. + stride (int): Stride of the 3x3 convolution layer. + reduce_ratio (int): channel reduction ratio. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
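+
+    Example (illustrative sketch; the channel and spatial sizes are
+    arbitrary, ``stride=1`` keeps them unchanged):
+        >>> import torch
+        >>> block = ConditionalChannelWeighting(
+        ...     in_channels=[40, 80], stride=1, reduce_ratio=8)
+        >>> feats = [torch.rand(1, 40, 64, 64), torch.rand(1, 80, 32, 32)]
+        >>> outs = block(feats)
+        >>> [tuple(o.shape) for o in outs]
+        [(1, 40, 64, 64), (1, 80, 32, 32)]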
+ """ + + def __init__(self, + in_channels, + stride, + reduce_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False): + super().__init__() + self.with_cp = with_cp + self.stride = stride + assert stride in [1, 2] + + branch_channels = [channel // 2 for channel in in_channels] + + self.cross_resolution_weighting = CrossResolutionWeighting( + branch_channels, + ratio=reduce_ratio, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + self.depthwise_convs = nn.ModuleList([ + ConvModule( + channel, + channel, + kernel_size=3, + stride=self.stride, + padding=1, + groups=channel, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) for channel in branch_channels + ]) + + self.spatial_weighting = nn.ModuleList([ + SpatialWeighting(channels=channel, ratio=4) + for channel in branch_channels + ]) + + def forward(self, x): + + def _inner_forward(x): + x = [s.chunk(2, dim=1) for s in x] + x1 = [s[0] for s in x] + x2 = [s[1] for s in x] + + x2 = self.cross_resolution_weighting(x2) + x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] + x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] + + out = [torch.cat([s1, s2], dim=1) for s1, s2 in zip(x1, x2)] + out = [channel_shuffle(s, 2) for s in out] + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class Stem(nn.Module): + """Stem network block. + + Args: + in_channels (int): The input channels of the block. + stem_channels (int): Output channels of the stem layer. + out_channels (int): The output channels of the block. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
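+
+    Example (illustrative sketch; the image size is arbitrary, the stem
+    downsamples by an overall stride of 4):
+        >>> import torch
+        >>> stem = Stem(in_channels=3, stem_channels=32, out_channels=32,
+        ...             expand_ratio=1)
+        >>> img = torch.rand(1, 3, 256, 256)
+        >>> tuple(stem(img).shape)
+        (1, 32, 64, 64)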
+ """ + + def __init__(self, + in_channels, + stem_channels, + out_channels, + expand_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=stem_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU')) + + mid_channels = int(round(stem_channels * expand_ratio)) + branch_channels = stem_channels // 2 + if stem_channels == self.out_channels: + inc_channels = self.out_channels - branch_channels + else: + inc_channels = self.out_channels - stem_channels + + self.branch1 = nn.Sequential( + ConvModule( + branch_channels, + branch_channels, + kernel_size=3, + stride=2, + padding=1, + groups=branch_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_channels, + inc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')), + ) + + self.expand_conv = ConvModule( + branch_channels, + mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + self.depthwise_conv = ConvModule( + mid_channels, + mid_channels, + kernel_size=3, + stride=2, + padding=1, + groups=mid_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + self.linear_conv = ConvModule( + mid_channels, + branch_channels + if stem_channels == self.out_channels else stem_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU')) + + def forward(self, x): + + def _inner_forward(x): + x = self.conv1(x) + x1, x2 = x.chunk(2, dim=1) + + x2 = self.expand_conv(x2) + x2 = self.depthwise_conv(x2) + x2 = self.linear_conv(x2) + + out = torch.cat((self.branch1(x1), x2), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class IterativeHead(nn.Module): + """Extra iterative head for feature learning. + + Args: + in_channels (int): The input channels of the block. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). 
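+
+    Example (illustrative sketch; the feature pyramid is arbitrary and is
+    assumed to be ordered from the highest to the lowest resolution):
+        >>> import torch
+        >>> head = IterativeHead(in_channels=[40, 80, 160, 320])
+        >>> feats = [torch.rand(1, 40, 64, 64), torch.rand(1, 80, 32, 32),
+        ...          torch.rand(1, 160, 16, 16), torch.rand(1, 320, 8, 8)]
+        >>> outs = head(feats)  # refined top-down, same spatial sizes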
+ """ + + def __init__(self, in_channels, norm_cfg=dict(type='BN')): + super().__init__() + projects = [] + num_branchs = len(in_channels) + self.in_channels = in_channels[::-1] + + for i in range(num_branchs): + if i != num_branchs - 1: + projects.append( + DepthwiseSeparableConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i + 1], + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + dw_act_cfg=None, + pw_act_cfg=dict(type='ReLU'))) + else: + projects.append( + DepthwiseSeparableConvModule( + in_channels=self.in_channels[i], + out_channels=self.in_channels[i], + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU'), + dw_act_cfg=None, + pw_act_cfg=dict(type='ReLU'))) + self.projects = nn.ModuleList(projects) + + def forward(self, x): + x = x[::-1] + + y = [] + last_x = None + for i, s in enumerate(x): + if last_x is not None: + last_x = F.interpolate( + last_x, + size=s.size()[-2:], + mode='bilinear', + align_corners=True) + s = s + last_x + s = self.projects[i](s) + y.append(s) + last_x = s + + return y[::-1] + + +class ShuffleUnit(nn.Module): + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + self.stride = stride + self.with_cp = with_cp + + branch_features = out_channels // 2 + if self.stride == 1: + assert in_channels == branch_features * 2, ( + f'in_channels ({in_channels}) should equal to ' + f'branch_features * 2 ({branch_features * 2}) ' + 'when stride is 1') + + if in_channels != branch_features * 2: + assert self.stride != 1, ( + f'stride ({self.stride}) should not equal 1 when ' + f'in_channels != branch_features * 2') + + if self.stride > 1: + self.branch1 = nn.Sequential( + ConvModule( + in_channels, + in_channels, + kernel_size=3, + stride=self.stride, + padding=1, + groups=in_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + in_channels, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.branch2 = nn.Sequential( + ConvModule( + in_channels if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + branch_features, + branch_features, + kernel_size=3, + stride=self.stride, + padding=1, + groups=branch_features, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + + def _inner_forward(x): + if self.stride > 1: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + else: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class LiteHRModule(nn.Module): + """High-Resolution Module for LiteHRNet. + + It contains conditional channel weighting blocks and + shuffle blocks. + + + Args: + num_branches (int): Number of branches in the module. + num_blocks (int): Number of blocks in the module. + in_channels (list(int)): Number of input image channels. + reduce_ratio (int): Channel reduction ratio. + module_type (str): 'LITE' or 'NAIVE' + multiscale_output (bool): Whether to output multi-scale features. + with_fuse (bool): Whether to use fuse layers. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ """ + + def __init__( + self, + num_branches, + num_blocks, + in_channels, + reduce_ratio, + module_type, + multiscale_output=False, + with_fuse=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + with_cp=False, + ): + super().__init__() + self._check_branches(num_branches, in_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.module_type = module_type + self.multiscale_output = multiscale_output + self.with_fuse = with_fuse + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + + if self.module_type.upper() == 'LITE': + self.layers = self._make_weighting_blocks(num_blocks, reduce_ratio) + elif self.module_type.upper() == 'NAIVE': + self.layers = self._make_naive_branches(num_branches, num_blocks) + else: + raise ValueError("module_type should be either 'LITE' or 'NAIVE'.") + if self.with_fuse: + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU() + + def _check_branches(self, num_branches, in_channels): + """Check input to avoid ValueError.""" + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_weighting_blocks(self, num_blocks, reduce_ratio, stride=1): + """Make channel weighting blocks.""" + layers = [] + for i in range(num_blocks): + layers.append( + ConditionalChannelWeighting( + self.in_channels, + stride=stride, + reduce_ratio=reduce_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + with_cp=self.with_cp)) + + return nn.Sequential(*layers) + + def _make_one_branch(self, branch_index, num_blocks, stride=1): + """Make one branch.""" + layers = [] + layers.append( + ShuffleUnit( + self.in_channels[branch_index], + self.in_channels[branch_index], + stride=stride, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU'), + with_cp=self.with_cp)) + for i in range(1, num_blocks): + layers.append( + ShuffleUnit( + self.in_channels[branch_index], + self.in_channels[branch_index], + stride=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='ReLU'), + with_cp=self.with_cp)) + + return nn.Sequential(*layers) + + def _make_naive_branches(self, num_branches, num_blocks): + """Make branches.""" + branches = [] + + for i in range(num_branches): + branches.append(self._make_one_branch(i, num_blocks)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Make fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=in_channels[j], + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + 
bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + groups=in_channels[j], + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=True))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.layers[0](x[0])] + + if self.module_type.upper() == 'LITE': + out = self.layers(x) + elif self.module_type.upper() == 'NAIVE': + for i in range(self.num_branches): + x[i] = self.layers[i](x[i]) + out = x + + if self.with_fuse: + out_fuse = [] + for i in range(len(self.fuse_layers)): + # `y = 0` will lead to decreased accuracy (0.5~1 mAP) + y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) + for j in range(self.num_branches): + if i == j: + y += out[j] + else: + y += self.fuse_layers[i][j](out[j]) + out_fuse.append(self.relu(y)) + out = out_fuse + if not self.multiscale_output: + out = [out[0]] + return out + + +@BACKBONES.register_module() +class LiteHRNet(nn.Module): + """Lite-HRNet backbone. + + `Lite-HRNet: A Lightweight High-Resolution Network + `_. + + Code adapted from 'https://github.com/HRNet/Lite-HRNet'. + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + + Example: + >>> from mmpose.models import LiteHRNet + >>> import torch + >>> extra=dict( + >>> stem=dict(stem_channels=32, out_channels=32, expand_ratio=1), + >>> num_stages=3, + >>> stages_spec=dict( + >>> num_modules=(2, 4, 2), + >>> num_branches=(2, 3, 4), + >>> num_blocks=(2, 2, 2), + >>> module_type=('LITE', 'LITE', 'LITE'), + >>> with_fuse=(True, True, True), + >>> reduce_ratios=(8, 8, 8), + >>> num_channels=( + >>> (40, 80), + >>> (40, 80, 160), + >>> (40, 80, 160, 320), + >>> )), + >>> with_head=False) + >>> self = LiteHRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 40, 8, 8) + """ + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=False, + with_cp=False): + super().__init__() + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.stem = Stem( + in_channels, + stem_channels=self.extra['stem']['stem_channels'], + out_channels=self.extra['stem']['out_channels'], + expand_ratio=self.extra['stem']['expand_ratio'], + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + self.num_stages = self.extra['num_stages'] + self.stages_spec = self.extra['stages_spec'] + + num_channels_last = [ + self.stem.out_channels, + ] + for i in range(self.num_stages): + num_channels = self.stages_spec['num_channels'][i] + num_channels = [num_channels[i] for i in range(len(num_channels))] + setattr( + self, f'transition{i}', + self._make_transition_layer(num_channels_last, num_channels)) + + stage, num_channels_last = self._make_stage( + self.stages_spec, i, num_channels, multiscale_output=True) + setattr(self, f'stage{i}', stage) + + self.with_head = self.extra['with_head'] + if self.with_head: + self.head_layer = IterativeHead( + in_channels=num_channels_last, + norm_cfg=self.norm_cfg, + ) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_pre_layer[i], + kernel_size=3, + stride=1, + padding=1, + groups=num_channels_pre_layer[i], + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_pre_layer[i])[1], + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU())) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=2, + padding=1, + groups=in_channels, + bias=False), + build_norm_layer(self.norm_cfg, in_channels)[1], + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU())) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_stage(self, + stages_spec, + stage_index, + in_channels, + multiscale_output=True): + num_modules = stages_spec['num_modules'][stage_index] + num_branches = stages_spec['num_branches'][stage_index] + num_blocks = stages_spec['num_blocks'][stage_index] + reduce_ratio = stages_spec['reduce_ratios'][stage_index] + with_fuse = stages_spec['with_fuse'][stage_index] + module_type = stages_spec['module_type'][stage_index] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last module + if not multiscale_output and i == 
num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + modules.append( + LiteHRModule( + num_branches, + num_blocks, + in_channels, + reduce_ratio, + module_type, + multiscale_output=reset_multiscale_output, + with_fuse=with_fuse, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + with_cp=self.with_cp)) + in_channels = modules[-1].in_channels + + return nn.Sequential(*modules), in_channels + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + x = self.stem(x) + + y_list = [x] + for i in range(self.num_stages): + x_list = [] + transition = getattr(self, f'transition{i}') + for j in range(self.stages_spec['num_branches'][i]): + if transition[j]: + if j >= len(y_list): + x_list.append(transition[j](y_list[-1])) + else: + x_list.append(transition[j](y_list[j])) + else: + x_list.append(y_list[j]) + y_list = getattr(self, f'stage{i}')(x_list) + + x = y_list + if self.with_head: + x = self.head_layer(x) + + return [x[0]] + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py b/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..5dc0cd1b7dfdec2aa751861e39fc1c1a45ec488e --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/mobilenet_v2.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, constant_init, kaiming_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import load_checkpoint, make_divisible + + +class InvertedResidual(nn.Module): + """InvertedResidual block for MobileNetV2. + + Args: + in_channels (int): The input channels of the InvertedResidual block. + out_channels (int): The output channels of the InvertedResidual block. + stride (int): Stride of the middle (first) 3x3 convolution. + expand_ratio (int): adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
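+
+    Example (illustrative sketch; importing the block directly from this
+    module is an assumption, not part of the documented API):
+        >>> import torch
+        >>> from mmpose.models.backbones.mobilenet_v2 import InvertedResidual
+        >>> # expand 32 -> 192 hidden channels, depthwise stride 2,
+        >>> # then project to 64 output channels
+        >>> block = InvertedResidual(32, 64, stride=2, expand_ratio=6)
+        >>> block(torch.rand(1, 32, 56, 56)).shape
+        torch.Size([1, 64, 28, 28])
+        >>> # stride=1 with matching channels adds a residual connection
+        >>> block = InvertedResidual(64, 64, stride=1, expand_ratio=6)
+        >>> block(torch.rand(1, 64, 28, 28)).shape
+        torch.Size([1, 64, 28, 28])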
+ """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stride = stride + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' + self.with_cp = with_cp + self.use_res_connect = self.stride == 1 and in_channels == out_channels + hidden_dim = int(round(in_channels * expand_ratio)) + + layers = [] + if expand_ratio != 1: + layers.append( + ConvModule( + in_channels=in_channels, + out_channels=hidden_dim, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + layers.extend([ + ConvModule( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=3, + stride=stride, + padding=1, + groups=hidden_dim, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + + def _inner_forward(x): + if self.use_res_connect: + return x + self.conv(x) + return self.conv(x) + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class MobileNetV2(BaseBackbone): + """MobileNetV2 backbone. + + Args: + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + out_indices (None or Sequence[int]): Output from which stages. + Default: (7, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + # Parameters to build layers. 4 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks, stride. + arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], + [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], + [6, 320, 1, 1]] + + def __init__(self, + widen_factor=1., + out_indices=(7, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.widen_factor = widen_factor + self.out_indices = out_indices + for index in out_indices: + if index not in range(0, 8): + raise ValueError('the item in out_indices must in ' + f'range(0, 8). But received {index}') + + if frozen_stages not in range(-1, 8): + raise ValueError('frozen_stages must be in range(-1, 8). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks, stride = layer_cfg + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + if widen_factor > 1.0: + self.out_channel = int(1280 * widen_factor) + else: + self.out_channel = 1280 + + layer = ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channel, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.add_module('conv2', layer) + self.layers.append('conv2') + + def make_layer(self, out_channels, num_blocks, stride, expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. Default: 6. + """ + layers = [] + for i in range(num_blocks): + if i >= 1: + stride = 1 + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + stride, + expand_ratio=expand_ratio, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py b/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..d640abec79f06d689f2d4bc1e92999946bc07261 --- /dev/null +++ 
b/main/transformer_utils/mmpose/models/backbones/mobilenet_v3.py @@ -0,0 +1,188 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, kaiming_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import InvertedResidual, load_checkpoint + + +@BACKBONES.register_module() +class MobileNetV3(BaseBackbone): + """MobileNetV3 backbone. + + Args: + arch (str): Architecture of mobilnetv3, from {small, big}. + Default: small. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + out_indices (None or Sequence[int]): Output from which stages. + Default: (-1, ), which means output tensors from final stage. + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. + """ + # Parameters to build each block: + # [kernel size, mid channels, out channels, with_se, act type, stride] + arch_settings = { + 'small': [[3, 16, 16, True, 'ReLU', 2], + [3, 72, 24, False, 'ReLU', 2], + [3, 88, 24, False, 'ReLU', 1], + [5, 96, 40, True, 'HSwish', 2], + [5, 240, 40, True, 'HSwish', 1], + [5, 240, 40, True, 'HSwish', 1], + [5, 120, 48, True, 'HSwish', 1], + [5, 144, 48, True, 'HSwish', 1], + [5, 288, 96, True, 'HSwish', 2], + [5, 576, 96, True, 'HSwish', 1], + [5, 576, 96, True, 'HSwish', 1]], + 'big': [[3, 16, 16, False, 'ReLU', 1], + [3, 64, 24, False, 'ReLU', 2], + [3, 72, 24, False, 'ReLU', 1], + [5, 72, 40, True, 'ReLU', 2], + [5, 120, 40, True, 'ReLU', 1], + [5, 120, 40, True, 'ReLU', 1], + [3, 240, 80, False, 'HSwish', 2], + [3, 200, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 480, 112, True, 'HSwish', 1], + [3, 672, 112, True, 'HSwish', 1], + [5, 672, 160, True, 'HSwish', 1], + [5, 672, 160, True, 'HSwish', 2], + [5, 960, 160, True, 'HSwish', 1]] + } # yapf: disable + + def __init__(self, + arch='small', + conv_cfg=None, + norm_cfg=dict(type='BN'), + out_indices=(-1, ), + frozen_stages=-1, + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert arch in self.arch_settings + for index in out_indices: + if index not in range(-len(self.arch_settings[arch]), + len(self.arch_settings[arch])): + raise ValueError('the item in out_indices must in ' + f'range(0, {len(self.arch_settings[arch])}). ' + f'But received {index}') + + if frozen_stages not in range(-1, len(self.arch_settings[arch])): + raise ValueError('frozen_stages must be in range(-1, ' + f'{len(self.arch_settings[arch])}). 
' + f'But received {frozen_stages}') + self.arch = arch + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = 16 + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type='HSwish')) + + self.layers = self._make_layer() + self.feat_dim = self.arch_settings[arch][-1][2] + + def _make_layer(self): + layers = [] + layer_setting = self.arch_settings[self.arch] + for i, params in enumerate(layer_setting): + (kernel_size, mid_channels, out_channels, with_se, act, + stride) = params + if with_se: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) + else: + se_cfg = None + + layer = InvertedResidual( + in_channels=self.in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + with_expand_conv=True, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=act), + with_cp=self.with_cp) + self.in_channels = out_channels + layer_name = f'layer{i + 1}' + self.add_module(layer_name, layer) + layers.append(layer_name) + return layers + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices or \ + i - len(self.layers) in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/modules/__init__.py b/main/transformer_utils/mmpose/models/backbones/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py b/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py new file mode 100644 index 0000000000000000000000000000000000000000..44feef44dfc43a7b40b82752d9a82df35f1108ba --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/modules/basic_block.py @@ -0,0 +1,126 @@ +# -------------------------------------------------------- +# High Resolution Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Rao Fu, RainbowSecret +# -------------------------------------------------------- + +import os +import copy +import logging +import torch.nn as nn +import torch.nn.functional as F +import 
torch.utils.checkpoint as cp +from .transformer_block import TransformerBlock + +from mmcv.cnn import ( + build_conv_layer, + build_norm_layer, + build_plugin_layer, + constant_init, + kaiming_init, +) + + +class BasicBlock(nn.Module): + """Only replce the second 3x3 Conv with the TransformerBlocker""" + + expansion = 1 + + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type="BN"), + mhsa_flag=False, + num_heads=1, + num_halo_block=1, + num_mlp_ratio=4, + num_sr_ratio=1, + with_rpe=False, + with_ffn=True, + ): + super(BasicBlock, self).__init__() + norm_cfg = copy.deepcopy(norm_cfg) + + self.in_channels = inplanes + self.out_channels = planes + self.stride = stride + self.with_cp = with_cp + self.downsample = downsample + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=1, + dilation=1, + bias=False, + ) + self.add_module(self.norm1_name, norm1) + + if not mhsa_flag: + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False + ) + self.add_module(self.norm2_name, norm2) + else: + self.conv2 = TransformerBlock( + planes, + num_heads=num_heads, + mlp_ratio=num_mlp_ratio, + sr_ratio=num_sr_ratio, + input_resolution=num_resolution, + with_rpe=with_rpe, + with_ffn=with_ffn, + ) + + self.relu = nn.ReLU(inplace=True) + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out diff --git a/main/transformer_utils/mmpose/models/backbones/modules/bottleneck_block.py b/main/transformer_utils/mmpose/models/backbones/modules/bottleneck_block.py new file mode 100644 index 0000000000000000000000000000000000000000..9ccd11c24b3e10391fd751ca8a7b7e571acd7aee --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/modules/bottleneck_block.py @@ -0,0 +1,122 @@ +# -------------------------------------------------------- +# High Resolution Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Rao Fu, RainbowSecret +# -------------------------------------------------------- + +import os +import copy +import logging +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp + +from mmcv.cnn import build_conv_layer, build_norm_layer + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + with_cp=None, + norm_cfg=dict(type="BN"), + conv_cfg=None, + ): + super(Bottleneck, self).__init__() + norm_cfg = copy.deepcopy(norm_cfg) + + self.in_channels = inplanes + self.out_channels = planes + self.stride = stride + self.with_cp = with_cp + 
self.downsample = downsample + + self.conv1_stride = 1 + self.conv2_stride = stride + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3 + ) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False, + ) + self.add_module(self.norm1_name, norm1) + + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=1, + bias=False, + ) + self.add_module(self.norm2_name, norm2) + + self.conv3 = build_conv_layer( + conv_cfg, planes, planes * self.expansion, kernel_size=1, bias=False + ) + self.add_module(self.norm3_name, norm3) + self.relu = nn.ReLU(inplace=True) + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: normalization layer after the third convolution layer""" + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out diff --git a/main/transformer_utils/mmpose/models/backbones/modules/ffn_block.py b/main/transformer_utils/mmpose/models/backbones/modules/ffn_block.py new file mode 100644 index 0000000000000000000000000000000000000000..00ef023334a3eb2ff4eb7172b4b75131d7c08262 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/modules/ffn_block.py @@ -0,0 +1,195 @@ +# -------------------------------------------------------- +# High Resolution Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Rao Fu, RainbowSecret +# -------------------------------------------------------- + +import torch +import torch.nn as nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class MlpDW(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + dw_act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1) + self.act1 = act_layer() + self.dw3x3 = nn.Conv2d( + hidden_features, + 
hidden_features, + kernel_size=3, + stride=1, + groups=hidden_features, + padding=1, + ) + self.act2 = dw_act_layer() + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + B, N, C = x.shape + + if N == (H * W + 1): + cls_tokens = x[:, 0, :] + x_ = x[:, 1:, :].permute(0, 2, 1).contiguous().reshape(B, C, H, W) + else: + x_ = x.permute(0, 2, 1).contiguous().reshape(B, C, H, W) + + x_ = self.fc1(x_) + x_ = self.act1(x_) + x_ = self.dw3x3(x_) + x_ = self.act2(x_) + x_ = self.drop(x_) + x_ = self.fc2(x_) + x_ = self.drop(x_) + x_ = x_.reshape(B, C, -1).permute(0, 2, 1).contiguous() + + if N == (H * W + 1): + x = torch.cat((cls_tokens.unsqueeze(1), x_), dim=1) + else: + x = x_ + + return x + + +class MlpDWBN(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + dw_act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1) + self.act1 = act_layer() + self.norm1 = nn.BatchNorm2d(hidden_features) + self.dw3x3 = nn.Conv2d( + hidden_features, + hidden_features, + kernel_size=3, + stride=1, + groups=hidden_features, + padding=1, + ) + self.act2 = dw_act_layer() + self.norm2 = nn.BatchNorm2d(hidden_features) + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) + self.act3 = act_layer() + self.norm3 = nn.BatchNorm2d(out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + B, N, C = x.shape + + if N == (H * W + 1): + cls_tokens = x[:, 0, :] + x_ = x[:, 1:, :].permute(0, 2, 1).contiguous().reshape(B, C, H, W) + else: + x_ = x.permute(0, 2, 1).contiguous().reshape(B, C, H, W) + + x_ = self.fc1(x_) + x_ = self.norm1(x_) + x_ = self.act1(x_) + x_ = self.dw3x3(x_) + x_ = self.norm2(x_) + x_ = self.act2(x_) + x_ = self.drop(x_) + x_ = self.fc2(x_) + x_ = self.norm3(x_) + x_ = self.act3(x_) + x_ = self.drop(x_) + x_ = x_.reshape(B, C, -1).permute(0, 2, 1).contiguous() + + if N == (H * W + 1): + x = torch.cat((cls_tokens.unsqueeze(1), x_), dim=1) + else: + x = x_ + + return x + + +class MlpDWBN2D(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + dw_act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1) + self.act1 = act_layer() + self.norm1 = nn.BatchNorm2d(hidden_features) + self.dw3x3 = nn.Conv2d( + hidden_features, + hidden_features, + kernel_size=3, + stride=1, + groups=hidden_features, + padding=1, + ) + self.act2 = dw_act_layer() + self.norm2 = nn.BatchNorm2d(hidden_features) + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1) + self.act3 = act_layer() + self.norm3 = nn.BatchNorm2d(out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.norm1(x) + x = self.act1(x) + x = self.dw3x3(x) + x = self.norm2(x) + x = self.act2(x) + x = self.drop(x) + x = self.fc2(x) + x = self.norm3(x) + x = self.act3(x) + x = self.drop(x) + return x diff --git a/main/transformer_utils/mmpose/models/backbones/modules/multihead_attention.py b/main/transformer_utils/mmpose/models/backbones/modules/multihead_attention.py new file mode 100644 index 
0000000000000000000000000000000000000000..d726ea377a407bdb4e8cf5d0bc44a371a1e3545b --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/modules/multihead_attention.py @@ -0,0 +1,348 @@ +# -------------------------------------------------------- +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Modified by RainbowSecret from: +# https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/activation.py#L852 +# -------------------------------------------------------- + +import copy +import math +import warnings +import torch +import torch.nn.functional as F +from torch import nn, Tensor +from torch.nn.modules.module import Module +from torch._jit_internal import Optional, Tuple +from torch.overrides import has_torch_function, handle_torch_function +from torch.nn.functional import linear, pad, softmax, dropout + + +class MultiheadAttention(Module): + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kdim=None, + vdim=None, + ): + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + + self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias) + self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim) + + self.in_proj_bias = None + self.in_proj_weight = None + self.bias_k = self.bias_v = None + self.q_proj_weight = None + self.k_proj_weight = None + self.v_proj_weight = None + self.add_zero_attn = add_zero_attn + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if "_qkv_same_embed_dim" not in state: + state["_qkv_same_embed_dim"] = True + + super(MultiheadAttention, self).__setstate__(state) + + def forward( + self, + query, + key, + value, + key_padding_mask=None, + need_weights=False, + attn_mask=None, + residual_attn=None, + ): + if not self._qkv_same_embed_dim: + return self.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, + k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight, + out_dim=self.vdim, + residual_attn=residual_attn, + ) + else: + return self.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + out_dim=self.vdim, + residual_attn=residual_attn, + ) + + def multi_head_attention_forward( + self, + query: Tensor, + 
key: Tensor, + value: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Tensor, + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Tensor, + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = False, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, + out_dim: Optional[Tensor] = None, + residual_attn: Optional[Tensor] = None, + ) -> Tuple[Tensor, Optional[Tensor]]: + if not torch.jit.is_scripting(): + tens_ops = ( + query, + key, + value, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + out_proj_weight, + out_proj_bias, + ) + if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function( + tens_ops + ): + return handle_torch_function( + multi_head_attention_forward, + tens_ops, + query, + key, + value, + embed_dim_to_check, + num_heads, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + add_zero_attn, + dropout_p, + out_proj_weight, + out_proj_bias, + training=training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=use_separate_proj_weight, + q_proj_weight=q_proj_weight, + k_proj_weight=k_proj_weight, + v_proj_weight=v_proj_weight, + static_k=static_k, + static_v=static_v, + ) + tgt_len, bsz, embed_dim = query.size() + key = query if key is None else key + value = query if value is None else value + + assert embed_dim == embed_dim_to_check + # allow MHA to have different sizes for the feature dimension + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = embed_dim // num_heads + v_head_dim = out_dim // num_heads + assert ( + head_dim * num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" + scaling = float(head_dim) ** -0.5 + + q = self.q_proj(query) * scaling + k = self.k_proj(key) + v = self.v_proj(value) + + if attn_mask is not None: + assert ( + attn_mask.dtype == torch.float32 + or attn_mask.dtype == torch.float64 + or attn_mask.dtype == torch.float16 + or attn_mask.dtype == torch.uint8 + or attn_mask.dtype == torch.bool + ), "Only float, byte, and bool types are supported for attn_mask, not {}".format( + attn_mask.dtype + ) + if attn_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead." + ) + attn_mask = attn_mask.to(torch.bool) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError("The size of the 2D attn_mask is not correct.") + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bsz * num_heads, + query.size(0), + key.size(0), + ]: + raise RuntimeError("The size of the 3D attn_mask is not correct.") + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format(attn_mask.dim()) + ) + + # convert ByteTensor key_padding_mask to bool + if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead." 
+ ) + key_padding_mask = key_padding_mask.to(torch.bool) + + q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + if k is not None: + k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1) + + src_len = k.size(1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if add_zero_attn: + src_len += 1 + k = torch.cat( + [ + k, + torch.zeros( + (k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device + ), + ], + dim=1, + ) + v = torch.cat( + [ + v, + torch.zeros( + (v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device + ), + ], + dim=1, + ) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] + + """ + Attention weight for the invalid region is -inf + """ + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float("-inf")) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len + ) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float("-inf"), + ) + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, src_len + ) + + if residual_attn is not None: + attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len + ) + attn_output_weights += residual_attn.unsqueeze(0) + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, src_len + ) + + """ + Reweight the attention map before softmax(). 
+ attn_output_weights: (b*n_head, n, hw) + """ + attn_output_weights = softmax(attn_output_weights, dim=-1) + attn_output_weights = dropout( + attn_output_weights, p=dropout_p, training=training + ) + + attn_output = torch.bmm(attn_output_weights, v) + assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim] + attn_output = ( + attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim) + ) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + + if need_weights: + # average attention weights over heads + attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len + ) + return attn_output, attn_output_weights.sum(dim=1) / num_heads + else: + return attn_output diff --git a/main/transformer_utils/mmpose/models/backbones/modules/multihead_isa_attention.py b/main/transformer_utils/mmpose/models/backbones/modules/multihead_isa_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..deb25dfa96cc592cc58c825dc0eccd726c1592ed --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/modules/multihead_isa_attention.py @@ -0,0 +1,435 @@ +# -------------------------------------------------------- +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Modified by Lang Huang, RainbowSecret from: +# https://github.com/openseg-group/openseg.pytorch/blob/master/lib/models/modules/isa_block.py +# -------------------------------------------------------- + + +import copy +import math +import warnings +import torch +from torch import nn, Tensor +from torch.nn import functional as F +from torch._jit_internal import Optional, Tuple +from torch.overrides import has_torch_function, handle_torch_function +from torch.nn.functional import linear, pad, softmax, dropout + +from einops import rearrange +from timm.models.layers import to_2tuple, trunc_normal_ + +from .multihead_attention import MultiheadAttention + + +class MHA_(MultiheadAttention): + """ "Multihead Attention with extra flags on the q/k/v and out projections.""" + + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__(self, *args, rpe=False, window_size=7, **kwargs): + super(MHA_, self).__init__(*args, **kwargs) + + self.rpe = rpe + if rpe: + self.window_size = [window_size] * 2 + # define a parameter table of relative position bias + # self.relative_position_bias_table = nn.Parameter( + # torch.zeros( + # (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), + # self.num_heads, + # ) + # ) # 2*Wh-1 * 2*Ww-1, nH + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :] + ) # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute( + 1, 2, 0 + ).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + # trunc_normal_(self.relative_position_bias_table, std=0.02) + + def forward( + self, + query, + key, + value, + key_padding_mask=None, + need_weights=False, + 
attn_mask=None, + do_qkv_proj=True, + do_out_proj=True, + rpe=True, + ): + if not self._qkv_same_embed_dim: + return self.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, + k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight, + out_dim=self.vdim, + do_qkv_proj=do_qkv_proj, + do_out_proj=do_out_proj, + rpe=rpe, + ) + else: + return self.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout, + self.out_proj.weight, + self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + out_dim=self.vdim, + do_qkv_proj=do_qkv_proj, + do_out_proj=do_out_proj, + rpe=rpe, + ) + + def multi_head_attention_forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Tensor, + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Tensor, + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = False, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, + out_dim: Optional[Tensor] = None, + do_qkv_proj: bool = True, + do_out_proj: bool = True, + rpe=True, + ) -> Tuple[Tensor, Optional[Tensor]]: + if not torch.jit.is_scripting(): + tens_ops = ( + query, + key, + value, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + out_proj_weight, + out_proj_bias, + ) + if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function( + tens_ops + ): + return handle_torch_function( + multi_head_attention_forward, + tens_ops, + query, + key, + value, + embed_dim_to_check, + num_heads, + in_proj_weight, + in_proj_bias, + bias_k, + bias_v, + add_zero_attn, + dropout_p, + out_proj_weight, + out_proj_bias, + training=training, + key_padding_mask=key_padding_mask, + need_weights=need_weights, + attn_mask=attn_mask, + use_separate_proj_weight=use_separate_proj_weight, + q_proj_weight=q_proj_weight, + k_proj_weight=k_proj_weight, + v_proj_weight=v_proj_weight, + static_k=static_k, + static_v=static_v, + ) + tgt_len, bsz, embed_dim = query.size() + key = query if key is None else key + value = query if value is None else value + + assert embed_dim == embed_dim_to_check + # allow MHA to have different sizes for the feature dimension + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = embed_dim // num_heads + v_head_dim = out_dim // num_heads + assert ( + head_dim * num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" + scaling = float(head_dim) ** -0.5 + + # whether or not use the original query/key/value + q = self.q_proj(query) * scaling if do_qkv_proj else query + k = self.k_proj(key) if do_qkv_proj else key + v = 
self.v_proj(value) if do_qkv_proj else value + + if attn_mask is not None: + assert ( + attn_mask.dtype == torch.float32 + or attn_mask.dtype == torch.float64 + or attn_mask.dtype == torch.float16 + or attn_mask.dtype == torch.uint8 + or attn_mask.dtype == torch.bool + ), "Only float, byte, and bool types are supported for attn_mask, not {}".format( + attn_mask.dtype + ) + if attn_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead." + ) + attn_mask = attn_mask.to(torch.bool) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError("The size of the 2D attn_mask is not correct.") + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bsz * num_heads, + query.size(0), + key.size(0), + ]: + raise RuntimeError("The size of the 3D attn_mask is not correct.") + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format(attn_mask.dim()) + ) + + # convert ByteTensor key_padding_mask to bool + if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead." + ) + key_padding_mask = key_padding_mask.to(torch.bool) + + q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + if k is not None: + k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * num_heads, v_head_dim).transpose(0, 1) + + src_len = k.size(1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if add_zero_attn: + src_len += 1 + k = torch.cat( + [ + k, + torch.zeros( + (k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device + ), + ], + dim=1, + ) + v = torch.cat( + [ + v, + torch.zeros( + (v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device + ), + ], + dim=1, + ) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] + + """ + Add relative position embedding + """ + if self.rpe and rpe: + # NOTE: for simplicity, we assume the src_len == tgt_len == window_size**2 here + # print('src, tar, window', src_len, tgt_len, self.window_size[0], self.window_size[1]) + # assert src_len == self.window_size[0] * self.window_size[1] \ + # and tgt_len == self.window_size[0] * self.window_size[1], \ + # f"src{src_len}, tgt{tgt_len}, window{self.window_size[0]}" + # relative_position_bias = self.relative_position_bias_table[ + # self.relative_position_index.view(-1) + # ].view( + # self.window_size[0] * self.window_size[1], + # self.window_size[0] * self.window_size[1], + # -1, + # ) # Wh*Ww,Wh*Ww,nH + # relative_position_bias = relative_position_bias.permute( + # 2, 0, 1 + # ).contiguous() # nH, Wh*Ww, Wh*Ww + # HELLO!!!!! 
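+            # note: with relative_position_bias_table commented out in __init__, the two
+            # view() calls below only reshape to (bsz, num_heads, tgt_len, src_len) and back;
+            # the relative position bias would otherwise be added in between.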
+ attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len + ) # + relative_position_bias.unsqueeze(0) + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, src_len + ) + + """ + Attention weight for the invalid region is -inf + """ + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float("-inf")) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len + ) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float("-inf"), + ) + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, src_len + ) + + """ + Reweight the attention map before softmax(). + attn_output_weights: (b*n_head, n, hw) + """ + attn_output_weights = softmax(attn_output_weights, dim=-1) + attn_output_weights = dropout( + attn_output_weights, p=dropout_p, training=training + ) + + attn_output = torch.bmm(attn_output_weights, v) + assert list(attn_output.size()) == [bsz * num_heads, tgt_len, v_head_dim] + attn_output = ( + attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, out_dim) + ) + if do_out_proj: + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + + if need_weights: + # average attention weights over heads + attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len + ) + return attn_output, q, k, attn_output_weights.sum(dim=1) / num_heads + else: + return attn_output, q, k # additionaly return the query and key + + +class PadBlock(object): + """ "Make the size of feature map divisible by local group size.""" + + def __init__(self, local_group_size=7): + self.lgs = local_group_size + if not isinstance(self.lgs, (tuple, list)): + self.lgs = to_2tuple(self.lgs) + assert len(self.lgs) == 2 + + def pad_if_needed(self, x, size): + n, h, w, c = size + pad_h = math.ceil(h / self.lgs[0]) * self.lgs[0] - h + pad_w = math.ceil(w / self.lgs[1]) * self.lgs[1] - w + if pad_h > 0 or pad_w > 0: # center-pad the feature on H and W axes + return F.pad( + x, + (0, 0, pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2), + ) + return x + + def depad_if_needed(self, x, size): + n, h, w, c = size + pad_h = math.ceil(h / self.lgs[0]) * self.lgs[0] - h + pad_w = math.ceil(w / self.lgs[1]) * self.lgs[1] - w + if pad_h > 0 or pad_w > 0: # remove the center-padding on feature + return x[:, pad_h // 2 : pad_h // 2 + h, pad_w // 2 : pad_w // 2 + w, :] + return x + + +class LocalPermuteModule(object): + """ "Permute the feature map to gather pixels in local groups, and the reverse permutation""" + + def __init__(self, local_group_size=7): + self.lgs = local_group_size + if not isinstance(self.lgs, (tuple, list)): + self.lgs = to_2tuple(self.lgs) + assert len(self.lgs) == 2 + + def permute(self, x, size): + n, h, w, c = size + return rearrange( + x, + "n (qh ph) (qw pw) c -> (ph pw) (n qh qw) c", + n=n, + qh=h // self.lgs[0], + ph=self.lgs[0], + qw=w // self.lgs[0], + pw=self.lgs[0], + c=c, + ) + + def rev_permute(self, x, size): + n, h, w, c = size + return rearrange( + x, + "(ph pw) (n qh qw) c -> n (qh ph) (qw pw) c", + n=n, + qh=h // self.lgs[0], + ph=self.lgs[0], + qw=w // self.lgs[0], + pw=self.lgs[0], + c=c, + ) diff --git a/main/transformer_utils/mmpose/models/backbones/modules/multihead_isa_pool_attention.py 
b/main/transformer_utils/mmpose/models/backbones/modules/multihead_isa_pool_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..3b039022102a9a26fc1210e24910edd6d9ada560 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/modules/multihead_isa_pool_attention.py @@ -0,0 +1,59 @@ +# -------------------------------------------------------- +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Modified by Lang Huang, RainbowSecret from: +# https://github.com/openseg-group/openseg.pytorch/blob/master/lib/models/modules/isa_block.py +# -------------------------------------------------------- + +import os +import pdb +import math +import torch +import torch.nn as nn + +from .multihead_isa_attention import MHA_, PadBlock, LocalPermuteModule + + +class InterlacedPoolAttention(nn.Module): + r"""interlaced sparse multi-head self attention (ISA) module with relative position bias. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): Window size. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__(self, embed_dim, num_heads, window_size=7, rpe=True, **kwargs): + super(InterlacedPoolAttention, self).__init__() + + self.dim = embed_dim + self.num_heads = num_heads + self.window_size = window_size + self.with_rpe = rpe + + self.attn = MHA_( + embed_dim, num_heads, rpe=rpe, window_size=window_size, **kwargs + ) + self.pad_helper = PadBlock(window_size) + self.permute_helper = LocalPermuteModule(window_size) + + def forward(self, x, H, W, **kwargs): + B, N, C = x.shape + x = x.view(B, H, W, C) + # attention + # pad + x_pad = self.pad_helper.pad_if_needed(x, x.size()) + # permute + x_permute = self.permute_helper.permute(x_pad, x_pad.size()) + # attention + out, _, _ = self.attn( + x_permute, x_permute, x_permute, rpe=self.with_rpe, **kwargs + ) + # reverse permutation + out = self.permute_helper.rev_permute(out, x_pad.size()) + # de-pad, pooling with `ceil_mode=True` will do implicit padding, so we need to remove it, too + out = self.pad_helper.depad_if_needed(out, x.size()) + return out.reshape(B, N, C) diff --git a/main/transformer_utils/mmpose/models/backbones/modules/transformer_block.py b/main/transformer_utils/mmpose/models/backbones/modules/transformer_block.py new file mode 100644 index 0000000000000000000000000000000000000000..9571e8c70843662d466d3903acc5d54eab27bd4c --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/modules/transformer_block.py @@ -0,0 +1,211 @@ +# -------------------------------------------------------- +# High Resolution Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Rao Fu, RainbowSecret +# -------------------------------------------------------- + +import os +import math +import logging +import torch +import torch.nn as nn +from functools import partial + +from .multihead_isa_pool_attention import InterlacedPoolAttention + +from mmcv.cnn import build_conv_layer, build_norm_layer + +BN_MOMENTUM = 0.1 + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of 
residual blocks). + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * ( + x.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self): + # (Optional)Set the extra information about this module. You can test + # it by printing an object of this class. + return "drop_prob={}".format(self.drop_prob) + + +class MlpDWBN(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + dw_act_layer=nn.GELU, + drop=0.0, + conv_cfg=None, + norm_cfg=dict(type="BN", requires_grad=True), + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = build_conv_layer( + conv_cfg, + in_features, + hidden_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + self.act1 = act_layer() + self.norm1 = build_norm_layer(norm_cfg, hidden_features)[1] + self.dw3x3 = build_conv_layer( + conv_cfg, + hidden_features, + hidden_features, + kernel_size=3, + stride=1, + padding=1, + groups=hidden_features, + ) + self.act2 = dw_act_layer() + self.norm2 = build_norm_layer(norm_cfg, hidden_features)[1] + self.fc2 = build_conv_layer( + conv_cfg, + hidden_features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + self.act3 = act_layer() + self.norm3 = build_norm_layer(norm_cfg, out_features)[1] + # self.drop = nn.Dropout(drop, inplace=True) + + def forward(self, x, H, W): + if len(x.shape) == 3: + B, N, C = x.shape + if N == (H * W + 1): + cls_tokens = x[:, 0, :] + x_ = x[:, 1:, :].permute(0, 2, 1).contiguous().reshape(B, C, H, W) + else: + x_ = x.permute(0, 2, 1).contiguous().reshape(B, C, H, W) + + x_ = self.fc1(x_) + x_ = self.norm1(x_) + x_ = self.act1(x_) + x_ = self.dw3x3(x_) + x_ = self.norm2(x_) + x_ = self.act2(x_) + # x_ = self.drop(x_) + x_ = self.fc2(x_) + x_ = self.norm3(x_) + x_ = self.act3(x_) + # x_ = self.drop(x_) + x_ = x_.reshape(B, C, -1).permute(0, 2, 1).contiguous() + if N == (H * W + 1): + x = torch.cat((cls_tokens.unsqueeze(1), x_), dim=1) + else: + x = x_ + return x + + elif len(x.shape) == 4: + x = self.fc1(x) + x = self.norm1(x) + x = self.act1(x) + x = self.dw3x3(x) + x = self.norm2(x) + x = self.act2(x) + x = self.drop(x) + x = self.fc2(x) + x = self.norm3(x) + x = self.act3(x) + x = self.drop(x) + return x + + else: + raise RuntimeError("Unsupported input shape: {}".format(x.shape)) + + +class GeneralTransformerBlock(nn.Module): + expansion = 1 + + def __init__( + self, + inplanes, + planes, + num_heads, + 
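+        # window_size is the local-group size handed to InterlacedPoolAttention below.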
window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + conv_cfg=None, + norm_cfg=dict(type="BN", requires_grad=True), + ): + super().__init__() + self.dim = inplanes + self.out_dim = planes + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.attn = InterlacedPoolAttention( + self.dim, num_heads=num_heads, window_size=window_size, dropout=attn_drop + ) + + self.norm1 = norm_layer(self.dim) + self.norm2 = norm_layer(self.out_dim) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + mlp_hidden_dim = int(self.dim * mlp_ratio) + self.mlp = MlpDWBN( + in_features=self.dim, + hidden_features=mlp_hidden_dim, + out_features=self.out_dim, + act_layer=act_layer, + dw_act_layer=act_layer, + drop=drop, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + ) + + def forward(self, x): + B, C, H, W = x.size() + # reshape + x = x.view(B, C, -1).permute(0, 2, 1).contiguous() + # Attention + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + # FFN + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + # reshape + x = x.permute(0, 2, 1).contiguous().view(B, C, H, W) + return x diff --git a/main/transformer_utils/mmpose/models/backbones/mspn.py b/main/transformer_utils/mmpose/models/backbones/mspn.py new file mode 100644 index 0000000000000000000000000000000000000000..71cee34e399780e8b67eac43d862b65a3ce05412 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/mspn.py @@ -0,0 +1,513 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp +from collections import OrderedDict + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, + normal_init) +from mmcv.runner.checkpoint import load_state_dict + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .resnet import Bottleneck as _Bottleneck +from .utils.utils import get_state_dict + + +class Bottleneck(_Bottleneck): + expansion = 4 + """Bottleneck block for MSPN. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + stride (int): stride of the block. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + """ + + def __init__(self, in_channels, out_channels, **kwargs): + super().__init__(in_channels, out_channels * 4, **kwargs) + + +class DownsampleModule(nn.Module): + """Downsample module for MSPN. + + Args: + block (nn.Module): Downsample block. + num_blocks (list): Number of blocks in each downsample unit. + num_units (int): Numbers of downsample units. Default: 4 + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the input feature to + downsample module. 
Default: 64 + """ + + def __init__(self, + block, + num_blocks, + num_units=4, + has_skip=False, + norm_cfg=dict(type='BN'), + in_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.has_skip = has_skip + self.in_channels = in_channels + assert len(num_blocks) == num_units + self.num_blocks = num_blocks + self.num_units = num_units + self.norm_cfg = norm_cfg + self.layer1 = self._make_layer(block, in_channels, num_blocks[0]) + for i in range(1, num_units): + module_name = f'layer{i + 1}' + self.add_module( + module_name, + self._make_layer( + block, in_channels * pow(2, i), num_blocks[i], stride=2)) + + def _make_layer(self, block, out_channels, blocks, stride=1): + downsample = None + if stride != 1 or self.in_channels != out_channels * block.expansion: + downsample = ConvModule( + self.in_channels, + out_channels * block.expansion, + kernel_size=1, + stride=stride, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + units = list() + units.append( + block( + self.in_channels, + out_channels, + stride=stride, + downsample=downsample, + norm_cfg=self.norm_cfg)) + self.in_channels = out_channels * block.expansion + for _ in range(1, blocks): + units.append(block(self.in_channels, out_channels)) + + return nn.Sequential(*units) + + def forward(self, x, skip1, skip2): + out = list() + for i in range(self.num_units): + module_name = f'layer{i + 1}' + module_i = getattr(self, module_name) + x = module_i(x) + if self.has_skip: + x = x + skip1[i] + skip2[i] + out.append(x) + out.reverse() + + return tuple(out) + + +class UpsampleUnit(nn.Module): + """Upsample unit for upsample module. + + Args: + ind (int): Indicates whether to interpolate (>0) and whether to + generate feature map for the next hourglass-like module. + num_units (int): Number of units that form a upsample module. Along + with ind and gen_cross_conv, nm_units is used to decide whether + to generate feature map for the next hourglass-like module. + in_channels (int): Channel number of the skip-in feature maps from + the corresponding downsample unit. + unit_channels (int): Channel number in this unit. Default:256. + gen_skip: (bool): Whether or not to generate skips for the posterior + downsample module. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + ind, + num_units, + in_channels, + unit_channels=256, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.num_units = num_units + self.norm_cfg = norm_cfg + self.in_skip = ConvModule( + in_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + self.relu = nn.ReLU(inplace=True) + + self.ind = ind + if self.ind > 0: + self.up_conv = ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + self.gen_skip = gen_skip + if self.gen_skip: + self.out_skip1 = ConvModule( + in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.out_skip2 = ConvModule( + unit_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.gen_cross_conv = gen_cross_conv + if self.ind == num_units - 1 and self.gen_cross_conv: + self.cross_conv = ConvModule( + unit_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + def forward(self, x, up_x): + out = self.in_skip(x) + + if self.ind > 0: + up_x = F.interpolate( + up_x, + size=(x.size(2), x.size(3)), + mode='bilinear', + align_corners=True) + up_x = self.up_conv(up_x) + out = out + up_x + out = self.relu(out) + + skip1 = None + skip2 = None + if self.gen_skip: + skip1 = self.out_skip1(x) + skip2 = self.out_skip2(out) + + cross_conv = None + if self.ind == self.num_units - 1 and self.gen_cross_conv: + cross_conv = self.cross_conv(out) + + return out, skip1, skip2, cross_conv + + +class UpsampleModule(nn.Module): + """Upsample module for MSPN. + + Args: + unit_channels (int): Channel number in the upsample units. + Default:256. + num_units (int): Numbers of upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + unit_channels=256, + num_units=4, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.in_channels = list() + for i in range(num_units): + self.in_channels.append(Bottleneck.expansion * out_channels * + pow(2, i)) + self.in_channels.reverse() + self.num_units = num_units + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.norm_cfg = norm_cfg + for i in range(num_units): + module_name = f'up{i + 1}' + self.add_module( + module_name, + UpsampleUnit( + i, + self.num_units, + self.in_channels[i], + unit_channels, + self.gen_skip, + self.gen_cross_conv, + norm_cfg=self.norm_cfg, + out_channels=64)) + + def forward(self, x): + out = list() + skip1 = list() + skip2 = list() + cross_conv = None + for i in range(self.num_units): + module_i = getattr(self, f'up{i + 1}') + if i == 0: + outi, skip1_i, skip2_i, _ = module_i(x[i], None) + elif i == self.num_units - 1: + outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) + else: + outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) + out.append(outi) + skip1.append(skip1_i) + skip2.append(skip2_i) + skip1.reverse() + skip2.reverse() + + return out, skip1, skip2, cross_conv + + +class SingleStageNetwork(nn.Module): + """Single_stage Network. + + Args: + unit_channels (int): Channel number in the upsample units. Default:256. + num_units (int): Numbers of downsample/upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_blocks (list): Number of blocks in each downsample unit. + Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the feature from ResNetTop. + Default: 64. + """ + + def __init__(self, + has_skip=False, + gen_skip=False, + gen_cross_conv=False, + unit_channels=256, + num_units=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + in_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + assert len(num_blocks) == num_units + self.has_skip = has_skip + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.num_units = num_units + self.unit_channels = unit_channels + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + self.downsample = DownsampleModule(Bottleneck, num_blocks, num_units, + has_skip, norm_cfg, in_channels) + self.upsample = UpsampleModule(unit_channels, num_units, gen_skip, + gen_cross_conv, norm_cfg, in_channels) + + def forward(self, x, skip1, skip2): + mid = self.downsample(x, skip1, skip2) + out, skip1, skip2, cross_conv = self.upsample(mid) + + return out, skip1, skip2, cross_conv + + +class ResNetTop(nn.Module): + """ResNet top for MSPN. + + Args: + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + channels (int): Number of channels of the feature output by ResNetTop. 
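+            Default: 64.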
+ """ + + def __init__(self, norm_cfg=dict(type='BN'), channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.top = nn.Sequential( + ConvModule( + 3, + channels, + kernel_size=7, + stride=2, + padding=3, + norm_cfg=norm_cfg, + inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) + + def forward(self, img): + return self.top(img) + + +@BACKBONES.register_module() +class MSPN(BaseBackbone): + """MSPN backbone. Paper ref: Li et al. "Rethinking on Multi-Stage Networks + for Human Pose Estimation" (CVPR 2020). + + Args: + unit_channels (int): Number of Channels in an upsample unit. + Default: 256 + num_stages (int): Number of stages in a multi-stage MSPN. Default: 4 + num_units (int): Number of downsample/upsample units in a single-stage + network. Default: 4 + Note: Make sure num_units == len(self.num_blocks) + num_blocks (list): Number of bottlenecks in each + downsample unit. Default: [2, 2, 2, 2] + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + res_top_channels (int): Number of channels of feature from ResNetTop. + Default: 64. + + Example: + >>> from mmpose.models import MSPN + >>> import torch + >>> self = MSPN(num_stages=2,num_units=2,num_blocks=[2,2]) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... for feature in level_output: + ... print(tuple(feature.shape)) + ... + (1, 256, 64, 64) + (1, 256, 128, 128) + (1, 256, 64, 64) + (1, 256, 128, 128) + """ + + def __init__(self, + unit_channels=256, + num_stages=4, + num_units=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + res_top_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + self.unit_channels = unit_channels + self.num_stages = num_stages + self.num_units = num_units + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + assert self.num_stages > 0 + assert self.num_units > 1 + assert self.num_units == len(self.num_blocks) + self.top = ResNetTop(norm_cfg=norm_cfg) + self.multi_stage_mspn = nn.ModuleList([]) + for i in range(self.num_stages): + if i == 0: + has_skip = False + else: + has_skip = True + if i != self.num_stages - 1: + gen_skip = True + gen_cross_conv = True + else: + gen_skip = False + gen_cross_conv = False + self.multi_stage_mspn.append( + SingleStageNetwork(has_skip, gen_skip, gen_cross_conv, + unit_channels, num_units, num_blocks, + norm_cfg, res_top_channels)) + + def forward(self, x): + """Model forward function.""" + out_feats = [] + skip1 = None + skip2 = None + x = self.top(x) + for i in range(self.num_stages): + out, skip1, skip2, x = self.multi_stage_mspn[i](x, skip1, skip2) + out_feats.append(out) + + return out_feats + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + if isinstance(pretrained, str): + logger = get_root_logger() + state_dict_tmp = get_state_dict(pretrained) + state_dict = OrderedDict() + state_dict['top'] = OrderedDict() + state_dict['bottlenecks'] = OrderedDict() + for k, v in state_dict_tmp.items(): + if k.startswith('layer'): + if 'downsample.0' in k: + state_dict['bottlenecks'][k.replace( + 'downsample.0', 'downsample.conv')] = v + elif 'downsample.1' in k: + state_dict['bottlenecks'][k.replace( + 'downsample.1', 'downsample.bn')] = v + else: + state_dict['bottlenecks'][k] = v + elif k.startswith('conv1'): + 
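+                    # map the stem's conv1/bn1 weights onto ResNetTop's ConvModule
+                    # naming (top.0.conv / top.0.bn).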
state_dict['top'][k.replace('conv1', 'top.0.conv')] = v + elif k.startswith('bn1'): + state_dict['top'][k.replace('bn1', 'top.0.bn')] = v + + load_state_dict( + self.top, state_dict['top'], strict=False, logger=logger) + for i in range(self.num_stages): + load_state_dict( + self.multi_stage_mspn[i].downsample, + state_dict['bottlenecks'], + strict=False, + logger=logger) + else: + for m in self.multi_stage_mspn.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + for m in self.top.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) diff --git a/main/transformer_utils/mmpose/models/backbones/pvt.py b/main/transformer_utils/mmpose/models/backbones/pvt.py new file mode 100644 index 0000000000000000000000000000000000000000..62527a7dc817513c08f42ccbb166c75cab514873 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/pvt.py @@ -0,0 +1,592 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (Conv2d, build_activation_layer, build_norm_layer, + constant_init, normal_init, trunc_normal_init) +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.transformer import MultiheadAttention +from mmcv.cnn.utils.weight_init import trunc_normal_ +from mmcv.runner import (BaseModule, ModuleList, Sequential, _load_checkpoint, + load_state_dict) +from torch.nn.modules.utils import _pair as to_2tuple + +from ...utils import get_root_logger +from ..builder import BACKBONES +from ..utils import PatchEmbed, nchw_to_nlc, nlc_to_nchw, pvt_convert + + +class MixFFN(BaseModule): + """An implementation of MixFFN of PVT. + + The differences between MixFFN & FFN: + 1. Use 1X1 Conv to replace Linear layer. + 2. Introduce 3X3 Depth-wise Conv to encode positional information. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. + feedforward_channels (int): The hidden dimension of FFNs. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='GELU'). + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + Default: None. + use_conv (bool): If True, add 3x3 DWConv between two Linear layers. + Defaults: False. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims, + feedforward_channels, + act_cfg=dict(type='GELU'), + ffn_drop=0., + dropout_layer=None, + use_conv=False, + init_cfg=None): + super(MixFFN, self).__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.act_cfg = act_cfg + activate = build_activation_layer(act_cfg) + + in_channels = embed_dims + fc1 = Conv2d( + in_channels=in_channels, + out_channels=feedforward_channels, + kernel_size=1, + stride=1, + bias=True) + if use_conv: + # 3x3 depth wise conv to provide positional encode information + dw_conv = Conv2d( + in_channels=feedforward_channels, + out_channels=feedforward_channels, + kernel_size=3, + stride=1, + padding=(3 - 1) // 2, + bias=True, + groups=feedforward_channels) + fc2 = Conv2d( + in_channels=feedforward_channels, + out_channels=in_channels, + kernel_size=1, + stride=1, + bias=True) + drop = nn.Dropout(ffn_drop) + layers = [fc1, activate, drop, fc2, drop] + if use_conv: + layers.insert(1, dw_conv) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + + def forward(self, x, hw_shape, identity=None): + out = nlc_to_nchw(x, hw_shape) + out = self.layers(out) + out = nchw_to_nlc(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +class SpatialReductionAttention(MultiheadAttention): + """An implementation of Spatial Reduction Attention of PVT. + + This module is modified from MultiheadAttention which is a module from + mmcv.cnn.bricks.transformer. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. Default: None. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default: False. + qkv_bias (bool): enable bias for qkv if True. Default: True. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention of PVT. Default: 1. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=None, + batch_first=True, + qkv_bias=True, + norm_cfg=dict(type='LN'), + sr_ratio=1, + init_cfg=None): + super().__init__( + embed_dims, + num_heads, + attn_drop, + proj_drop, + batch_first=batch_first, + dropout_layer=dropout_layer, + bias=qkv_bias, + init_cfg=init_cfg) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = Conv2d( + in_channels=embed_dims, + out_channels=embed_dims, + kernel_size=sr_ratio, + stride=sr_ratio) + # The ret[0] of build_norm_layer is norm name. + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + + # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa + from mmpose import digit_version, mmcv_version + if mmcv_version < digit_version('1.3.17'): + warnings.warn('The legacy version of forward function in' + 'SpatialReductionAttention is deprecated in' + 'mmcv>=1.3.17 and will no longer support in the' + 'future. 
Please upgrade your mmcv.') + self.forward = self.legacy_forward + + def forward(self, x, hw_shape, identity=None): + + x_q = x + if self.sr_ratio > 1: + x_kv = nlc_to_nchw(x, hw_shape) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) + x_kv = self.norm(x_kv) + else: + x_kv = x + + if identity is None: + identity = x_q + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + x_q = x_q.transpose(0, 1) + x_kv = x_kv.transpose(0, 1) + + out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + def legacy_forward(self, x, hw_shape, identity=None): + """multi head attention forward in mmcv version < 1.3.17.""" + x_q = x + if self.sr_ratio > 1: + x_kv = nlc_to_nchw(x, hw_shape) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) + x_kv = self.norm(x_kv) + else: + x_kv = x + + if identity is None: + identity = x_q + + out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] + + return identity + self.dropout_layer(self.proj_drop(out)) + + +class PVTEncoderLayer(BaseModule): + """Implements one encoder layer in PVT. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed. + after the feed forward layer. Default: 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0. + drop_path_rate (float): stochastic depth rate. Default: 0.0. + qkv_bias (bool): enable bias for qkv if True. + Default: True. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention of PVT. Default: 1. + use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. + Default: False. + init_cfg (dict, optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + qkv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + sr_ratio=1, + use_conv_ffn=False, + init_cfg=None): + super(PVTEncoderLayer, self).__init__(init_cfg=init_cfg) + + # The ret[0] of build_norm_layer is norm name. + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + + self.attn = SpatialReductionAttention( + embed_dims=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + sr_ratio=sr_ratio) + + # The ret[0] of build_norm_layer is norm name. 
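+        # like norm1 above, norm2 is applied before its branch (MixFFN), i.e. a
+        # pre-norm residual layout; see forward().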
+ self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + + self.ffn = MixFFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + use_conv=use_conv_ffn, + act_cfg=act_cfg) + + def forward(self, x, hw_shape): + x = self.attn(self.norm1(x), hw_shape, identity=x) + x = self.ffn(self.norm2(x), hw_shape, identity=x) + + return x + + +class AbsolutePositionEmbedding(BaseModule): + """An implementation of the absolute position embedding in PVT. + + Args: + pos_shape (int): The shape of the absolute position embedding. + pos_dim (int): The dimension of the absolute position embedding. + drop_rate (float): Probability of an element to be zeroed. + Default: 0.0. + """ + + def __init__(self, pos_shape, pos_dim, drop_rate=0., init_cfg=None): + super().__init__(init_cfg=init_cfg) + + if isinstance(pos_shape, int): + pos_shape = to_2tuple(pos_shape) + elif isinstance(pos_shape, tuple): + if len(pos_shape) == 1: + pos_shape = to_2tuple(pos_shape[0]) + assert len(pos_shape) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pos_shape)}' + self.pos_shape = pos_shape + self.pos_dim = pos_dim + + self.pos_embed = nn.Parameter( + torch.zeros(1, pos_shape[0] * pos_shape[1], pos_dim)) + self.drop = nn.Dropout(p=drop_rate) + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + + def resize_pos_embed(self, pos_embed, input_shape, mode='bilinear'): + """Resize pos_embed weights. + + Resize pos_embed using bilinear interpolate method. + + Args: + pos_embed (torch.Tensor): Position embedding weights. + input_shape (tuple): Tuple for (downsampled input image height, + downsampled input image width). + mode (str): Algorithm used for upsampling: + ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | + ``'trilinear'``. Default: ``'bilinear'``. + + Return: + torch.Tensor: The resized pos_embed of shape [B, L_new, C]. + """ + assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' + pos_h, pos_w = self.pos_shape + pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] + pos_embed_weight = pos_embed_weight.reshape( + 1, pos_h, pos_w, self.pos_dim).permute(0, 3, 1, 2).contiguous() + pos_embed_weight = F.interpolate( + pos_embed_weight, size=input_shape, mode=mode) + pos_embed_weight = torch.flatten(pos_embed_weight, + 2).transpose(1, 2).contiguous() + pos_embed = pos_embed_weight + + return pos_embed + + def forward(self, x, hw_shape, mode='bilinear'): + pos_embed = self.resize_pos_embed(self.pos_embed, hw_shape, mode) + return self.drop(x + pos_embed) + + +@BACKBONES.register_module() +class PyramidVisionTransformer(BaseModule): + """Pyramid Vision Transformer (PVT) + + Implementation of `Pyramid Vision Transformer: A Versatile Backbone for + Dense Prediction without Convolutions + `_. + + Args: + pretrain_img_size (int | tuple[int]): The size of input image when + pretrain. Defaults: 224. + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): Embedding dimension. Default: 64. + num_stags (int): The num of stages. Default: 4. + num_layers (Sequence[int]): The layer number of each transformer encode + layer. Default: [3, 4, 6, 3]. + num_heads (Sequence[int]): The attention heads of each transformer + encode layer. Default: [1, 2, 5, 8]. + patch_sizes (Sequence[int]): The patch_size of each patch embedding. + Default: [4, 2, 2, 2]. + strides (Sequence[int]): The stride of each patch embedding. + Default: [4, 2, 2, 2]. 
+ paddings (Sequence[int]): The padding of each patch embedding. + Default: [0, 0, 0, 0]. + sr_ratios (Sequence[int]): The spatial reduction rate of each + transformer encode layer. Default: [8, 4, 2, 1]. + out_indices (Sequence[int] | int): Output from which stages. + Default: (0, 1, 2, 3). + mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the + embedding dim of each transformer encode layer. + Default: [8, 8, 4, 4]. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0. + drop_path_rate (float): stochastic depth rate. Default 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults: True. + use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. + Default: False. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + pretrained (str, optional): model pretrained path. Default: None. + convert_weights (bool): The flag indicates whether the + pre-trained model is from the original repo. We may need + to convert some keys to make it compatible. + Default: True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + pretrain_img_size=224, + in_channels=3, + embed_dims=64, + num_stages=4, + num_layers=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + patch_sizes=[4, 2, 2, 2], + strides=[4, 2, 2, 2], + paddings=[0, 0, 0, 0], + sr_ratios=[8, 4, 2, 1], + out_indices=(0, 1, 2, 3), + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + use_abs_pos_embed=True, + norm_after_stage=False, + use_conv_ffn=False, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN', eps=1e-6), + pretrained=None, + convert_weights=True, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.convert_weights = convert_weights + if isinstance(pretrain_img_size, int): + pretrain_img_size = to_2tuple(pretrain_img_size) + elif isinstance(pretrain_img_size, tuple): + if len(pretrain_img_size) == 1: + pretrain_img_size = to_2tuple(pretrain_img_size[0]) + assert len(pretrain_img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pretrain_img_size)}' + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + self.init_cfg = init_cfg + else: + raise TypeError('pretrained must be a str or None') + + self.embed_dims = embed_dims + + self.num_stages = num_stages + self.num_layers = num_layers + self.num_heads = num_heads + self.patch_sizes = patch_sizes + self.strides = strides + self.sr_ratios = sr_ratios + assert num_stages == len(num_layers) == len(num_heads) \ + == len(patch_sizes) == len(strides) == len(sr_ratios) + + self.out_indices = out_indices + assert max(out_indices) < self.num_stages + self.pretrained = pretrained + + # transformer encoder + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(num_layers)) + ] # stochastic num_layer decay rule + + cur = 0 + self.layers = ModuleList() + for i, num_layer in enumerate(num_layers): + embed_dims_i = embed_dims * num_heads[i] + patch_embed = PatchEmbed( + in_channels=in_channels, + 
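+                # embed_dims_i = embed_dims * num_heads[i] (set above), so the channel
+                # width of each stage grows with its head count.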
embed_dims=embed_dims_i, + kernel_size=patch_sizes[i], + stride=strides[i], + padding=paddings[i], + bias=True, + norm_cfg=norm_cfg) + + layers = ModuleList() + if use_abs_pos_embed: + pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1]) + pos_embed = AbsolutePositionEmbedding( + pos_shape=pos_shape, + pos_dim=embed_dims_i, + drop_rate=drop_rate) + layers.append(pos_embed) + layers.extend([ + PVTEncoderLayer( + embed_dims=embed_dims_i, + num_heads=num_heads[i], + feedforward_channels=mlp_ratios[i] * embed_dims_i, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[cur + idx], + qkv_bias=qkv_bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + sr_ratio=sr_ratios[i], + use_conv_ffn=use_conv_ffn) for idx in range(num_layer) + ]) + in_channels = embed_dims_i + # The ret[0] of build_norm_layer is norm name. + if norm_after_stage: + norm = build_norm_layer(norm_cfg, embed_dims_i)[1] + else: + norm = nn.Identity() + self.layers.append(ModuleList([patch_embed, layers, norm])) + cur += num_layer + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + + logger = get_root_logger() + if self.init_cfg is None: + logger.warn(f'No pre-trained weights for ' + f'{self.__class__.__name__}, ' + f'training start from scratch') + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init(m, 0, math.sqrt(2.0 / fan_out)) + elif isinstance(m, AbsolutePositionEmbedding): + m.init_weights() + else: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + checkpoint = _load_checkpoint( + self.init_cfg['checkpoint'], logger=logger, map_location='cpu') + logger.warn(f'Load pre-trained model for ' + f'{self.__class__.__name__} from original repo') + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + if self.convert_weights: + # Because pvt backbones are not supported by mmcls, + # so we need to convert pre-trained weights to match this + # implementation. 
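+                # pvt_convert remaps the key names of the official PVT checkpoint onto
+                # this implementation before load_state_dict is called.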
+ state_dict = pvt_convert(state_dict) + load_state_dict(self, state_dict, strict=False, logger=logger) + + def forward(self, x): + outs = [] + + for i, layer in enumerate(self.layers): + x, hw_shape = layer[0](x) + + for block in layer[1]: + x = block(x, hw_shape) + x = layer[2](x) + x = nlc_to_nchw(x, hw_shape) + if i in self.out_indices: + outs.append(x) + + return outs + + +@BACKBONES.register_module() +class PyramidVisionTransformerV2(PyramidVisionTransformer): + """Implementation of `PVTv2: Improved Baselines with Pyramid Vision + Transformer `_.""" + + def __init__(self, **kwargs): + super(PyramidVisionTransformerV2, self).__init__( + patch_sizes=[7, 3, 3, 3], + paddings=[3, 1, 1, 1], + use_abs_pos_embed=False, + norm_after_stage=True, + use_conv_ffn=True, + **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/regnet.py b/main/transformer_utils/mmpose/models/backbones/regnet.py new file mode 100644 index 0000000000000000000000000000000000000000..693417c2d61066e4e9a90989ad61700448028e58 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/regnet.py @@ -0,0 +1,317 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import numpy as np +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import ResNet +from .resnext import Bottleneck + + +@BACKBONES.register_module() +class RegNet(ResNet): + """RegNet backbone. + + More details can be found in `paper `__ . + + Args: + arch (dict): The parameter of RegNets. + - w0 (int): initial width + - wa (float): slope of width + - wm (float): quantization parameter to quantize the width + - depth (int): depth of the backbone + - group_w (int): width of group + - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck. + strides (Sequence[int]): Strides of the first block of each stage. + base_channels (int): Base channels after stem layer. + in_channels (int): Number of input image channels. Default: 3. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. Default: "pytorch". + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. Default: -1. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import RegNet + >>> import torch + >>> self = RegNet( + arch=dict( + w0=88, + wa=26.31, + wm=2.25, + group_w=48, + depth=25, + bot_mul=1.0), + out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 96, 8, 8) + (1, 192, 4, 4) + (1, 432, 2, 2) + (1, 1008, 1, 1) + """ + arch_settings = { + 'regnetx_400mf': + dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), + 'regnetx_800mf': + dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), + 'regnetx_1.6gf': + dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), + 'regnetx_3.2gf': + dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), + 'regnetx_4.0gf': + dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), + 'regnetx_6.4gf': + dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), + 'regnetx_8.0gf': + dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), + 'regnetx_12gf': + dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), + } + + def __init__(self, + arch, + in_channels=3, + stem_channels=32, + base_channels=32, + strides=(2, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super(ResNet, self).__init__() + + # Generate RegNet parameters first + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'"arch": "{arch}" is not one of the' \ + ' arch_settings' + arch = self.arch_settings[arch] + elif not isinstance(arch, dict): + raise TypeError('Expect "arch" to be either a string ' + f'or a dict, got {type(arch)}') + + widths, num_stages = self.generate_regnet( + arch['w0'], + arch['wa'], + arch['wm'], + arch['depth'], + ) + # Convert to per stage format + stage_widths, stage_blocks = self.get_stages_from_blocks(widths) + # Generate group widths and bot muls + group_widths = [arch['group_w'] for _ in range(num_stages)] + self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] + # Adjust the compatibility of stage_widths and group_widths + stage_widths, group_widths = self.adjust_width_group( + stage_widths, self.bottleneck_ratio, group_widths) + + # Group params by stage + self.stage_widths = stage_widths + self.group_widths = group_widths + self.depth = sum(stage_blocks) + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + if self.deep_stem: + raise NotImplementedError( + 'deep_stem has not been implemented for RegNet') + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.stage_blocks = stage_blocks[:num_stages] + + self._make_stem_layer(in_channels, stem_channels) + + _in_channels = stem_channels + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + group_width = self.group_widths[i] + width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i])) + stage_groups = width // group_width + + res_layer = self.make_res_layer( + block=Bottleneck, + num_blocks=num_blocks, + in_channels=_in_channels, + 
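+                # expansion=1, so each stage's output width is stage_widths[i] itself;
+                # groups / width_per_group realize the RegNet grouped bottleneck.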
out_channels=self.stage_widths[i], + expansion=1, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + base_channels=self.stage_widths[i], + groups=stage_groups, + width_per_group=group_width) + _in_channels = self.stage_widths[i] + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = stage_widths[-1] + + def _make_stem_layer(self, in_channels, base_channels): + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + base_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, base_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + + @staticmethod + def generate_regnet(initial_width, + width_slope, + width_parameter, + depth, + divisor=8): + """Generates per block width from RegNet parameters. + + Args: + initial_width ([int]): Initial width of the backbone + width_slope ([float]): Slope of the quantized linear function + width_parameter ([int]): Parameter used to quantize the width. + depth ([int]): Depth of the backbone. + divisor (int, optional): The divisor of channels. Defaults to 8. + + Returns: + list, int: return a list of widths of each stage and the number of + stages + """ + assert width_slope >= 0 + assert initial_width > 0 + assert width_parameter > 1 + assert initial_width % divisor == 0 + widths_cont = np.arange(depth) * width_slope + initial_width + ks = np.round( + np.log(widths_cont / initial_width) / np.log(width_parameter)) + widths = initial_width * np.power(width_parameter, ks) + widths = np.round(np.divide(widths, divisor)) * divisor + num_stages = len(np.unique(widths)) + widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() + return widths, num_stages + + @staticmethod + def quantize_float(number, divisor): + """Converts a float to closest non-zero int divisible by divior. + + Args: + number (int): Original number to be quantized. + divisor (int): Divisor used to quantize the number. + + Returns: + int: quantized number that is divisible by devisor. + """ + return int(round(number / divisor) * divisor) + + def adjust_width_group(self, widths, bottleneck_ratio, groups): + """Adjusts the compatibility of widths and groups. + + Args: + widths (list[int]): Width of each stage. + bottleneck_ratio (float): Bottleneck ratio. + groups (int): number of groups in each stage + + Returns: + tuple(list): The adjusted widths and groups of each stage. + """ + bottleneck_width = [ + int(w * b) for w, b in zip(widths, bottleneck_ratio) + ] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] + bottleneck_width = [ + self.quantize_float(w_bot, g) + for w_bot, g in zip(bottleneck_width, groups) + ] + widths = [ + int(w_bot / b) + for w_bot, b in zip(bottleneck_width, bottleneck_ratio) + ] + return widths, groups + + def get_stages_from_blocks(self, widths): + """Gets widths/stage_blocks of network at each stage. + + Args: + widths (list[int]): Width in each stage. 
+ + Returns: + tuple(list): width and depth of each stage + """ + width_diff = [ + width != width_prev + for width, width_prev in zip(widths + [0], [0] + widths) + ] + stage_widths = [ + width for width, diff in zip(widths, width_diff[:-1]) if diff + ] + stage_blocks = np.diff([ + depth for depth, diff in zip(range(len(width_diff)), width_diff) + if diff + ]).tolist() + return stage_widths, stage_blocks + + def forward(self, x): + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) diff --git a/main/transformer_utils/mmpose/models/backbones/resnest.py b/main/transformer_utils/mmpose/models/backbones/resnest.py new file mode 100644 index 0000000000000000000000000000000000000000..0a2d4081df1417155f0626646f5fe3d0dbfc2864 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/resnest.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResLayer, ResNetV1d + + +class RSoftmax(nn.Module): + """Radix Softmax module in ``SplitAttentionConv2d``. + + Args: + radix (int): Radix of input. + groups (int): Groups of input. + """ + + def __init__(self, radix, groups): + super().__init__() + self.radix = radix + self.groups = groups + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x + + +class SplitAttentionConv2d(nn.Module): + """Split-Attention Conv2d. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int | tuple[int]): Same as nn.Conv2d. + stride (int | tuple[int]): Same as nn.Conv2d. + padding (int | tuple[int]): Same as nn.Conv2d. + dilation (int | tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. 
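A minimal shape sketch for the split-attention convolution described above; illustrative only, and it assumes the vendored `mmpose` package (and its `mmcv` dependency) is importable in this environment:

```python
import torch
from mmpose.models.backbones.resnest import SplitAttentionConv2d

# radix=2 splits the 3x3 convolution into two branches whose outputs are
# re-weighted by RSoftmax attention and summed, so the channel count is
# unchanged (64 in, 64 out) and padding=1 preserves the spatial size.
conv = SplitAttentionConv2d(64, 64, kernel_size=3, padding=1, radix=2)
x = torch.randn(1, 64, 56, 56)
print(tuple(conv(x).shape))  # (1, 64, 56, 56)
```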
+ """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN')): + super().__init__() + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = RSoftmax(radix, groups) + + @property + def norm0(self): + return getattr(self, self.norm0_name) + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
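For context, the `ResNeSt` backbone registered at the end of this file stacks these blocks on a ResNetV1d-style stem. A rough usage sketch (output shapes follow from `arch_settings` and the default `base_channels=64`; assumes `mmpose` is importable):

```python
import torch
from mmpose.models.backbones.resnest import ResNeSt

model = ResNeSt(depth=50, out_indices=(0, 1, 2, 3))
model.eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))
for f in feats:
    print(tuple(f.shape))
# (1, 256, 56, 56)
# (1, 512, 28, 28)
# (1, 1024, 14, 14)
# (1, 2048, 7, 7)
```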
+ """ + + def __init__(self, + in_channels, + out_channels, + groups=1, + width_per_group=4, + base_channels=64, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + + self.groups = groups + self.width_per_group = width_per_group + + # For ResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for ResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. + if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = SplitAttentionConv2d( + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=1 if self.avg_down_stride else self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + radix=radix, + reduction_factor=reduction_factor, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + delattr(self, self.norm2_name) + + if self.avg_down_stride: + self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) + + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + + if self.avg_down_stride: + out = self.avd_layer(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class ResNeSt(ResNetV1d): + """ResNeSt backbone. + + Please refer to the `paper `__ + for details. + + Args: + depth (int): Network depth, from {50, 101, 152, 200}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of SplitAttentionConv2d. + Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. 
If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)), + 200: (Bottleneck, (3, 24, 36, 3)), + 269: (Bottleneck, (3, 30, 48, 8)) + } + + def __init__(self, + depth, + groups=1, + width_per_group=4, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + self.groups = groups + self.width_per_group = width_per_group + self.radix = radix + self.reduction_factor = reduction_factor + self.avg_down_stride = avg_down_stride + super().__init__(depth=depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + radix=self.radix, + reduction_factor=self.reduction_factor, + avg_down_stride=self.avg_down_stride, + **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/resnet.py b/main/transformer_utils/mmpose/models/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..649496a755020140d94eb32fbe79d1ff135c86ca --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/resnet.py @@ -0,0 +1,701 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, + constant_init, kaiming_init) +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class BasicBlock(nn.Module): + """BasicBlock for ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the output channels of conv1. This is a + reserved argument in BasicBlock and should always be 1. Default: 1. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): `pytorch` or `caffe`. It is unused and reserved for + unified API with Bottleneck. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + expansion=1, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert self.expansion == 1 + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, out_channels, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + 3, + padding=1, + bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + """Bottleneck block for ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the input/output channels of conv2. Default: 4. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Default: "pytorch". + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + expansion=4, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert style in ['pytorch', 'caffe'] + + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: the normalization layer named "norm3" """ + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def get_expansion(block, expansion=None): + """Get the expansion of a residual block. + + The block expansion will be obtained by the following order: + + 1. If ``expansion`` is given, just return it. + 2. If ``block`` has the attribute ``expansion``, then return + ``block.expansion``. + 3. Return the default value according the the block type: + 1 for ``BasicBlock`` and 4 for ``Bottleneck``. + + Args: + block (class): The block class. + expansion (int | None): The given expansion ratio. + + Returns: + int: The expansion of the block. 
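A small illustration of the resolution order described above, assuming this module is importable as `mmpose.models.backbones.resnet`:

```python
from mmpose.models.backbones.resnet import (BasicBlock, Bottleneck,
                                             get_expansion)

print(get_expansion(BasicBlock))     # 1  (default for BasicBlock subclasses)
print(get_expansion(Bottleneck))     # 4  (default for Bottleneck subclasses)
print(get_expansion(Bottleneck, 2))  # 2  (an explicit value always wins)
```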
+ """ + if isinstance(expansion, int): + assert expansion > 0 + elif expansion is None: + if hasattr(block, 'expansion'): + expansion = block.expansion + elif issubclass(block, BasicBlock): + expansion = 1 + elif issubclass(block, Bottleneck): + expansion = 4 + else: + raise TypeError(f'expansion is not specified for {block.__name__}') + else: + raise TypeError('expansion must be an integer or None') + + return expansion + + +class ResLayer(nn.Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): Residual block used to build ResLayer. + num_blocks (int): Number of blocks. + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int, optional): The expansion for BasicBlock/Bottleneck. + If not specified, it will firstly be obtained via + ``block.expansion``. If the block has no attribute "expansion", + the following default values will be used: 1 for BasicBlock and + 4 for Bottleneck. Default: None. + stride (int): stride of the first block. Default: 1. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + """ + + def __init__(self, + block, + num_blocks, + in_channels, + out_channels, + expansion=None, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + **kwargs): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + self.block = block + self.expansion = get_expansion(block, expansion) + + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = [] + conv_stride = stride + if avg_down and stride != 1: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + in_channels = out_channels + for _ in range(1, num_blocks): + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + else: # downsample_first=False is for HourglassModule + for i in range(0, num_blocks - 1): + layers.append( + block( + in_channels=in_channels, + out_channels=in_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + super().__init__(*layers) + + +@BACKBONES.register_module() +class ResNet(BaseBackbone): + """ResNet backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {18, 34, 50, 101, 152}. 
+ in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + base_channels (int): Middle channels of the first stage. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import ResNet + >>> import torch + >>> self = ResNet(depth=18, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=64, + base_channels=64, + expansion=None, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.expansion = get_expansion(self.block, expansion) + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + _in_channels = stem_channels + _out_channels = base_channels * self.expansion + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + res_layer = self.make_res_layer( + block=self.block, + num_blocks=num_blocks, + in_channels=_in_channels, + out_channels=_out_channels, + expansion=self.expansion, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + _in_channels = _out_channels + _out_channels *= 2 + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = res_layer[-1].out_channels + + def make_res_layer(self, **kwargs): + """Make a ResLayer.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + """Make stem layer.""" + if self.deep_stem: + self.stem = nn.Sequential( + ConvModule( + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + 
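+            # ``build_norm_layer`` returns a (name, module) pair; registering the
+            # module under that generated name (e.g. "bn1") keeps parameter keys
+            # compatible with pretrained checkpoints, and the ``norm1`` property
+            # retrieves the layer again via ``getattr`` on the stored name.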
self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@BACKBONES.register_module() +class ResNetV1d(ResNet): + r"""ResNetV1d variant described in `Bag of Tricks + `__. + + Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in + the input stem with three 3x3 convs. And in the downsampling block, a 2x2 + avg_pool with stride 2 is added before conv, whose stride is changed to 1. + """ + + def __init__(self, **kwargs): + super().__init__(deep_stem=True, avg_down=True, **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/resnext.py b/main/transformer_utils/mmpose/models/backbones/resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..c10dc33f98ac3229c77bf306acf19950c295f904 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/resnext.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResLayer, ResNet + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeXt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + style (str): `pytorch` or `caffe`. 
If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + """ + + def __init__(self, + in_channels, + out_channels, + base_channels=64, + groups=32, + width_per_group=4, + **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.groups = groups + self.width_per_group = width_per_group + + # For ResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for ResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. + if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. 
+ norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import ResNeXt + >>> import torch + >>> self = ResNeXt(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 8, 8) + (1, 512, 4, 4) + (1, 1024, 2, 2) + (1, 2048, 1, 1) + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, groups=32, width_per_group=4, **kwargs): + self.groups = groups + self.width_per_group = width_per_group + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/rsn.py b/main/transformer_utils/mmpose/models/backbones/rsn.py new file mode 100644 index 0000000000000000000000000000000000000000..29038afe2a77dcb3d3b027b1549d478916a50727 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/rsn.py @@ -0,0 +1,616 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (ConvModule, MaxPool2d, constant_init, kaiming_init, + normal_init) + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class RSB(nn.Module): + """Residual Steps block for RSN. Paper ref: Cai et al. "Learning Delicate + Local Representations for Multi-Person Pose Estimation" (ECCV 2020). + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + num_steps (int): Numbers of steps in RSB + stride (int): stride of the block. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + expand_times (int): Times by which the in_channels are expanded. + Default:26. + res_top_channels (int): Number of channels of feature output by + ResNet_top. Default:64. 
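A rough shape sketch of a single residual-steps block under its default settings (illustrative; assumes `mmpose` is importable):

```python
import torch
from mmpose.models.backbones.rsn import RSB

# With in_channels=64, expand_times=26 and res_top_channels=64, each of the
# num_steps=4 branches works on 64 * 26 // 64 = 26 channels; the step outputs
# are concatenated and projected back to out_channels, so the shape is kept.
block = RSB(in_channels=64, out_channels=64)
x = torch.randn(1, 64, 56, 56)
print(tuple(block(x).shape))  # (1, 64, 56, 56)
```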
+ """ + + expansion = 1 + + def __init__(self, + in_channels, + out_channels, + num_steps=4, + stride=1, + downsample=None, + with_cp=False, + norm_cfg=dict(type='BN'), + expand_times=26, + res_top_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + assert num_steps > 1 + self.in_channels = in_channels + self.branch_channels = self.in_channels * expand_times + self.branch_channels //= res_top_channels + self.out_channels = out_channels + self.stride = stride + self.downsample = downsample + self.with_cp = with_cp + self.norm_cfg = norm_cfg + self.num_steps = num_steps + self.conv_bn_relu1 = ConvModule( + self.in_channels, + self.num_steps * self.branch_channels, + kernel_size=1, + stride=self.stride, + padding=0, + norm_cfg=self.norm_cfg, + inplace=False) + for i in range(self.num_steps): + for j in range(i + 1): + module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' + self.add_module( + module_name, + ConvModule( + self.branch_channels, + self.branch_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg, + inplace=False)) + self.conv_bn3 = ConvModule( + self.num_steps * self.branch_channels, + self.out_channels * self.expansion, + kernel_size=1, + stride=1, + padding=0, + act_cfg=None, + norm_cfg=self.norm_cfg, + inplace=False) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + """Forward function.""" + + identity = x + x = self.conv_bn_relu1(x) + spx = torch.split(x, self.branch_channels, 1) + outputs = list() + outs = list() + for i in range(self.num_steps): + outputs_i = list() + outputs.append(outputs_i) + for j in range(i + 1): + if j == 0: + inputs = spx[i] + else: + inputs = outputs[i][j - 1] + if i > j: + inputs = inputs + outputs[i - 1][j] + module_name = f'conv_bn_relu2_{i + 1}_{j + 1}' + module_i_j = getattr(self, module_name) + outputs[i].append(module_i_j(inputs)) + + outs.append(outputs[i][i]) + out = torch.cat(tuple(outs), 1) + out = self.conv_bn3(out) + + if self.downsample is not None: + identity = self.downsample(identity) + out = out + identity + + out = self.relu(out) + + return out + + +class Downsample_module(nn.Module): + """Downsample module for RSN. + + Args: + block (nn.Module): Downsample block. + num_blocks (list): Number of blocks in each downsample unit. + num_units (int): Numbers of downsample units. Default: 4 + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_steps (int): Number of steps in a block. Default:4 + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the input feature to + downsample module. Default: 64 + expand_times (int): Times by which the in_channels are expanded. + Default:26. 
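The calling convention is worth spelling out: with `has_skip=False` (the first RSN stage) the two skip inputs are simply passed as `None`, and the stage features come back deepest first. A sketch, assuming `mmpose` is importable:

```python
import torch
from mmpose.models.backbones.rsn import RSB, Downsample_module

# First-stage configuration: has_skip=False, so both skip inputs are None.
down = Downsample_module(RSB, num_blocks=[2, 2, 2, 2])
feats = down(torch.randn(1, 64, 56, 56), None, None)
for f in feats:          # lowest-resolution (deepest) stage comes first
    print(tuple(f.shape))
# (1, 512, 7, 7)
# (1, 256, 14, 14)
# (1, 128, 28, 28)
# (1, 64, 56, 56)
```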
+ """ + + def __init__(self, + block, + num_blocks, + num_steps=4, + num_units=4, + has_skip=False, + norm_cfg=dict(type='BN'), + in_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.has_skip = has_skip + self.in_channels = in_channels + assert len(num_blocks) == num_units + self.num_blocks = num_blocks + self.num_units = num_units + self.num_steps = num_steps + self.norm_cfg = norm_cfg + self.layer1 = self._make_layer( + block, + in_channels, + num_blocks[0], + expand_times=expand_times, + res_top_channels=in_channels) + for i in range(1, num_units): + module_name = f'layer{i + 1}' + self.add_module( + module_name, + self._make_layer( + block, + in_channels * pow(2, i), + num_blocks[i], + stride=2, + expand_times=expand_times, + res_top_channels=in_channels)) + + def _make_layer(self, + block, + out_channels, + blocks, + stride=1, + expand_times=26, + res_top_channels=64): + downsample = None + if stride != 1 or self.in_channels != out_channels * block.expansion: + downsample = ConvModule( + self.in_channels, + out_channels * block.expansion, + kernel_size=1, + stride=stride, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + units = list() + units.append( + block( + self.in_channels, + out_channels, + num_steps=self.num_steps, + stride=stride, + downsample=downsample, + norm_cfg=self.norm_cfg, + expand_times=expand_times, + res_top_channels=res_top_channels)) + self.in_channels = out_channels * block.expansion + for _ in range(1, blocks): + units.append( + block( + self.in_channels, + out_channels, + num_steps=self.num_steps, + expand_times=expand_times, + res_top_channels=res_top_channels)) + + return nn.Sequential(*units) + + def forward(self, x, skip1, skip2): + out = list() + for i in range(self.num_units): + module_name = f'layer{i + 1}' + module_i = getattr(self, module_name) + x = module_i(x) + if self.has_skip: + x = x + skip1[i] + skip2[i] + out.append(x) + out.reverse() + + return tuple(out) + + +class Upsample_unit(nn.Module): + """Upsample unit for upsample module. + + Args: + ind (int): Indicates whether to interpolate (>0) and whether to + generate feature map for the next hourglass-like module. + num_units (int): Number of units that form a upsample module. Along + with ind and gen_cross_conv, nm_units is used to decide whether + to generate feature map for the next hourglass-like module. + in_channels (int): Channel number of the skip-in feature maps from + the corresponding downsample unit. + unit_channels (int): Channel number in this unit. Default:256. + gen_skip: (bool): Whether or not to generate skips for the posterior + downsample module. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (in): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + ind, + num_units, + in_channels, + unit_channels=256, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.num_units = num_units + self.norm_cfg = norm_cfg + self.in_skip = ConvModule( + in_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + self.relu = nn.ReLU(inplace=True) + + self.ind = ind + if self.ind > 0: + self.up_conv = ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + act_cfg=None, + inplace=True) + + self.gen_skip = gen_skip + if self.gen_skip: + self.out_skip1 = ConvModule( + in_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.out_skip2 = ConvModule( + unit_channels, + in_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + self.gen_cross_conv = gen_cross_conv + if self.ind == num_units - 1 and self.gen_cross_conv: + self.cross_conv = ConvModule( + unit_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=self.norm_cfg, + inplace=True) + + def forward(self, x, up_x): + out = self.in_skip(x) + + if self.ind > 0: + up_x = F.interpolate( + up_x, + size=(x.size(2), x.size(3)), + mode='bilinear', + align_corners=True) + up_x = self.up_conv(up_x) + out = out + up_x + out = self.relu(out) + + skip1 = None + skip2 = None + if self.gen_skip: + skip1 = self.out_skip1(x) + skip2 = self.out_skip2(out) + + cross_conv = None + if self.ind == self.num_units - 1 and self.gen_cross_conv: + cross_conv = self.cross_conv(out) + + return out, skip1, skip2, cross_conv + + +class Upsample_module(nn.Module): + """Upsample module for RSN. + + Args: + unit_channels (int): Channel number in the upsample units. + Default:256. + num_units (int): Numbers of upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + out_channels (int): Number of channels of feature output by upsample + module. Must equal to in_channels of downsample module. 
Default:64 + """ + + def __init__(self, + unit_channels=256, + num_units=4, + gen_skip=False, + gen_cross_conv=False, + norm_cfg=dict(type='BN'), + out_channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.in_channels = list() + for i in range(num_units): + self.in_channels.append(RSB.expansion * out_channels * pow(2, i)) + self.in_channels.reverse() + self.num_units = num_units + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.norm_cfg = norm_cfg + for i in range(num_units): + module_name = f'up{i + 1}' + self.add_module( + module_name, + Upsample_unit( + i, + self.num_units, + self.in_channels[i], + unit_channels, + self.gen_skip, + self.gen_cross_conv, + norm_cfg=self.norm_cfg, + out_channels=64)) + + def forward(self, x): + out = list() + skip1 = list() + skip2 = list() + cross_conv = None + for i in range(self.num_units): + module_i = getattr(self, f'up{i + 1}') + if i == 0: + outi, skip1_i, skip2_i, _ = module_i(x[i], None) + elif i == self.num_units - 1: + outi, skip1_i, skip2_i, cross_conv = module_i(x[i], out[i - 1]) + else: + outi, skip1_i, skip2_i, _ = module_i(x[i], out[i - 1]) + out.append(outi) + skip1.append(skip1_i) + skip2.append(skip2_i) + skip1.reverse() + skip2.reverse() + + return out, skip1, skip2, cross_conv + + +class Single_stage_RSN(nn.Module): + """Single_stage Residual Steps Network. + + Args: + unit_channels (int): Channel number in the upsample units. Default:256. + num_units (int): Numbers of downsample/upsample units. Default: 4 + gen_skip (bool): Whether to generate skip for posterior downsample + module or not. Default:False + gen_cross_conv (bool): Whether to generate feature map for the next + hourglass-like module. Default:False + has_skip (bool): Have skip connections from prior upsample + module or not. Default:False + num_steps (int): Number of steps in RSB. Default: 4 + num_blocks (list): Number of blocks in each downsample unit. + Default: [2, 2, 2, 2] Note: Make sure num_units==len(num_blocks) + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + in_channels (int): Number of channels of the feature from ResNet_Top. + Default: 64. + expand_times (int): Times by which the in_channels are expanded in RSB. + Default:26. + """ + + def __init__(self, + has_skip=False, + gen_skip=False, + gen_cross_conv=False, + unit_channels=256, + num_units=4, + num_steps=4, + num_blocks=[2, 2, 2, 2], + norm_cfg=dict(type='BN'), + in_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + assert len(num_blocks) == num_units + self.has_skip = has_skip + self.gen_skip = gen_skip + self.gen_cross_conv = gen_cross_conv + self.num_units = num_units + self.num_steps = num_steps + self.unit_channels = unit_channels + self.num_blocks = num_blocks + self.norm_cfg = norm_cfg + + self.downsample = Downsample_module(RSB, num_blocks, num_steps, + num_units, has_skip, norm_cfg, + in_channels, expand_times) + self.upsample = Upsample_module(unit_channels, num_units, gen_skip, + gen_cross_conv, norm_cfg, in_channels) + + def forward(self, x, skip1, skip2): + mid = self.downsample(x, skip1, skip2) + out, skip1, skip2, cross_conv = self.upsample(mid) + + return out, skip1, skip2, cross_conv + + +class ResNet_top(nn.Module): + """ResNet top for RSN. + + Args: + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + channels (int): Number of channels of the feature output by ResNet_top. + """ + + def __init__(self, norm_cfg=dict(type='BN'), channels=64): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.top = nn.Sequential( + ConvModule( + 3, + channels, + kernel_size=7, + stride=2, + padding=3, + norm_cfg=norm_cfg, + inplace=True), MaxPool2d(kernel_size=3, stride=2, padding=1)) + + def forward(self, img): + return self.top(img) + + +@BACKBONES.register_module() +class RSN(BaseBackbone): + """Residual Steps Network backbone. Paper ref: Cai et al. "Learning + Delicate Local Representations for Multi-Person Pose Estimation" (ECCV + 2020). + + Args: + unit_channels (int): Number of Channels in an upsample unit. + Default: 256 + num_stages (int): Number of stages in a multi-stage RSN. Default: 4 + num_units (int): NUmber of downsample/upsample units in a single-stage + RSN. Default: 4 Note: Make sure num_units == len(self.num_blocks) + num_blocks (list): Number of RSBs (Residual Steps Block) in each + downsample unit. Default: [2, 2, 2, 2] + num_steps (int): Number of steps in a RSB. Default:4 + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + res_top_channels (int): Number of channels of feature from ResNet_top. + Default: 64. + expand_times (int): Times by which the in_channels are expanded in RSB. + Default:26. + Example: + >>> from mmpose.models import RSN + >>> import torch + >>> self = RSN(num_stages=2,num_units=2,num_blocks=[2,2]) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... for feature in level_output: + ... print(tuple(feature.shape)) + ... 
+ (1, 256, 64, 64) + (1, 256, 128, 128) + (1, 256, 64, 64) + (1, 256, 128, 128) + """ + + def __init__(self, + unit_channels=256, + num_stages=4, + num_units=4, + num_blocks=[2, 2, 2, 2], + num_steps=4, + norm_cfg=dict(type='BN'), + res_top_channels=64, + expand_times=26): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + num_blocks = cp.deepcopy(num_blocks) + super().__init__() + self.unit_channels = unit_channels + self.num_stages = num_stages + self.num_units = num_units + self.num_blocks = num_blocks + self.num_steps = num_steps + self.norm_cfg = norm_cfg + + assert self.num_stages > 0 + assert self.num_steps > 1 + assert self.num_units > 1 + assert self.num_units == len(self.num_blocks) + self.top = ResNet_top(norm_cfg=norm_cfg) + self.multi_stage_rsn = nn.ModuleList([]) + for i in range(self.num_stages): + if i == 0: + has_skip = False + else: + has_skip = True + if i != self.num_stages - 1: + gen_skip = True + gen_cross_conv = True + else: + gen_skip = False + gen_cross_conv = False + self.multi_stage_rsn.append( + Single_stage_RSN(has_skip, gen_skip, gen_cross_conv, + unit_channels, num_units, num_steps, + num_blocks, norm_cfg, res_top_channels, + expand_times)) + + def forward(self, x): + """Model forward function.""" + out_feats = [] + skip1 = None + skip2 = None + x = self.top(x) + for i in range(self.num_stages): + out, skip1, skip2, x = self.multi_stage_rsn[i](x, skip1, skip2) + out_feats.append(out) + + return out_feats + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + for m in self.multi_stage_rsn.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + for m in self.top.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) diff --git a/main/transformer_utils/mmpose/models/backbones/scnet.py b/main/transformer_utils/mmpose/models/backbones/scnet.py new file mode 100644 index 0000000000000000000000000000000000000000..3786c5731d685638cfa64a83e5d4a5e2eee545de --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/scnet.py @@ -0,0 +1,248 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import Bottleneck, ResNet + + +class SCConv(nn.Module): + """SCConv (Self-calibrated Convolution) + + Args: + in_channels (int): The input channels of the SCConv. + out_channels (int): The output channel of the SCConv. + stride (int): stride of SCConv. + pooling_r (int): size of pooling for scconv. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + in_channels, + out_channels, + stride, + pooling_r, + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.1)): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + + assert in_channels == out_channels + + self.k2 = nn.Sequential( + nn.AvgPool2d(kernel_size=pooling_r, stride=pooling_r), + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(norm_cfg, in_channels)[1], + ) + self.k3 = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(norm_cfg, in_channels)[1], + ) + self.k4 = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + in_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1], + nn.ReLU(inplace=True), + ) + + def forward(self, x): + """Forward function.""" + identity = x + + out = torch.sigmoid( + torch.add(identity, F.interpolate(self.k2(x), + identity.size()[2:]))) + out = torch.mul(self.k3(x), out) + out = self.k4(out) + + return out + + +class SCBottleneck(Bottleneck): + """SC(Self-calibrated) Bottleneck. + + Args: + in_channels (int): The input channels of the SCBottleneck block. + out_channels (int): The output channel of the SCBottleneck block. + """ + + pooling_r = 4 + + def __init__(self, in_channels, out_channels, **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.mid_channels = out_channels // self.expansion // 2 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm1_name, norm1) + + self.k1 = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.stride, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, self.mid_channels)[1], + nn.ReLU(inplace=True)) + + self.conv2 = build_conv_layer( + self.conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm2_name, norm2) + + self.scconv = SCConv(self.mid_channels, self.mid_channels, self.stride, + self.pooling_r, self.conv_cfg, self.norm_cfg) + + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels * 2, + out_channels, + kernel_size=1, + stride=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out_a = self.conv1(x) + out_a = self.norm1(out_a) + out_a = self.relu(out_a) + + out_a = self.k1(out_a) + + out_b = self.conv2(x) + out_b = self.norm2(out_b) + out_b = self.relu(out_b) + + out_b = self.scconv(out_b) + + out = self.conv3(torch.cat([out_a, out_b], dim=1)) + out = self.norm3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class SCNet(ResNet): + """SCNet backbone. 
+ + Improving Convolutional Networks with Self-Calibrated Convolutions, + Jiang-Jiang Liu, Qibin Hou, Ming-Ming Cheng, Changhu Wang, Jiashi Feng, + IEEE CVPR, 2020. + http://mftp.mmcheng.net/Papers/20cvprSCNet.pdf + + Args: + depth (int): Depth of scnet, from {50, 101}. + in_channels (int): Number of input image channels. Normally 3. + base_channels (int): Number of base channels of hidden layer. + num_stages (int): SCNet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from mmpose.models import SCNet + >>> import torch + >>> self = SCNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SCBottleneck, [3, 4, 6, 3]), + 101: (SCBottleneck, [3, 4, 23, 3]) + } + + def __init__(self, depth, **kwargs): + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for SCNet') + super().__init__(depth, **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/seresnet.py b/main/transformer_utils/mmpose/models/backbones/seresnet.py new file mode 100644 index 0000000000000000000000000000000000000000..ac2d53b40a4593bce96d5c7c3bb4e06d38353d0b --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/seresnet.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.utils.checkpoint as cp + +from ..builder import BACKBONES +from .resnet import Bottleneck, ResLayer, ResNet +from .utils.se_layer import SELayer + + +class SEBottleneck(Bottleneck): + """SEBottleneck block for SEResNet. + + Args: + in_channels (int): The input channels of the SEBottleneck block. + out_channels (int): The output channel of the SEBottleneck block. + se_ratio (int): Squeeze ratio in SELayer. 
Default: 16 + """ + + def __init__(self, in_channels, out_channels, se_ratio=16, **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.se_layer = SELayer(out_channels, ratio=se_ratio) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + out = self.se_layer(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class SEResNet(ResNet): + """SEResNet backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + se_ratio (int): Squeeze ratio in SELayer. Default: 16. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import SEResNet + >>> import torch + >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SEBottleneck, (3, 4, 6, 3)), + 101: (SEBottleneck, (3, 4, 23, 3)), + 152: (SEBottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, se_ratio=16, **kwargs): + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for SEResNet') + self.se_ratio = se_ratio + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer(se_ratio=self.se_ratio, **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/seresnext.py b/main/transformer_utils/mmpose/models/backbones/seresnext.py new file mode 100644 index 0000000000000000000000000000000000000000..c5c4e4ce03684f8a9bd0c6166969c01bace54bd2 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/seresnext.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import ResLayer +from .seresnet import SEBottleneck as _SEBottleneck +from .seresnet import SEResNet + + +class SEBottleneck(_SEBottleneck): + """SEBottleneck block for SEResNeXt. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + base_channels (int): Middle channels of the first stage. Default: 64. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None + se_ratio (int): Squeeze ratio in SELayer. Default: 16 + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + """ + + def __init__(self, + in_channels, + out_channels, + base_channels=64, + groups=32, + width_per_group=4, + se_ratio=16, + **kwargs): + super().__init__(in_channels, out_channels, se_ratio, **kwargs) + self.groups = groups + self.width_per_group = width_per_group + + # We follow the same rational of ResNext to compute mid_channels. + # For SEResNet bottleneck, middle channels are determined by expansion + # and out_channels, but for SEResNeXt bottleneck, it is determined by + # groups and width_per_group and the stage it is located in. 
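For intuition, a minimal squeeze-and-excitation gate playing the role of the `SELayer` that `SEBottleneck.forward` applies before the residual addition. `TinySE` is a hypothetical helper written for illustration, not the mmpose `SELayer` implementation:

```python
import torch
import torch.nn as nn

# Squeeze-and-excitation in miniature: global average pool ("squeeze"),
# a bottleneck MLP shrunk by `se_ratio`, a sigmoid ("excitation"),
# then channel-wise rescaling of the feature map.
class TinySE(nn.Module):
    def __init__(self, channels, se_ratio=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // se_ratio),
            nn.ReLU(inplace=True),
            nn.Linear(channels // se_ratio, channels),
            nn.Sigmoid())

    def forward(self, x):
        b, c, _, _ = x.shape
        w = self.fc(x.mean(dim=(2, 3))).view(b, c, 1, 1)
        return x * w

print(TinySE(256)(torch.randn(2, 256, 14, 14)).shape)  # torch.Size([2, 256, 14, 14])
```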
+ if groups != 1: + assert self.mid_channels % base_channels == 0 + self.mid_channels = ( + groups * width_per_group * self.mid_channels // base_channels) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.out_channels, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + self.mid_channels, + self.out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class SEResNeXt(SEResNet): + """SEResNeXt backbone. + + Please refer to the `paper `__ for + details. + + Args: + depth (int): Network depth, from {50, 101, 152}. + groups (int): Groups of conv2 in Bottleneck. Default: 32. + width_per_group (int): Width per group of conv2 in Bottleneck. + Default: 4. + se_ratio (int): Squeeze ratio in SELayer. Default: 16. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Output channels of the stem layer. Default: 64. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + + Example: + >>> from mmpose.models import SEResNeXt + >>> import torch + >>> self = SEResNet(depth=50, out_indices=(0, 1, 2, 3)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 224, 224) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
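A quick worked example of the ResNeXt-style width rule above, assuming a 32x4d configuration and the stage whose `out_channels` is 256 (with the usual Bottleneck expansion of 4):

```python
# mid_channels starts at the plain SEResNet width, then is re-derived from
# groups and width_per_group so conv2 runs grouped convolutions.
groups, width_per_group, base_channels = 32, 4, 64
mid_channels = 256 // 4                 # SEResNet width for this stage: 64
mid_channels = groups * width_per_group * mid_channels // base_channels
print(mid_channels)                     # 128 -> conv2 uses 32 groups of 4 channels each
```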
print(tuple(level_out.shape)) + (1, 256, 56, 56) + (1, 512, 28, 28) + (1, 1024, 14, 14) + (1, 2048, 7, 7) + """ + + arch_settings = { + 50: (SEBottleneck, (3, 4, 6, 3)), + 101: (SEBottleneck, (3, 4, 23, 3)), + 152: (SEBottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, groups=32, width_per_group=4, **kwargs): + self.groups = groups + self.width_per_group = width_per_group + super().__init__(depth, **kwargs) + + def make_res_layer(self, **kwargs): + return ResLayer( + groups=self.groups, + width_per_group=self.width_per_group, + base_channels=self.base_channels, + **kwargs) diff --git a/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py b/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..9f98cbd2132250ec13adcce6e642c966b0dbd7cc --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/shufflenet_v1.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import (ConvModule, build_activation_layer, constant_init, + normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import channel_shuffle, load_checkpoint, make_divisible + + +class ShuffleUnit(nn.Module): + """ShuffleUnit block. + + ShuffleNet unit with pointwise group convolution (GConv) and channel + shuffle. + + Args: + in_channels (int): The input channels of the ShuffleUnit. + out_channels (int): The output channels of the ShuffleUnit. + groups (int, optional): The number of groups to be used in grouped 1x1 + convolutions in each ShuffleUnit. Default: 3 + first_block (bool, optional): Whether it is the first ShuffleUnit of a + sequential ShuffleUnits. Default: True, which means not using the + grouped 1x1 convolution. + combine (str, optional): The ways to combine the input and output + branches. Default: 'add'. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + groups=3, + first_block=True, + combine='add', + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.first_block = first_block + self.combine = combine + self.groups = groups + self.bottleneck_channels = self.out_channels // 4 + self.with_cp = with_cp + + if self.combine == 'add': + self.depthwise_stride = 1 + self._combine_func = self._add + assert in_channels == out_channels, ( + 'in_channels must be equal to out_channels when combine ' + 'is add') + elif self.combine == 'concat': + self.depthwise_stride = 2 + self._combine_func = self._concat + self.out_channels -= self.in_channels + self.avgpool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) + else: + raise ValueError(f'Cannot combine tensors with {self.combine}. 
' + 'Only "add" and "concat" are supported') + + self.first_1x1_groups = 1 if first_block else self.groups + self.g_conv_1x1_compress = ConvModule( + in_channels=self.in_channels, + out_channels=self.bottleneck_channels, + kernel_size=1, + groups=self.first_1x1_groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.depthwise_conv3x3_bn = ConvModule( + in_channels=self.bottleneck_channels, + out_channels=self.bottleneck_channels, + kernel_size=3, + stride=self.depthwise_stride, + padding=1, + groups=self.bottleneck_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.g_conv_1x1_expand = ConvModule( + in_channels=self.bottleneck_channels, + out_channels=self.out_channels, + kernel_size=1, + groups=self.groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.act = build_activation_layer(act_cfg) + + @staticmethod + def _add(x, out): + # residual connection + return x + out + + @staticmethod + def _concat(x, out): + # concatenate along channel axis + return torch.cat((x, out), 1) + + def forward(self, x): + + def _inner_forward(x): + residual = x + + out = self.g_conv_1x1_compress(x) + out = self.depthwise_conv3x3_bn(out) + + if self.groups > 1: + out = channel_shuffle(out, self.groups) + + out = self.g_conv_1x1_expand(out) + + if self.combine == 'concat': + residual = self.avgpool(residual) + out = self.act(out) + out = self._combine_func(residual, out) + else: + out = self._combine_func(residual, out) + out = self.act(out) + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class ShuffleNetV1(BaseBackbone): + """ShuffleNetV1 backbone. + + Args: + groups (int, optional): The number of groups to be used in grouped 1x1 + convolutions in each ShuffleUnit. Default: 3. + widen_factor (float, optional): Width multiplier - adjusts the number + of channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (2, ) + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + groups=3, + widen_factor=1.0, + out_indices=(2, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stage_blocks = [4, 8, 4] + self.groups = groups + + for index in out_indices: + if index not in range(0, 3): + raise ValueError('the item in out_indices must in ' + f'range(0, 3). But received {index}') + + if frozen_stages not in range(-1, 3): + raise ValueError('frozen_stages must be in range(-1, 3). 
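The key operation in `ShuffleUnit.forward` is the channel shuffle between the grouped 1x1 convolutions. A from-scratch equivalent in spirit to the `channel_shuffle` helper used above (toy tensor sizes, chosen so the permutation is visible):

```python
import torch

# Reshape to (B, groups, C//groups, H, W), swap the two channel axes, and
# flatten back, so information can flow across groups in later grouped convs.
def toy_channel_shuffle(x, groups):
    b, c, h, w = x.shape
    x = x.view(b, groups, c // groups, h, w).transpose(1, 2).contiguous()
    return x.view(b, c, h, w)

x = torch.arange(6, dtype=torch.float32).view(1, 6, 1, 1)
print(toy_channel_shuffle(x, groups=3).flatten().tolist())
# [0.0, 2.0, 4.0, 1.0, 3.0, 5.0]
```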
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + if groups == 1: + channels = (144, 288, 576) + elif groups == 2: + channels = (200, 400, 800) + elif groups == 3: + channels = (240, 480, 960) + elif groups == 4: + channels = (272, 544, 1088) + elif groups == 8: + channels = (384, 768, 1536) + else: + raise ValueError(f'{groups} groups is not supported for 1x1 ' + 'Grouped Convolutions') + + channels = [make_divisible(ch * widen_factor, 8) for ch in channels] + + self.in_channels = int(24 * widen_factor) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layers = nn.ModuleList() + for i, num_blocks in enumerate(self.stage_blocks): + first_block = (i == 0) + layer = self.make_layer(channels[i], num_blocks, first_block) + self.layers.append(layer) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(self.frozen_stages): + layer = self.layers[i] + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'conv1' in name: + normal_init(m, mean=0, std=0.01) + else: + normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, val=1, bias=0.0001) + if isinstance(m, _BatchNorm): + if m.running_mean is not None: + nn.init.constant_(m.running_mean, 0) + else: + raise TypeError('pretrained must be a str or None. But received ' + f'{type(pretrained)}') + + def make_layer(self, out_channels, num_blocks, first_block=False): + """Stack ShuffleUnit blocks to make a layer. + + Args: + out_channels (int): out_channels of the block. + num_blocks (int): Number of blocks. + first_block (bool, optional): Whether is the first ShuffleUnit of a + sequential ShuffleUnits. Default: False, which means using + the grouped 1x1 convolution. 
+ """ + layers = [] + for i in range(num_blocks): + first_block = first_block if i == 0 else False + combine_mode = 'concat' if i == 0 else 'add' + layers.append( + ShuffleUnit( + self.in_channels, + out_channels, + groups=self.groups, + first_block=first_block, + combine=combine_mode, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py b/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..e93533367afe4efa01fa67d14cafcca006c990e8 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/shufflenet_v2.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, constant_init, normal_init +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import channel_shuffle, load_checkpoint + + +class InvertedResidual(nn.Module): + """InvertedResidual block for ShuffleNetV2 backbone. + + Args: + in_channels (int): The input channels of the block. + out_channels (int): The output channels of the block. + stride (int): Stride of the 3x3 convolution layer. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stride = stride + self.with_cp = with_cp + + branch_features = out_channels // 2 + if self.stride == 1: + assert in_channels == branch_features * 2, ( + f'in_channels ({in_channels}) should equal to ' + f'branch_features * 2 ({branch_features * 2}) ' + 'when stride is 1') + + if in_channels != branch_features * 2: + assert self.stride != 1, ( + f'stride ({self.stride}) should not equal 1 when ' + f'in_channels != branch_features * 2') + + if self.stride > 1: + self.branch1 = nn.Sequential( + ConvModule( + in_channels, + in_channels, + kernel_size=3, + stride=self.stride, + padding=1, + groups=in_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + in_channels, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + + self.branch2 = nn.Sequential( + ConvModule( + in_channels if (self.stride > 1) else branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + branch_features, + branch_features, + kernel_size=3, + stride=self.stride, + padding=1, + groups=branch_features, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + branch_features, + branch_features, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + + def _inner_forward(x): + if self.stride > 1: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + else: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +@BACKBONES.register_module() +class ShuffleNetV2(BaseBackbone): + """ShuffleNetV2 backbone. + + Args: + widen_factor (float): Width multiplier - adjusts the number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (0, 1, 2, 3). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
+ """ + + def __init__(self, + widen_factor=1.0, + out_indices=(3, ), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.stage_blocks = [4, 8, 4] + for index in out_indices: + if index not in range(0, 4): + raise ValueError('the item in out_indices must in ' + f'range(0, 4). But received {index}') + + if frozen_stages not in range(-1, 4): + raise ValueError('frozen_stages must be in range(-1, 4). ' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + if widen_factor == 0.5: + channels = [48, 96, 192, 1024] + elif widen_factor == 1.0: + channels = [116, 232, 464, 1024] + elif widen_factor == 1.5: + channels = [176, 352, 704, 1024] + elif widen_factor == 2.0: + channels = [244, 488, 976, 2048] + else: + raise ValueError('widen_factor must be in [0.5, 1.0, 1.5, 2.0]. ' + f'But received {widen_factor}') + + self.in_channels = 24 + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layers = nn.ModuleList() + for i, num_blocks in enumerate(self.stage_blocks): + layer = self._make_layer(channels[i], num_blocks) + self.layers.append(layer) + + output_channels = channels[-1] + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=output_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def _make_layer(self, out_channels, num_blocks): + """Stack blocks to make a layer. + + Args: + out_channels (int): out_channels of the block. + num_blocks (int): number of blocks. + """ + layers = [] + for i in range(num_blocks): + stride = 2 if i == 0 else 1 + layers.append( + InvertedResidual( + in_channels=self.in_channels, + out_channels=out_channels, + stride=stride, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'conv1' in name: + normal_init(m, mean=0, std=0.01) + else: + normal_init(m, mean=0, std=1.0 / m.weight.shape[1]) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m.weight, val=1, bias=0.0001) + if isinstance(m, _BatchNorm): + if m.running_mean is not None: + nn.init.constant_(m.running_mean, 0) + else: + raise TypeError('pretrained must be a str or None. 
But received ' + f'{type(pretrained)}') + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/swin.py b/main/transformer_utils/mmpose/models/backbones/swin.py new file mode 100644 index 0000000000000000000000000000000000000000..2449cdca591bc0bbf601295bde11efe834b49f8a --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/swin.py @@ -0,0 +1,733 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_norm_layer, constant_init, trunc_normal_init +from mmcv.cnn.bricks.transformer import FFN, build_dropout +from mmcv.cnn.utils.weight_init import trunc_normal_ +from mmcv.runner import _load_checkpoint +from mmcv.utils import to_2tuple + +from ...utils import get_root_logger +from ..builder import BACKBONES +from ..utils.transformer import PatchEmbed, PatchMerging +from .base_backbone import BaseBackbone +from .utils.ckpt_convert import swin_converter + + +class WindowMSA(nn.Module): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0.): + + super().__init__() + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # About 2x faster than original impl + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def init_weights(self): + trunc_normal_(self.relative_position_bias_table, std=0.02) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (num_windows*B, N, C) + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. + """ + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + # make torchscript happy (cannot use tensor as tuple) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +class ShiftWindowMSA(nn.Module): + """Shifted Window Multihead Self-Attention Module. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): The height and width of the window. + shift_size (int, optional): The shift step of each window towards + right-bottom. If zero, act as regular window-msa. Defaults to 0. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Defaults: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Defaults: 0. + proj_drop_rate (float, optional): Dropout ratio of output. + Defaults: 0. + dropout_layer (dict, optional): The dropout_layer used before output. + Defaults: dict(type='DropPath', drop_prob=0.). 
+ """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + shift_size=0, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0, + proj_drop_rate=0, + dropout_layer=dict(type='DropPath', drop_prob=0.)): + super().__init__() + + self.window_size = window_size + self.shift_size = shift_size + assert 0 <= self.shift_size < self.window_size + + self.w_msa = WindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=to_2tuple(window_size), + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate) + + self.drop = build_dropout(dropout_layer) + + def forward(self, query, hw_shape): + B, L, C = query.shape + H, W = hw_shape + assert L == H * W, 'input feature has wrong size' + query = query.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) + H_pad, W_pad = query.shape[1], query.shape[2] + + # cyclic shift + if self.shift_size > 0: + shifted_query = torch.roll( + query, + shifts=(-self.shift_size, -self.shift_size), + dims=(1, 2)) + + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device) + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + # nW, window_size, window_size, 1 + mask_windows = self.window_partition(img_mask) + mask_windows = mask_windows.view( + -1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + else: + shifted_query = query + attn_mask = None + + # nW*B, window_size, window_size, C + query_windows = self.window_partition(shifted_query) + # nW*B, window_size*window_size, C + query_windows = query_windows.view(-1, self.window_size**2, C) + + # W-MSA/SW-MSA (nW*B, window_size*window_size, C) + attn_windows = self.w_msa(query_windows, mask=attn_mask) + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + + # B H' W' C + shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + x = self.drop(x) + return x + + def window_reverse(self, windows, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + window_size = self.window_size + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + def window_partition(self, x): + """ + Args: + x: (B, H, W, C) + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + window_size = self.window_size + x = x.view(B, H // window_size, window_size, W 
// window_size, + window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() + windows = windows.view(-1, window_size, window_size, C) + return windows + + +class SwinBlock(nn.Module): + """" + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + window_size (int, optional): The local window scale. Default: 7. + shift (bool, optional): whether to shift window or not. Default False. + qkv_bias (bool, optional): enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. Default: 0. + drop_path_rate (float, optional): Stochastic depth rate. Default: 0. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + window_size=7, + shift=False, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False): + + super(SwinBlock, self).__init__() + + self.with_cp = with_cp + + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + self.attn = ShiftWindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=window_size, + shift_size=window_size // 2 if shift else 0, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate)) + + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=2, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg, + add_identity=True, + init_cfg=None) + + def forward(self, x, hw_shape): + + def _inner_forward(x): + identity = x + x = self.norm1(x) + x = self.attn(x, hw_shape) + + x = x + identity + + identity = x + x = self.norm2(x) + x = self.ffn(x, identity=identity) + + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + + return x + + +class SwinBlockSequence(nn.Module): + """Implements one stage in Swin Transformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + depth (int): The number of blocks in this stage. + window_size (int, optional): The local window scale. Default: 7. + qkv_bias (bool, optional): enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. Default: 0. + drop_path_rate (float | list[float], optional): Stochastic depth + rate. Default: 0. + downsample (nn.Module | None, optional): The downsample operation + module. Default: None. + act_cfg (dict, optional): The config dict of activation function. 
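For reference, a minimal window partition matching the view/permute used by `ShiftWindowMSA` above, preceded by the cyclic shift. The sizes (an 8x8 map, window 4) are toy values chosen so the map divides evenly and no padding is needed:

```python
import torch

# Split a (B, H, W, C) feature map into non-overlapping ws x ws windows.
def window_partition(x, ws):
    B, H, W, C = x.shape
    x = x.view(B, H // ws, ws, W // ws, ws, C)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, ws, ws, C)

x = torch.randn(1, 8, 8, 3)                            # (B, H, W, C)
shifted = torch.roll(x, shifts=(-2, -2), dims=(1, 2))  # shift_size = ws // 2
windows = window_partition(shifted, ws=4)
print(windows.shape)                                   # torch.Size([4, 4, 4, 3])
```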
+ Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + depth, + window_size=7, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + downsample=None, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False): + super().__init__() + + if isinstance(drop_path_rate, list): + drop_path_rates = drop_path_rate + assert len(drop_path_rates) == depth + else: + drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)] + + self.blocks = nn.ModuleList() + for i in range(depth): + block = SwinBlock( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=feedforward_channels, + window_size=window_size, + shift=False if i % 2 == 0 else True, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rates[i], + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp) + self.blocks.append(block) + + self.downsample = downsample + + def forward(self, x, hw_shape): + for block in self.blocks: + x = block(x, hw_shape) + + if self.downsample: + x_down, down_hw_shape = self.downsample(x, hw_shape) + return x_down, down_hw_shape, x, hw_shape + else: + return x, hw_shape, x, hw_shape + + +@BACKBONES.register_module() +class SwinTransformer(BaseBackbone): + """ Swin Transformer + A PyTorch implement of : `Swin Transformer: + Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/abs/2103.14030 + + Inspiration from + https://github.com/microsoft/Swin-Transformer + + Args: + pretrain_img_size (int | tuple[int]): The size of input image when + pretrain. Defaults: 224. + in_channels (int): The num of input channels. + Defaults: 3. + embed_dims (int): The feature dimension. Default: 96. + patch_size (int | tuple[int]): Patch size. Default: 4. + window_size (int): Window size. Default: 7. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. + Default: 4. + depths (tuple[int]): Depths of each Swin Transformer stage. + Default: (2, 2, 6, 2). + num_heads (tuple[int]): Parallel attention heads of each Swin + Transformer stage. Default: (3, 6, 12, 24). + strides (tuple[int]): The patch merging or patch embedding stride of + each Swin Transformer stage. (In swin, we set kernel size equal to + stride.) Default: (4, 2, 2, 2). + out_indices (tuple[int]): Output from which stages. + Default: (0, 1, 2, 3). + qkv_bias (bool, optional): If True, add a learnable bias to query, key, + value. Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + patch_norm (bool): If add a norm layer for patch embed and patch + merging. Default: True. + drop_rate (float): Dropout rate. Defaults: 0. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults: False. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LN'). + norm_cfg (dict): Config dict for normalization layer at + output of backone. Defaults: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. 
Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + pretrained (str, optional): model pretrained path. Default: None. + convert_weights (bool): The flag indicates whether the + pre-trained model is from the original repo. We may need + to convert some keys to make it compatible. + Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + Default: -1 (-1 means not freezing any parameters). + """ + + def __init__( + self, + pretrain_img_size=224, + in_channels=3, + embed_dims=96, + patch_size=4, + window_size=7, + mlp_ratio=4, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + strides=(4, 2, 2, 2), + out_indices=(0, 1, 2, 3), + qkv_bias=True, + qk_scale=None, + patch_norm=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + use_abs_pos_embed=False, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + convert_weights=False, + frozen_stages=-1, + ): + self.convert_weights = convert_weights + self.frozen_stages = frozen_stages + if isinstance(pretrain_img_size, int): + pretrain_img_size = to_2tuple(pretrain_img_size) + elif isinstance(pretrain_img_size, tuple): + if len(pretrain_img_size) == 1: + pretrain_img_size = to_2tuple(pretrain_img_size[0]) + assert len(pretrain_img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pretrain_img_size)}' + + super(SwinTransformer, self).__init__() + + num_layers = len(depths) + self.out_indices = out_indices + self.use_abs_pos_embed = use_abs_pos_embed + + assert strides[0] == patch_size, 'Use non-overlapping patch embed.' + + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=strides[0], + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None) + + if self.use_abs_pos_embed: + patch_row = pretrain_img_size[0] // patch_size + patch_col = pretrain_img_size[1] // patch_size + num_patches = patch_row * patch_col + self.absolute_pos_embed = nn.Parameter( + torch.zeros((1, num_patches, embed_dims))) + + self.drop_after_pos = nn.Dropout(p=drop_rate) + + # set stochastic depth decay rule + total_depth = sum(depths) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, total_depth) + ] + + self.stages = nn.ModuleList() + in_channels = embed_dims + for i in range(num_layers): + if i < num_layers - 1: + downsample = PatchMerging( + in_channels=in_channels, + out_channels=2 * in_channels, + stride=strides[i + 1], + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None) + else: + downsample = None + + stage = SwinBlockSequence( + embed_dims=in_channels, + num_heads=num_heads[i], + feedforward_channels=mlp_ratio * in_channels, + depth=depths[i], + window_size=window_size, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], + downsample=downsample, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp) + self.stages.append(stage) + if downsample: + in_channels = downsample.out_channels + + self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] + # Add a norm layer for each output + for i in out_indices: + layer = build_norm_layer(norm_cfg, self.num_features[i])[1] + layer_name = f'norm{i}' + self.add_module(layer_name, layer) + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + 
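The stochastic-depth schedule built in `__init__` above is easy to inspect in isolation: `drop_path_rate` grows linearly from 0 to its maximum over all blocks, and each stage receives its contiguous slice. Shown here for the default Swin-T configuration:

```python
import torch

depths, drop_path_rate = (2, 2, 6, 2), 0.1
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
for i in range(len(depths)):
    stage_rates = dpr[sum(depths[:i]):sum(depths[:i + 1])]
    print(i, [round(r, 3) for r in stage_rates])
# 0 [0.0, 0.009]
# 1 [0.018, 0.027]
# 2 [0.036, 0.045, 0.055, 0.064, 0.073, 0.082]
# 3 [0.091, 0.1]
```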
self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + if self.use_abs_pos_embed: + self.absolute_pos_embed.requires_grad = False + self.drop_after_pos.eval() + + for i in range(1, self.frozen_stages + 1): + + if (i - 1) in self.out_indices: + norm_layer = getattr(self, f'norm{i-1}') + norm_layer.eval() + for param in norm_layer.parameters(): + param.requires_grad = False + + m = self.stages[i - 1] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + ckpt = _load_checkpoint( + pretrained, logger=logger, map_location='cpu') + if 'state_dict' in ckpt: + _state_dict = ckpt['state_dict'] + elif 'model' in ckpt: + _state_dict = ckpt['model'] + else: + _state_dict = ckpt + if self.convert_weights: + # supported loading weight from original repo, + _state_dict = swin_converter(_state_dict) + + state_dict = OrderedDict() + for k, v in _state_dict.items(): + if k.startswith('backbone.'): + state_dict[k[9:]] = v + + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith('module.'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # reshape absolute position embedding + if state_dict.get('absolute_pos_embed') is not None: + absolute_pos_embed = state_dict['absolute_pos_embed'] + N1, L, C1 = absolute_pos_embed.size() + N2, C2, H, W = self.absolute_pos_embed.size() + if N1 != N2 or C1 != C2 or L != H * W: + logger.warning('Error in loading absolute_pos_embed, pass') + else: + state_dict['absolute_pos_embed'] = absolute_pos_embed.view( + N2, H, W, C2).permute(0, 3, 1, 2).contiguous() + + # interpolate position bias table if needed + relative_position_bias_table_keys = [ + k for k in state_dict.keys() + if 'relative_position_bias_table' in k + ] + for table_key in relative_position_bias_table_keys: + table_pretrained = state_dict[table_key] + table_current = self.state_dict()[table_key] + L1, nH1 = table_pretrained.size() + L2, nH2 = table_current.size() + if nH1 != nH2: + logger.warning(f'Error in loading {table_key}, pass') + elif L1 != L2: + S1 = int(L1**0.5) + S2 = int(L2**0.5) + table_pretrained_resized = F.interpolate( + table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + state_dict[table_key] = table_pretrained_resized.view( + nH2, L2).permute(1, 0).contiguous() + + # load state_dict + self.load_state_dict(state_dict, False) + elif pretrained is None: + if self.use_abs_pos_embed: + trunc_normal_(self.absolute_pos_embed, std=0.02) + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) 
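A sketch of the bias-table resize done in `init_weights` above when the pre-trained window size differs from the current one: the `(L1, nH)` table is viewed as an `(nH, S1, S1)` image and resized bicubically to `(S2, S2)`. The window sizes 7 and 8 here are illustrative, not values taken from any config in this repo.

```python
import torch
import torch.nn.functional as F

nH = 3
S1, S2 = 2 * 7 - 1, 2 * 8 - 1                    # 13 -> 15
table = torch.randn(S1 * S1, nH)                 # pre-trained table, L1 = 169
resized = F.interpolate(
    table.permute(1, 0).reshape(1, nH, S1, S1), size=(S2, S2), mode='bicubic')
table_new = resized.view(nH, S2 * S2).permute(1, 0)
print(table_new.shape)                           # torch.Size([225, 3])
```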
+ elif isinstance(m, nn.LayerNorm): + constant_init(m, 1.0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x, hw_shape = self.patch_embed(x) + + if self.use_abs_pos_embed: + x = x + self.absolute_pos_embed + x = self.drop_after_pos(x) + + outs = [] + for i, stage in enumerate(self.stages): + x, hw_shape, out, out_hw_shape = stage(x, hw_shape) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(out) + out = out.view(-1, *out_hw_shape, + self.num_features[i]).permute(0, 3, 1, + 2).contiguous() + outs.append(out) + + return outs diff --git a/main/transformer_utils/mmpose/models/backbones/tcformer.py b/main/transformer_utils/mmpose/models/backbones/tcformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a0805cdddd17bbba50bf203e2bc9012efd86ba03 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/tcformer.py @@ -0,0 +1,283 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from mmcv.cnn import (build_norm_layer, constant_init, normal_init, + trunc_normal_init) +from mmcv.runner import _load_checkpoint, load_state_dict + +from ...utils import get_root_logger +from ..builder import BACKBONES +from ..utils import (PatchEmbed, TCFormerDynamicBlock, TCFormerRegularBlock, + TokenConv, cluster_dpc_knn, merge_tokens, + tcformer_convert, token2map) + + +class CTM(nn.Module): + """Clustering-based Token Merging module in TCFormer. + + Args: + sample_ratio (float): The sample ratio of tokens. + embed_dim (int): Input token feature dimension. + dim_out (int): Output token feature dimension. + k (int): number of the nearest neighbor used i DPC-knn algorithm. + """ + + def __init__(self, sample_ratio, embed_dim, dim_out, k=5): + super().__init__() + self.sample_ratio = sample_ratio + self.dim_out = dim_out + self.conv = TokenConv( + in_channels=embed_dim, + out_channels=dim_out, + kernel_size=3, + stride=2, + padding=1) + self.norm = nn.LayerNorm(self.dim_out) + self.score = nn.Linear(self.dim_out, 1) + self.k = k + + def forward(self, token_dict): + token_dict = token_dict.copy() + x = self.conv(token_dict) + x = self.norm(x) + token_score = self.score(x) + token_weight = token_score.exp() + + token_dict['x'] = x + B, N, C = x.shape + token_dict['token_score'] = token_score + + cluster_num = max(math.ceil(N * self.sample_ratio), 1) + idx_cluster, cluster_num = cluster_dpc_knn(token_dict, cluster_num, + self.k) + down_dict = merge_tokens(token_dict, idx_cluster, cluster_num, + token_weight) + + H, W = token_dict['map_size'] + H = math.floor((H - 1) / 2 + 1) + W = math.floor((W - 1) / 2 + 1) + down_dict['map_size'] = [H, W] + + return down_dict, token_dict + + +@BACKBONES.register_module() +class TCFormer(nn.Module): + """Token Clustering Transformer (TCFormer) + + Implementation of `Not All Tokens Are Equal: Human-centric Visual + Analysis via Token Clustering Transformer + ` + + Args: + in_channels (int): Number of input channels. Default: 3. + embed_dims (list[int]): Embedding dimension. Default: + [64, 128, 256, 512]. + num_heads (Sequence[int]): The attention heads of each transformer + encode layer. Default: [1, 2, 5, 8]. + mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the + embedding dim of each transformer block. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. 
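Conceptually, the token merging that `CTM` delegates to `cluster_dpc_knn` and `merge_tokens` produces each output token as the weighted mean of the input tokens assigned to its cluster. A hand-rolled sketch of that reduction (the cluster assignment below is hard-coded for illustration rather than produced by DPC-kNN):

```python
import torch

B, N, C, M = 1, 6, 4, 2                          # 6 tokens merged into 2 clusters
x = torch.randn(B, N, C)
idx = torch.tensor([[0, 0, 1, 1, 1, 0]])         # cluster id per token
w = torch.rand(B, N, 1)                          # per-token importance (token_score.exp())

# Weighted sum per cluster, divided by the summed weights -> weighted mean.
num = torch.zeros(B, M, C).scatter_add_(1, idx[..., None].expand(-1, -1, C), x * w)
den = torch.zeros(B, M, 1).scatter_add_(1, idx[..., None], w)
merged = num / den.clamp(min=1e-6)
print(merged.shape)                              # torch.Size([1, 2, 4])
```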
+ drop_rate (float): Probability of an element to be zeroed. + Default 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0. + drop_path_rate (float): stochastic depth rate. Default 0. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', eps=1e-6). + num_layers (Sequence[int]): The layer number of each transformer encode + layer. Default: [3, 4, 6, 3]. + sr_ratios (Sequence[int]): The spatial reduction rate of each + transformer block. Default: [8, 4, 2, 1]. + num_stages (int): The num of stages. Default: 4. + pretrained (str, optional): model pretrained path. Default: None. + k (int): number of the nearest neighbor used for local density. + sample_ratios (list[float]): The sample ratios of CTM modules. + Default: [0.25, 0.25, 0.25] + return_map (bool): If True, transfer dynamic tokens to feature map at + last. Default: False + convert_weights (bool): The flag indicates whether the + pre-trained model is from the original repo. We may need + to convert some keys to make it compatible. + Default: True. + """ + + def __init__(self, + in_channels=3, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN', eps=1e-6), + num_layers=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + num_stages=4, + pretrained=None, + k=5, + sample_ratios=[0.25, 0.25, 0.25], + return_map=False, + convert_weights=True): + super().__init__() + + self.num_layers = num_layers + self.num_stages = num_stages + self.grid_stride = sr_ratios[0] + self.embed_dims = embed_dims + self.sr_ratios = sr_ratios + self.mlp_ratios = mlp_ratios + self.sample_ratios = sample_ratios + self.return_map = return_map + self.convert_weights = convert_weights + + # stochastic depth decay rule + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(num_layers)) + ] + cur = 0 + + # In stage 1, use the standard transformer blocks + for i in range(1): + patch_embed = PatchEmbed( + in_channels=in_channels if i == 0 else embed_dims[i - 1], + embed_dims=embed_dims[i], + kernel_size=7, + stride=4, + padding=3, + bias=True, + norm_cfg=dict(type='LN', eps=1e-6)) + + block = nn.ModuleList([ + TCFormerRegularBlock( + dim=embed_dims[i], + num_heads=num_heads[i], + mlp_ratio=mlp_ratios[i], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + j], + norm_cfg=norm_cfg, + sr_ratio=sr_ratios[i]) for j in range(num_layers[i]) + ]) + norm = build_norm_layer(norm_cfg, embed_dims[i])[1] + + cur += num_layers[i] + + setattr(self, f'patch_embed{i + 1}', patch_embed) + setattr(self, f'block{i + 1}', block) + setattr(self, f'norm{i + 1}', norm) + + # In stage 2~4, use TCFormerDynamicBlock for dynamic tokens + for i in range(1, num_stages): + ctm = CTM(sample_ratios[i - 1], embed_dims[i - 1], embed_dims[i], + k) + + block = nn.ModuleList([ + TCFormerDynamicBlock( + dim=embed_dims[i], + num_heads=num_heads[i], + mlp_ratio=mlp_ratios[i], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + j], + norm_cfg=norm_cfg, + sr_ratio=sr_ratios[i]) for j in range(num_layers[i]) + ]) + norm = build_norm_layer(norm_cfg, embed_dims[i])[1] + cur += num_layers[i] + + setattr(self, f'ctm{i}', ctm) + setattr(self, f'block{i + 1}', block) + setattr(self, f'norm{i + 1}', norm) + + self.init_weights(pretrained) + + def init_weights(self, pretrained=None): + if 
isinstance(pretrained, str): + logger = get_root_logger() + + checkpoint = _load_checkpoint( + pretrained, logger=logger, map_location='cpu') + logger.warning(f'Load pre-trained model for ' + f'{self.__class__.__name__} from original repo') + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + + if self.convert_weights: + # We need to convert pre-trained weights to match this + # implementation. + state_dict = tcformer_convert(state_dict) + load_state_dict(self, state_dict, strict=False, logger=logger) + + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init(m, 0, math.sqrt(2.0 / fan_out)) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + outs = [] + + i = 0 + patch_embed = getattr(self, f'patch_embed{i + 1}') + block = getattr(self, f'block{i + 1}') + norm = getattr(self, f'norm{i + 1}') + x, (H, W) = patch_embed(x) + for blk in block: + x = blk(x, H, W) + x = norm(x) + + # init token dict + B, N, _ = x.shape + device = x.device + idx_token = torch.arange(N)[None, :].repeat(B, 1).to(device) + agg_weight = x.new_ones(B, N, 1) + token_dict = { + 'x': x, + 'token_num': N, + 'map_size': [H, W], + 'init_grid_size': [H, W], + 'idx_token': idx_token, + 'agg_weight': agg_weight + } + outs.append(token_dict.copy()) + + # stage 2~4 + for i in range(1, self.num_stages): + ctm = getattr(self, f'ctm{i}') + block = getattr(self, f'block{i + 1}') + norm = getattr(self, f'norm{i + 1}') + + token_dict = ctm(token_dict) # down sample + for j, blk in enumerate(block): + token_dict = blk(token_dict) + + token_dict['x'] = norm(token_dict['x']) + outs.append(token_dict) + + if self.return_map: + outs = [token2map(token_dict) for token_dict in outs] + return outs diff --git a/main/transformer_utils/mmpose/models/backbones/tcn.py b/main/transformer_utils/mmpose/models/backbones/tcn.py new file mode 100644 index 0000000000000000000000000000000000000000..deca2290aeb1830bc3e241b819157369371aaf27 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/tcn.py @@ -0,0 +1,267 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import ConvModule, build_conv_layer, constant_init, kaiming_init +from mmcv.utils.parrots_wrapper import _BatchNorm + +from mmpose.core import WeightNormClipHook +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class BasicTemporalBlock(nn.Module): + """Basic block for VideoPose3D. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + mid_channels (int): The output channels of conv1. Default: 1024. + kernel_size (int): Size of the convolving kernel. Default: 3. + dilation (int): Spacing between kernel elements. Default: 3. + dropout (float): Dropout rate. Default: 0.25. + causal (bool): Use causal convolutions instead of symmetric + convolutions (for real-time applications). Default: False. + residual (bool): Use residual connection. Default: True. + use_stride_conv (bool): Use optimized TCN that designed + specifically for single-frame batching, i.e. 
where batches have + input length = receptive field, and output length = 1. This + implementation replaces dilated convolutions with strided + convolutions to avoid generating unused intermediate results. + Default: False. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: dict(type='Conv1d'). + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN1d'). + """ + + def __init__(self, + in_channels, + out_channels, + mid_channels=1024, + kernel_size=3, + dilation=3, + dropout=0.25, + causal=False, + residual=True, + use_stride_conv=False, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d')): + # Protect mutable default arguments + conv_cfg = copy.deepcopy(conv_cfg) + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.mid_channels = mid_channels + self.kernel_size = kernel_size + self.dilation = dilation + self.dropout = dropout + self.causal = causal + self.residual = residual + self.use_stride_conv = use_stride_conv + + self.pad = (kernel_size - 1) * dilation // 2 + if use_stride_conv: + self.stride = kernel_size + self.causal_shift = kernel_size // 2 if causal else 0 + self.dilation = 1 + else: + self.stride = 1 + self.causal_shift = kernel_size // 2 * dilation if causal else 0 + + self.conv1 = nn.Sequential( + ConvModule( + in_channels, + mid_channels, + kernel_size=kernel_size, + stride=self.stride, + dilation=self.dilation, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + self.conv2 = nn.Sequential( + ConvModule( + mid_channels, + out_channels, + kernel_size=1, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + + if residual and in_channels != out_channels: + self.short_cut = build_conv_layer(conv_cfg, in_channels, + out_channels, 1) + else: + self.short_cut = None + + self.dropout = nn.Dropout(dropout) if dropout > 0 else None + + def forward(self, x): + """Forward function.""" + if self.use_stride_conv: + assert self.causal_shift + self.kernel_size // 2 < x.shape[2] + else: + assert 0 <= self.pad + self.causal_shift < x.shape[2] - \ + self.pad + self.causal_shift <= x.shape[2] + + out = self.conv1(x) + if self.dropout is not None: + out = self.dropout(out) + + out = self.conv2(out) + if self.dropout is not None: + out = self.dropout(out) + + if self.residual: + if self.use_stride_conv: + res = x[:, :, self.causal_shift + + self.kernel_size // 2::self.kernel_size] + else: + res = x[:, :, + (self.pad + self.causal_shift):(x.shape[2] - self.pad + + self.causal_shift)] + + if self.short_cut is not None: + res = self.short_cut(res) + out = out + res + + return out + + +@BACKBONES.register_module() +class TCN(BaseBackbone): + """TCN backbone. + + Temporal Convolutional Networks. + More details can be found in the + `paper `__ . + + Args: + in_channels (int): Number of input channels, which equals to + num_keypoints * num_features. + stem_channels (int): Number of feature channels. Default: 1024. + num_blocks (int): NUmber of basic temporal convolutional blocks. + Default: 2. + kernel_sizes (Sequence[int]): Sizes of the convolving kernel of + each basic block. Default: ``(3, 3, 3)``. + dropout (float): Dropout rate. Default: 0.25. + causal (bool): Use causal convolutions instead of symmetric + convolutions (for real-time applications). + Default: False. + residual (bool): Use residual connection. Default: True. + use_stride_conv (bool): Use TCN backbone optimized for + single-frame batching, i.e. 
where batches have input length = + receptive field, and output length = 1. This implementation + replaces dilated convolutions with strided convolutions to avoid + generating unused intermediate results. The weights are + interchangeable with the reference implementation. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: dict(type='Conv1d'). + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN1d'). + max_norm (float|None): if not None, the weight of convolution layers + will be clipped to have a maximum norm of max_norm. + + Example: + >>> from mmpose.models import TCN + >>> import torch + >>> self = TCN(in_channels=34) + >>> self.eval() + >>> inputs = torch.rand(1, 34, 243) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 1024, 235) + (1, 1024, 217) + """ + + def __init__(self, + in_channels, + stem_channels=1024, + num_blocks=2, + kernel_sizes=(3, 3, 3), + dropout=0.25, + causal=False, + residual=True, + use_stride_conv=False, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + max_norm=None): + # Protect mutable default arguments + conv_cfg = copy.deepcopy(conv_cfg) + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.in_channels = in_channels + self.stem_channels = stem_channels + self.num_blocks = num_blocks + self.kernel_sizes = kernel_sizes + self.dropout = dropout + self.causal = causal + self.residual = residual + self.use_stride_conv = use_stride_conv + self.max_norm = max_norm + + assert num_blocks == len(kernel_sizes) - 1 + for ks in kernel_sizes: + assert ks % 2 == 1, 'Only odd filter widths are supported.' + + self.expand_conv = ConvModule( + in_channels, + stem_channels, + kernel_size=kernel_sizes[0], + stride=kernel_sizes[0] if use_stride_conv else 1, + bias='auto', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + dilation = kernel_sizes[0] + self.tcn_blocks = nn.ModuleList() + for i in range(1, num_blocks + 1): + self.tcn_blocks.append( + BasicTemporalBlock( + in_channels=stem_channels, + out_channels=stem_channels, + mid_channels=stem_channels, + kernel_size=kernel_sizes[i], + dilation=dilation, + dropout=dropout, + causal=causal, + residual=residual, + use_stride_conv=use_stride_conv, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + dilation *= kernel_sizes[i] + + if self.max_norm is not None: + # Apply weight norm clip to conv layers + weight_clip = WeightNormClipHook(self.max_norm) + for module in self.modules(): + if isinstance(module, nn.modules.conv._ConvNd): + weight_clip.register(module) + + self.dropout = nn.Dropout(dropout) if dropout > 0 else None + + def forward(self, x): + """Forward function.""" + x = self.expand_conv(x) + + if self.dropout is not None: + x = self.dropout(x) + + outs = [] + for i in range(self.num_blocks): + x = self.tcn_blocks[i](x) + outs.append(x) + + return tuple(outs) + + def init_weights(self, pretrained=None): + """Initialize the weights.""" + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.modules.conv._ConvNd): + kaiming_init(m, mode='fan_in', nonlinearity='relu') + elif isinstance(m, _BatchNorm): + constant_init(m, 1) diff --git a/main/transformer_utils/mmpose/models/backbones/utils/__init__.py b/main/transformer_utils/mmpose/models/backbones/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52a30ca9f7c8e90b6c6fa2fd8a9705ca0403b259 --- /dev/null +++ 
b/main/transformer_utils/mmpose/models/backbones/utils/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .channel_shuffle import channel_shuffle +from .inverted_residual import InvertedResidual +from .make_divisible import make_divisible +from .se_layer import SELayer +from .utils import load_checkpoint + +__all__ = [ + 'channel_shuffle', 'make_divisible', 'InvertedResidual', 'SELayer', + 'load_checkpoint' +] diff --git a/main/transformer_utils/mmpose/models/backbones/utils/channel_shuffle.py b/main/transformer_utils/mmpose/models/backbones/utils/channel_shuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..aedd826bee690d42d92ed8a7f538b221e5b069e2 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/utils/channel_shuffle.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def channel_shuffle(x, groups): + """Channel Shuffle operation. + + This function enables cross-group information flow for multiple groups + convolution layers. + + Args: + x (Tensor): The input tensor. + groups (int): The number of groups to divide the input tensor + in the channel dimension. + + Returns: + Tensor: The output tensor after channel shuffle operation. + """ + + batch_size, num_channels, height, width = x.size() + assert (num_channels % groups == 0), ('num_channels should be ' + 'divisible by groups') + channels_per_group = num_channels // groups + + x = x.view(batch_size, groups, channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + x = x.view(batch_size, groups * channels_per_group, height, width) + + return x diff --git a/main/transformer_utils/mmpose/models/backbones/utils/ckpt_convert.py b/main/transformer_utils/mmpose/models/backbones/utils/ckpt_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..14a43892c6630be31e915ed1f8b9164ba250e8bd --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/utils/ckpt_convert.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# This script consists of several convert functions which +# can modify the weights of model in original repo to be +# pre-trained weights. + +from collections import OrderedDict + + +def swin_converter(ckpt): + + new_ckpt = OrderedDict() + + def correct_unfold_reduction_order(x): + out_channel, in_channel = x.shape + x = x.reshape(out_channel, 4, in_channel // 4) + x = x[:, [0, 2, 1, 3], :].transpose(1, + 2).reshape(out_channel, in_channel) + return x + + def correct_unfold_norm_order(x): + in_channel = x.shape[0] + x = x.reshape(4, in_channel // 4) + x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel) + return x + + for k, v in ckpt.items(): + if k.startswith('head'): + continue + elif k.startswith('layers'): + new_v = v + if 'attn.' in k: + new_k = k.replace('attn.', 'attn.w_msa.') + elif 'mlp.' in k: + if 'mlp.fc1.' in k: + new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.') + elif 'mlp.fc2.' in k: + new_k = k.replace('mlp.fc2.', 'ffn.layers.1.') + else: + new_k = k.replace('mlp.', 'ffn.') + elif 'downsample' in k: + new_k = k + if 'reduction.' in k: + new_v = correct_unfold_reduction_order(v) + elif 'norm.' in k: + new_v = correct_unfold_norm_order(v) + else: + new_k = k + new_k = new_k.replace('layers', 'stages', 1) + elif k.startswith('patch_embed'): + new_v = v + if 'proj' in k: + new_k = k.replace('proj', 'projection') + else: + new_k = k + else: + new_v = v + new_k = k + + new_ckpt['backbone.' 
+ new_k] = new_v + + return new_ckpt diff --git a/main/transformer_utils/mmpose/models/backbones/utils/inverted_residual.py b/main/transformer_utils/mmpose/models/backbones/utils/inverted_residual.py new file mode 100644 index 0000000000000000000000000000000000000000..dff762c570550e4a738ae1833a4c82c18777115d --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/utils/inverted_residual.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule + +from .se_layer import SELayer + + +class InvertedResidual(nn.Module): + """Inverted Residual Block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + groups (None or int): The group number of the depthwise convolution. + Default: None, which means group number = mid_channels. + stride (int): The stride of the depthwise convolution. Default: 1. + se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. + Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. 
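+
+    Example:
+        >>> # With stride 1 and in_channels == out_channels the block adds a
+        >>> # residual shortcut and preserves the input shape.
+        >>> import torch
+        >>> from mmpose.models.backbones.utils import InvertedResidual
+        >>> block = InvertedResidual(
+        ...     in_channels=16, out_channels=16, mid_channels=64)
+        >>> x = torch.rand(1, 16, 56, 56)
+        >>> print(tuple(block(x).shape))
+        (1, 16, 56, 56)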
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + groups=None, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + act_cfg = copy.deepcopy(act_cfg) + super().__init__() + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2] + self.with_cp = with_cp + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if groups is None: + groups = mid_channels + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=groups, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if self.with_se: + self.se = SELayer(**se_cfg) + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + out + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out diff --git a/main/transformer_utils/mmpose/models/backbones/utils/make_divisible.py b/main/transformer_utils/mmpose/models/backbones/utils/make_divisible.py new file mode 100644 index 0000000000000000000000000000000000000000..b7666be65939d5c76057e73927c230029cb1871d --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/utils/make_divisible.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number down to the nearest value that can + be divisible by the divisor. + + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int, optional): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. + min_ratio (float, optional): The minimum ratio of the rounded channel + number to the original channel number. Default: 0.9. + Returns: + int: The modified output channel number + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). 
+ if new_value < min_ratio * value: + new_value += divisor + return new_value diff --git a/main/transformer_utils/mmpose/models/backbones/utils/se_layer.py b/main/transformer_utils/mmpose/models/backbones/utils/se_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..07f70802eb1b98b1f22516ba62b1533557f428ed --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/utils/se_layer.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch.nn as nn +from mmcv.cnn import ConvModule + + +class SELayer(nn.Module): + """Squeeze-and-Excitation Module. + + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Default: 16. + conv_cfg (None or dict): Config dict for convolution layer. + Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Default: (dict(type='ReLU'), dict(type='Sigmoid')) + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + act_cfg=(dict(type='ReLU'), dict(type='Sigmoid'))): + super().__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out diff --git a/main/transformer_utils/mmpose/models/backbones/utils/utils.py b/main/transformer_utils/mmpose/models/backbones/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2a53c94a90a1802cc0c4dcfceba241711c989640 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/utils/utils.py @@ -0,0 +1,612 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict + +from mmcv.runner.checkpoint import _load_checkpoint, load_state_dict + + +# Copyright (c) Open-MMLab. All rights reserved. 
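+# NOTE: the helpers below largely mirror ``mmcv.runner.checkpoint``;
+# ``load_state_dict`` and ``_load_checkpoint`` are re-defined in this module
+# and shadow the imports above. The local ``load_checkpoint`` additionally
+# pads or interpolates ``patch_embed.proj.weight`` and resizes ``pos_embed``
+# so that ViT-style checkpoints trained with a different patch or input size
+# can be loaded into the backbones in this package.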
+import io +import os +import os.path as osp +import pkgutil +import time +import warnings +from collections import OrderedDict +from importlib import import_module +from tempfile import TemporaryDirectory + +import torch +import torchvision +from torch.optim import Optimizer +from torch.utils import model_zoo +from torch.nn import functional as F + +import mmcv +from mmcv.fileio import FileClient +from mmcv.fileio import load as load_file +from mmcv.parallel import is_module_wrapper +from mmcv.utils import mkdir_or_exist +from mmcv.runner import get_dist_info + +from scipy import interpolate +import numpy as np +import math + +ENV_MMCV_HOME = 'MMCV_HOME' +ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' +DEFAULT_CACHE_DIR = '~/.cache' + + +def _get_mmcv_home(): + mmcv_home = os.path.expanduser( + os.getenv( + ENV_MMCV_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) + + mkdir_or_exist(mmcv_home) + return mmcv_home + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict to a module. + This method is modified from :meth:`torch.nn.Module.load_state_dict`. + Default value for ``strict`` is set to ``False`` and the message for + param mismatch will be shown even if strict is False. + Args: + module (Module): Module that receives the state_dict. + state_dict (OrderedDict): Weights. + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. + logger (:obj:`logging.Logger`, optional): Logger to log the error + message. If not specified, print function will be used. + """ + unexpected_keys = [] + all_missing_keys = [] + err_msg = [] + + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + # use _load_from_state_dict to enable checkpoint version control + def load(module, prefix=''): + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_module_wrapper(module): + module = module.module + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, True, + all_missing_keys, unexpected_keys, + err_msg) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(module) + load = None # break load->load reference cycle + + # ignore "num_batches_tracked" of BN layers + missing_keys = [ + key for key in all_missing_keys if 'num_batches_tracked' not in key + ] + + if unexpected_keys: + err_msg.append('unexpected key in source ' + f'state_dict: {", ".join(unexpected_keys)}\n') + if missing_keys: + err_msg.append( + f'missing keys in source state_dict: {", ".join(missing_keys)}\n') + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert( + 0, 'The model and loaded state dict do not match exactly\n') + err_msg = '\n'.join(err_msg) + if strict: + raise RuntimeError(err_msg) + elif logger is not None: + logger.warning(err_msg) + else: + print(err_msg) + + +def load_url_dist(url, model_dir=None, map_location="cpu"): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + if rank == 0: + checkpoint = model_zoo.load_url(url, model_dir=model_dir, 
map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + checkpoint = model_zoo.load_url(url, model_dir=model_dir, map_location=map_location) + return checkpoint + + +def load_pavimodel_dist(model_path, map_location=None): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + try: + from pavi import modelcloud + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + if rank == 0: + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load(downloaded_file, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load( + downloaded_file, map_location=map_location) + return checkpoint + + +def load_fileclient_dist(filename, backend, map_location): + """In distributed setting, this function only download checkpoint at local + rank 0.""" + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + allowed_backends = ['ceph'] + if backend not in allowed_backends: + raise ValueError(f'Load from Backend {backend} is not supported.') + if rank == 0: + fileclient = FileClient(backend=backend) + buffer = io.BytesIO(fileclient.get(filename)) + checkpoint = torch.load(buffer, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + fileclient = FileClient(backend=backend) + buffer = io.BytesIO(fileclient.get(filename)) + checkpoint = torch.load(buffer, map_location=map_location) + return checkpoint + + +def get_torchvision_models(): + model_urls = dict() + for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module(f'torchvision.models.{name}') + if hasattr(_zoo, 'model_urls'): + _urls = getattr(_zoo, 'model_urls') + model_urls.update(_urls) + return model_urls + + +def get_external_models(): + mmcv_home = _get_mmcv_home() + default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') + default_urls = load_file(default_json_path) + assert isinstance(default_urls, dict) + external_json_path = osp.join(mmcv_home, 'open_mmlab.json') + if osp.exists(external_json_path): + external_urls = load_file(external_json_path) + assert isinstance(external_urls, dict) + default_urls.update(external_urls) + + return default_urls + + +def get_mmcls_models(): + mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') + mmcls_urls = load_file(mmcls_json_path) + + return mmcls_urls + + +def get_deprecated_model_names(): + deprecate_json_path = osp.join(mmcv.__path__[0], + 'model_zoo/deprecated.json') + deprecate_urls = load_file(deprecate_json_path) + assert isinstance(deprecate_urls, dict) + + return deprecate_urls + + +def _process_mmcls_checkpoint(checkpoint): + state_dict = checkpoint['state_dict'] + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k.startswith('backbone.'): + new_state_dict[k[9:]] = v + new_checkpoint = dict(state_dict=new_state_dict) + + return new_checkpoint + + +def _load_checkpoint(filename, map_location=None): + """Load checkpoint from somewhere (modelzoo, file, url). 
+ Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str | None): Same as :func:`torch.load`. Default: None. + Returns: + dict | OrderedDict: The loaded checkpoint. It can be either an + OrderedDict storing model weights or a dict containing other + information, which depends on the checkpoint. + """ + if filename.startswith('modelzoo://'): + warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' + 'use "torchvision://" instead') + model_urls = get_torchvision_models() + model_name = filename[11:] + checkpoint = load_url_dist(model_urls[model_name]) + elif filename.startswith('torchvision://'): + model_urls = get_torchvision_models() + model_name = filename[14:] + checkpoint = load_url_dist(model_urls[model_name]) + elif filename.startswith('open-mmlab://'): + model_urls = get_external_models() + model_name = filename[13:] + deprecated_urls = get_deprecated_model_names() + if model_name in deprecated_urls: + warnings.warn(f'open-mmlab://{model_name} is deprecated in favor ' + f'of open-mmlab://{deprecated_urls[model_name]}') + model_name = deprecated_urls[model_name] + model_url = model_urls[model_name] + # check if is url + if model_url.startswith(('http://', 'https://')): + checkpoint = load_url_dist(model_url) + else: + filename = osp.join(_get_mmcv_home(), model_url) + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + elif filename.startswith('mmcls://'): + model_urls = get_mmcls_models() + model_name = filename[8:] + checkpoint = load_url_dist(model_urls[model_name]) + checkpoint = _process_mmcls_checkpoint(checkpoint) + elif filename.startswith(('http://', 'https://')): + checkpoint = load_url_dist(filename) + elif filename.startswith('pavi://'): + model_path = filename[7:] + checkpoint = load_pavimodel_dist(model_path, map_location=map_location) + elif filename.startswith('s3://'): + checkpoint = load_fileclient_dist( + filename, backend='ceph', map_location=map_location) + else: + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0, + start_warmup_value=0, warmup_steps=-1): + warmup_schedule = np.array([]) + warmup_iters = warmup_epochs * niter_per_ep + if warmup_steps > 0: + warmup_iters = warmup_steps + print("Set warmup steps = %d" % warmup_iters) + if warmup_epochs > 0: + warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) + + iters = np.arange(epochs * niter_per_ep - warmup_iters) + schedule = np.array( + [final_value + 0.5 * (base_value - final_value) * (1 + math.cos(math.pi * i / (len(iters)))) for i in iters]) + + schedule = np.concatenate((warmup_schedule, schedule)) + + assert len(schedule) == epochs * niter_per_ep + return schedule + + +def load_checkpoint(model, + filename, + map_location='cpu', + strict=False, + logger=None, + patch_padding='pad', + ): + """Load checkpoint from a file or URI. + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. 
+ strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + patch_padding (str): 'pad' or 'bilinear' or 'bicubic', used for interpolate patch embed from 14x14 to 16x16 + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + checkpoint = _load_checkpoint(filename, map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + elif 'module' in checkpoint: + state_dict = checkpoint['module'] + else: + state_dict = checkpoint + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith('module.'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # for MoBY, load model of online branch + if sorted(list(state_dict.keys()))[0].startswith('encoder'): + state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')} + + rank, _ = get_dist_info() + + if 'patch_embed.proj.weight' in state_dict: + proj_weight = state_dict['patch_embed.proj.weight'] + orig_size = proj_weight.shape[2:] + current_size = model.patch_embed.proj.weight.shape[2:] + padding_size = current_size[0] - orig_size[0] + padding_l = padding_size // 2 + padding_r = padding_size - padding_l + if orig_size != current_size: + if 'pad' in patch_padding: + proj_weight = torch.nn.functional.pad(proj_weight, (padding_l, padding_r, padding_l, padding_r)) + elif 'bilinear' in patch_padding: + proj_weight = torch.nn.functional.interpolate(proj_weight, size=current_size, mode='bilinear', align_corners=False) + elif 'bicubic' in patch_padding: + proj_weight = torch.nn.functional.interpolate(proj_weight, size=current_size, mode='bicubic', align_corners=False) + state_dict['patch_embed.proj.weight'] = proj_weight + + if 'pos_embed' in state_dict: + pos_embed_checkpoint = state_dict['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + H, W = model.patch_embed.patch_shape + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + if rank == 0: + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, H, W)) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(H, W), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + state_dict['pos_embed'] = new_pos_embed + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + + +def weights_to_cpu(state_dict): + """Copy a model state_dict to cpu. + Args: + state_dict (OrderedDict): Model weights on GPU. + Returns: + OrderedDict: Model weights on GPU. 
+ """ + state_dict_cpu = OrderedDict() + for key, val in state_dict.items(): + state_dict_cpu[key] = val.cpu() + return state_dict_cpu + + +def _save_to_state_dict(module, destination, prefix, keep_vars): + """Saves module state to `destination` dictionary. + This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. + Args: + module (nn.Module): The module to generate state_dict. + destination (dict): A dict where state will be stored. + prefix (str): The prefix for parameters and buffers used in this + module. + """ + for name, param in module._parameters.items(): + if param is not None: + destination[prefix + name] = param if keep_vars else param.detach() + for name, buf in module._buffers.items(): + # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d + if buf is not None: + destination[prefix + name] = buf if keep_vars else buf.detach() + + +def get_state_dict(module, destination=None, prefix='', keep_vars=False): + """Returns a dictionary containing a whole state of the module. + Both parameters and persistent buffers (e.g. running averages) are + included. Keys are corresponding parameter and buffer names. + This method is modified from :meth:`torch.nn.Module.state_dict` to + recursively check parallel module in case that the model has a complicated + structure, e.g., nn.Module(nn.Module(DDP)). + Args: + module (nn.Module): The module to generate state_dict. + destination (OrderedDict): Returned dict for the state of the + module. + prefix (str): Prefix of the key. + keep_vars (bool): Whether to keep the variable property of the + parameters. Default: False. + Returns: + dict: A dictionary containing a whole state of the module. + """ + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_module_wrapper(module): + module = module.module + + # below is the same as torch.nn.Module.state_dict() + if destination is None: + destination = OrderedDict() + destination._metadata = OrderedDict() + destination._metadata[prefix[:-1]] = local_metadata = dict( + version=module._version) + _save_to_state_dict(module, destination, prefix, keep_vars) + for name, child in module._modules.items(): + if child is not None: + get_state_dict( + child, destination, prefix + name + '.', keep_vars=keep_vars) + for hook in module._state_dict_hooks.values(): + hook_result = hook(module, destination, prefix, local_metadata) + if hook_result is not None: + destination = hook_result + return destination + + +def save_checkpoint(model, filename, optimizer=None, meta=None): + """Save checkpoint to file. + The checkpoint will have 3 fields: ``meta``, ``state_dict`` and + ``optimizer``. By default ``meta`` will contain version and time info. + Args: + model (Module): Module whose params are to be saved. + filename (str): Checkpoint filename. + optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. + meta (dict, optional): Metadata to be saved in checkpoint. 
+ """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError(f'meta must be a dict or None, but got {type(meta)}') + meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) + + if is_module_wrapper(model): + model = model.module + + if hasattr(model, 'CLASSES') and model.CLASSES is not None: + # save class name to the meta + meta.update(CLASSES=model.CLASSES) + + checkpoint = { + 'meta': meta, + 'state_dict': weights_to_cpu(get_state_dict(model)) + } + # save optimizer state dict in the checkpoint + if isinstance(optimizer, Optimizer): + checkpoint['optimizer'] = optimizer.state_dict() + elif isinstance(optimizer, dict): + checkpoint['optimizer'] = {} + for name, optim in optimizer.items(): + checkpoint['optimizer'][name] = optim.state_dict() + + if filename.startswith('pavi://'): + try: + from pavi import modelcloud + from pavi.exception import NodeNotFoundError + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + model_path = filename[7:] + root = modelcloud.Folder() + model_dir, model_name = osp.split(model_path) + try: + model = modelcloud.get(model_dir) + except NodeNotFoundError: + model = root.create_training_model(model_dir) + with TemporaryDirectory() as tmp_dir: + checkpoint_file = osp.join(tmp_dir, model_name) + with open(checkpoint_file, 'wb') as f: + torch.save(checkpoint, f) + f.flush() + model.create_file(checkpoint_file, name=model_name) + else: + mmcv.mkdir_or_exist(osp.dirname(filename)) + # immediately flush buffer + with open(filename, 'wb') as f: + torch.save(checkpoint, f) + f.flush() + + +# def load_checkpoint(model, +# filename, +# map_location='cpu', +# strict=False, +# logger=None): +# """Load checkpoint from a file or URI. +# +# Args: +# model (Module): Module to load checkpoint. +# filename (str): Accept local filepath, URL, ``torchvision://xxx``, +# ``open-mmlab://xxx``. +# map_location (str): Same as :func:`torch.load`. +# strict (bool): Whether to allow different params for the model and +# checkpoint. +# logger (:mod:`logging.Logger` or None): The logger for error message. +# +# Returns: +# dict or OrderedDict: The loaded checkpoint. +# """ +# checkpoint = _load_checkpoint(filename, map_location) +# # OrderedDict is a subclass of dict +# if not isinstance(checkpoint, dict): +# raise RuntimeError( +# f'No state_dict found in checkpoint file {filename}') +# # get state_dict from checkpoint +# if 'state_dict' in checkpoint: +# state_dict_tmp = checkpoint['state_dict'] +# else: +# state_dict_tmp = checkpoint +# +# state_dict = OrderedDict() +# # strip prefix of state_dict +# for k, v in state_dict_tmp.items(): +# if k.startswith('module.backbone.'): +# state_dict[k[16:]] = v +# elif k.startswith('module.'): +# state_dict[k[7:]] = v +# elif k.startswith('backbone.'): +# state_dict[k[9:]] = v +# else: +# state_dict[k] = v +# # load state_dict +# load_state_dict(model, state_dict, strict, logger) +# return checkpoint +# +# +# def get_state_dict(filename, map_location='cpu'): +# """Get state_dict from a file or URI. +# +# Args: +# filename (str): Accept local filepath, URL, ``torchvision://xxx``, +# ``open-mmlab://xxx``. +# map_location (str): Same as :func:`torch.load`. +# +# Returns: +# OrderedDict: The state_dict. 
+# """ +# checkpoint = _load_checkpoint(filename, map_location) +# # OrderedDict is a subclass of dict +# if not isinstance(checkpoint, dict): +# raise RuntimeError( +# f'No state_dict found in checkpoint file {filename}') +# # get state_dict from checkpoint +# if 'state_dict' in checkpoint: +# state_dict_tmp = checkpoint['state_dict'] +# else: +# state_dict_tmp = checkpoint +# +# state_dict = OrderedDict() +# # strip prefix of state_dict +# for k, v in state_dict_tmp.items(): +# if k.startswith('module.backbone.'): +# state_dict[k[16:]] = v +# elif k.startswith('module.'): +# state_dict[k[7:]] = v +# elif k.startswith('backbone.'): +# state_dict[k[9:]] = v +# else: +# state_dict[k] = v +# +# return state_dict diff --git a/main/transformer_utils/mmpose/models/backbones/v2v_net.py b/main/transformer_utils/mmpose/models/backbones/v2v_net.py new file mode 100644 index 0000000000000000000000000000000000000000..99462af711069a34c13628364e2c466163507861 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/v2v_net.py @@ -0,0 +1,257 @@ +# ------------------------------------------------------------------------------ +# Copyright and License Information +# Adapted from +# https://github.com/microsoft/voxelpose-pytorch/blob/main/lib/models/v2v_net.py +# Original Licence: MIT License +# ------------------------------------------------------------------------------ + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class Basic3DBlock(nn.Module): + """A basic 3D convolutional block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the convolution operation + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: dict(type='Conv3d') + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN3d') + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d')): + super(Basic3DBlock, self).__init__() + self.block = ConvModule( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True) + + def forward(self, x): + """Forward function.""" + return self.block(x) + + +class Res3DBlock(nn.Module): + """A residual 3D convolutional block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the convolution operation + Default: 3 + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: dict(type='Conv3d') + norm_cfg (dict): Dictionary to construct and config norm layer. 
+ Default: dict(type='BN3d') + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d')): + super(Res3DBlock, self).__init__() + self.res_branch = nn.Sequential( + ConvModule( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True), + ConvModule( + out_channels, + out_channels, + kernel_size, + stride=1, + padding=((kernel_size - 1) // 2), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True)) + + if in_channels == out_channels: + self.skip_con = nn.Sequential() + else: + self.skip_con = ConvModule( + in_channels, + out_channels, + 1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True) + + def forward(self, x): + """Forward function.""" + res = self.res_branch(x) + skip = self.skip_con(x) + return F.relu(res + skip, True) + + +class Pool3DBlock(nn.Module): + """A 3D max-pool block. + + Args: + pool_size (int): Pool size of the 3D max-pool layer + """ + + def __init__(self, pool_size): + super(Pool3DBlock, self).__init__() + self.pool_size = pool_size + + def forward(self, x): + """Forward function.""" + return F.max_pool3d( + x, kernel_size=self.pool_size, stride=self.pool_size) + + +class Upsample3DBlock(nn.Module): + """A 3D upsample block. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + kernel_size (int): Kernel size of the transposed convolution operation. + Default: 2 + stride (int): Kernel size of the transposed convolution operation. + Default: 2 + """ + + def __init__(self, in_channels, out_channels, kernel_size=2, stride=2): + super(Upsample3DBlock, self).__init__() + assert kernel_size == 2 + assert stride == 2 + self.block = nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=0, + output_padding=0), nn.BatchNorm3d(out_channels), nn.ReLU(True)) + + def forward(self, x): + """Forward function.""" + return self.block(x) + + +class EncoderDecorder(nn.Module): + """An encoder-decoder block. + + Args: + in_channels (int): Input channels of this block + """ + + def __init__(self, in_channels=32): + super(EncoderDecorder, self).__init__() + + self.encoder_pool1 = Pool3DBlock(2) + self.encoder_res1 = Res3DBlock(in_channels, in_channels * 2) + self.encoder_pool2 = Pool3DBlock(2) + self.encoder_res2 = Res3DBlock(in_channels * 2, in_channels * 4) + + self.mid_res = Res3DBlock(in_channels * 4, in_channels * 4) + + self.decoder_res2 = Res3DBlock(in_channels * 4, in_channels * 4) + self.decoder_upsample2 = Upsample3DBlock(in_channels * 4, + in_channels * 2, 2, 2) + self.decoder_res1 = Res3DBlock(in_channels * 2, in_channels * 2) + self.decoder_upsample1 = Upsample3DBlock(in_channels * 2, in_channels, + 2, 2) + + self.skip_res1 = Res3DBlock(in_channels, in_channels) + self.skip_res2 = Res3DBlock(in_channels * 2, in_channels * 2) + + def forward(self, x): + """Forward function.""" + skip_x1 = self.skip_res1(x) + x = self.encoder_pool1(x) + x = self.encoder_res1(x) + + skip_x2 = self.skip_res2(x) + x = self.encoder_pool2(x) + x = self.encoder_res2(x) + + x = self.mid_res(x) + + x = self.decoder_res2(x) + x = self.decoder_upsample2(x) + x = x + skip_x2 + + x = self.decoder_res1(x) + x = self.decoder_upsample1(x) + x = x + skip_x1 + + return x + + +@BACKBONES.register_module() +class V2VNet(BaseBackbone): + """V2VNet. 
+ + Please refer to the `paper ` + for details. + + Args: + input_channels (int): + Number of channels of the input feature volume. + output_channels (int): + Number of channels of the output volume. + mid_channels (int): + Input and output channels of the encoder-decoder block. + """ + + def __init__(self, input_channels, output_channels, mid_channels=32): + super(V2VNet, self).__init__() + + self.front_layers = nn.Sequential( + Basic3DBlock(input_channels, mid_channels // 2, 7), + Res3DBlock(mid_channels // 2, mid_channels), + ) + + self.encoder_decoder = EncoderDecorder(in_channels=mid_channels) + + self.output_layer = nn.Conv3d( + mid_channels, output_channels, kernel_size=1, stride=1, padding=0) + + self._initialize_weights() + + def forward(self, x): + """Forward function.""" + x = self.front_layers(x) + x = self.encoder_decoder(x) + x = self.output_layer(x) + + return x + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv3d): + nn.init.normal_(m.weight, 0, 0.001) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.ConvTranspose3d): + nn.init.normal_(m.weight, 0, 0.001) + nn.init.constant_(m.bias, 0) diff --git a/main/transformer_utils/mmpose/models/backbones/vgg.py b/main/transformer_utils/mmpose/models/backbones/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..f7d467017a5520f399c84b1235ec64c99b805b42 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/vgg.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, constant_init, kaiming_init, normal_init +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +def make_vgg_layer(in_channels, + out_channels, + num_blocks, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + dilation=1, + with_norm=False, + ceil_mode=False): + layers = [] + for _ in range(num_blocks): + layer = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + dilation=dilation, + padding=dilation, + bias=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + layers.append(layer) + in_channels = out_channels + layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) + + return layers + + +@BACKBONES.register_module() +class VGG(BaseBackbone): + """VGG backbone. + + Args: + depth (int): Depth of vgg, from {11, 13, 16, 19}. + with_norm (bool): Use BatchNorm or not. + num_classes (int): number of classes for classification. + num_stages (int): VGG stages, normally 5. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. When it is None, the default behavior depends on + whether num_classes is specified. If num_classes <= 0, the default + value is (4, ), outputting the last feature map before classifier. + If num_classes > 0, the default value is (5, ), outputting the + classification score. Default: None. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + ceil_mode (bool): Whether to use ceil_mode of MaxPool. Default: False. 
+ with_last_pool (bool): Whether to keep the last pooling before + classifier. Default: True. + """ + + # Parameters to build layers. Each element specifies the number of conv in + # each stage. For example, VGG11 contains 11 layers with learnable + # parameters. 11 is computed as 11 = (1 + 1 + 2 + 2 + 2) + 3, + # where 3 indicates the last three fully-connected layers. + arch_settings = { + 11: (1, 1, 2, 2, 2), + 13: (2, 2, 2, 2, 2), + 16: (2, 2, 3, 3, 3), + 19: (2, 2, 4, 4, 4) + } + + def __init__(self, + depth, + num_classes=-1, + num_stages=5, + dilations=(1, 1, 1, 1, 1), + out_indices=None, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + norm_eval=False, + ceil_mode=False, + with_last_pool=True): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for vgg') + assert num_stages >= 1 and num_stages <= 5 + stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + assert len(dilations) == num_stages + + self.num_classes = num_classes + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + with_norm = norm_cfg is not None + + if out_indices is None: + out_indices = (5, ) if num_classes > 0 else (4, ) + assert max(out_indices) <= num_stages + self.out_indices = out_indices + + self.in_channels = 3 + start_idx = 0 + vgg_layers = [] + self.range_sub_modules = [] + for i, num_blocks in enumerate(self.stage_blocks): + num_modules = num_blocks + 1 + end_idx = start_idx + num_modules + dilation = dilations[i] + out_channels = 64 * 2**i if i < 4 else 512 + vgg_layer = make_vgg_layer( + self.in_channels, + out_channels, + num_blocks, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dilation=dilation, + with_norm=with_norm, + ceil_mode=ceil_mode) + vgg_layers.extend(vgg_layer) + self.in_channels = out_channels + self.range_sub_modules.append([start_idx, end_idx]) + start_idx = end_idx + if not with_last_pool: + vgg_layers.pop(-1) + self.range_sub_modules[-1][1] -= 1 + self.module_name = 'features' + self.add_module(self.module_name, nn.Sequential(*vgg_layers)) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained=None): + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, _BatchNorm): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + + def forward(self, x): + outs = [] + vgg_layers = getattr(self, self.module_name) + for i in range(len(self.stage_blocks)): + for j in range(*self.range_sub_modules[i]): + vgg_layer = vgg_layers[j] + x = vgg_layer(x) + if i in self.out_indices: + outs.append(x) + if self.num_classes > 0: + x = x.view(x.size(0), -1) + x = self.classifier(x) + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def _freeze_stages(self): + vgg_layers = getattr(self, self.module_name) + for i in range(self.frozen_stages): + for j in range(*self.range_sub_modules[i]): + m = vgg_layers[j] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() 
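A minimal usage sketch for the `VGG` backbone above, assuming it is re-exported from `mmpose.models` like the other registered backbones (cf. the `TCN` docstring example earlier in this diff): it builds the network directly and prints the multi-stage feature shapes for a 224x224 input.

```python
import torch

from mmpose.models import VGG  # assumed export path, mirroring the TCN example

# Return the feature maps after stages 3, 4 and 5 of a VGG-16.
model = VGG(depth=16, out_indices=(2, 3, 4))
model.eval()

with torch.no_grad():
    feats = model(torch.rand(1, 3, 224, 224))

for feat in feats:
    print(tuple(feat.shape))
# Expected with these settings:
# (1, 256, 28, 28), (1, 512, 14, 14), (1, 512, 7, 7)
```

Because the class is decorated with `@BACKBONES.register_module()`, the same module can equally be built from a config dict (e.g. `dict(type='VGG', depth=16, out_indices=(2, 3, 4))`) through the `BACKBONES` registry.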
diff --git a/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py b/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py new file mode 100644 index 0000000000000000000000000000000000000000..ed990e3966b27301dbaf081e3ec0e908704dfc8b --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/vipnas_mbv3.py @@ -0,0 +1,179 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging + +import torch.nn as nn +from mmcv.cnn import ConvModule +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from .utils import InvertedResidual, load_checkpoint + + +@BACKBONES.register_module() +class ViPNAS_MobileNetV3(BaseBackbone): + """ViPNAS_MobileNetV3 backbone. + + "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search" + More details can be found in the `paper + `__ . + + Args: + wid (list(int)): Searched width config for each stage. + expan (list(int)): Searched expansion ratio config for each stage. + dep (list(int)): Searched depth config for each stage. + ks (list(int)): Searched kernel size config for each stage. + group (list(int)): Searched group number config for each stage. + att (list(bool)): Searched attention config for each stage. + stride (list(int)): Stride config for each stage. + act (list(dict)): Activation config for each stage. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. 
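+
+    Example:
+        >>> # Illustrative forward pass with the default searched config;
+        >>> # assumes ViPNAS_MobileNetV3 is exported from mmpose.models.
+        >>> from mmpose.models import ViPNAS_MobileNetV3
+        >>> import torch
+        >>> self = ViPNAS_MobileNetV3()
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> out = self.forward(inputs)
+        >>> print(tuple(out.shape))
+        (1, 160, 7, 7)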
+ """ + + def __init__(self, + wid=[16, 16, 24, 40, 80, 112, 160], + expan=[None, 1, 5, 4, 5, 5, 6], + dep=[None, 1, 4, 4, 4, 4, 4], + ks=[3, 3, 7, 7, 5, 7, 5], + group=[None, 8, 120, 20, 100, 280, 240], + att=[None, True, True, False, True, True, True], + stride=[2, 1, 2, 2, 2, 1, 2], + act=[ + 'HSwish', 'ReLU', 'ReLU', 'ReLU', 'HSwish', 'HSwish', + 'HSwish' + ], + conv_cfg=None, + norm_cfg=dict(type='BN'), + frozen_stages=-1, + norm_eval=False, + with_cp=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + self.wid = wid + self.expan = expan + self.dep = dep + self.ks = ks + self.group = group + self.att = att + self.stride = stride + self.act = act + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.wid[0], + kernel_size=self.ks[0], + stride=self.stride[0], + padding=self.ks[0] // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=dict(type=self.act[0])) + + self.layers = self._make_layer() + + def _make_layer(self): + layers = [] + layer_index = 0 + for i, dep in enumerate(self.dep[1:]): + mid_channels = self.wid[i + 1] * self.expan[i + 1] + + if self.att[i + 1]: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), dict(type='HSigmoid'))) + else: + se_cfg = None + + if self.expan[i + 1] == 1: + with_expand_conv = False + else: + with_expand_conv = True + + for j in range(dep): + if j == 0: + stride = self.stride[i + 1] + in_channels = self.wid[i] + else: + stride = 1 + in_channels = self.wid[i + 1] + + layer = InvertedResidual( + in_channels=in_channels, + out_channels=self.wid[i + 1], + mid_channels=mid_channels, + kernel_size=self.ks[i + 1], + groups=self.group[i + 1], + stride=stride, + se_cfg=se_cfg, + with_expand_conv=with_expand_conv, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=self.act[i + 1]), + with_cp=self.with_cp) + layer_index += 1 + layer_name = f'layer{layer_index}' + self.add_module(layer_name, layer) + layers.append(layer_name) + return layers + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + + return x + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py b/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py new file mode 100644 index 
0000000000000000000000000000000000000000..81b028ed5f5caad5f59c68b7f82c1a4661cf4d6f --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/vipnas_resnet.py @@ -0,0 +1,589 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, build_conv_layer, build_norm_layer +from mmcv.cnn.bricks import ContextBlock +from mmcv.utils.parrots_wrapper import _BatchNorm + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone + + +class ViPNAS_Bottleneck(nn.Module): + """Bottleneck block for ViPNAS_ResNet. + + Args: + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int): The ratio of ``out_channels/mid_channels`` where + ``mid_channels`` is the input/output channels of conv2. Default: 4. + stride (int): stride of the block. Default: 1 + dilation (int): dilation of convolution. Default: 1 + downsample (nn.Module): downsample operation on identity branch. + Default: None. + style (str): ``"pytorch"`` or ``"caffe"``. If set to "pytorch", the + stride-two layer is the 3x3 conv layer, otherwise the stride-two + layer is the first 1x1 conv layer. Default: "pytorch". + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + kernel_size (int): kernel size of conv2 searched in ViPANS. + groups (int): group number of conv2 searched in ViPNAS. + attention (bool): whether to use attention module in the end of + the block. + """ + + def __init__(self, + in_channels, + out_channels, + expansion=4, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + kernel_size=3, + groups=1, + attention=False): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + assert style in ['pytorch', 'caffe'] + + self.in_channels = in_channels + self.out_channels = out_channels + self.expansion = expansion + assert out_channels % expansion == 0 + self.mid_channels = out_channels // expansion + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, self.mid_channels, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, out_channels, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + self.mid_channels, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, + self.mid_channels, + self.mid_channels, + kernel_size=kernel_size, + stride=self.conv2_stride, + padding=kernel_size // 2, + groups=groups, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + self.mid_channels, + out_channels, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + if attention: + self.attention = ContextBlock(out_channels, + 
max(1.0 / 16, 16.0 / out_channels)) + else: + self.attention = None + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: the normalization layer named "norm3" """ + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.norm3(out) + + if self.attention is not None: + out = self.attention(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def get_expansion(block, expansion=None): + """Get the expansion of a residual block. + + The block expansion will be obtained by the following order: + + 1. If ``expansion`` is given, just return it. + 2. If ``block`` has the attribute ``expansion``, then return + ``block.expansion``. + 3. Return the default value according the the block type: + 4 for ``ViPNAS_Bottleneck``. + + Args: + block (class): The block class. + expansion (int | None): The given expansion ratio. + + Returns: + int: The expansion of the block. + """ + if isinstance(expansion, int): + assert expansion > 0 + elif expansion is None: + if hasattr(block, 'expansion'): + expansion = block.expansion + elif issubclass(block, ViPNAS_Bottleneck): + expansion = 1 + else: + raise TypeError(f'expansion is not specified for {block.__name__}') + else: + raise TypeError('expansion must be an integer or None') + + return expansion + + +class ViPNAS_ResLayer(nn.Sequential): + """ViPNAS_ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): Residual block used to build ViPNAS ResLayer. + num_blocks (int): Number of blocks. + in_channels (int): Input channels of this block. + out_channels (int): Output channels of this block. + expansion (int, optional): The expansion for BasicBlock/Bottleneck. + If not specified, it will firstly be obtained via + ``block.expansion``. If the block has no attribute "expansion", + the following default values will be used: 1 for BasicBlock and + 4 for Bottleneck. Default: None. + stride (int): stride of the first block. Default: 1. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + kernel_size (int): Kernel Size of the corresponding convolution layer + searched in the block. + groups (int): Group number of the corresponding convolution layer + searched in the block. + attention (bool): Whether to use attention module in the end of the + block. 
+ """ + + def __init__(self, + block, + num_blocks, + in_channels, + out_channels, + expansion=None, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + kernel_size=3, + groups=1, + attention=False, + **kwargs): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + self.block = block + self.expansion = get_expansion(block, expansion) + + downsample = None + if stride != 1 or in_channels != out_channels: + downsample = [] + conv_stride = stride + if avg_down and stride != 1: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, out_channels)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + in_channels = out_channels + for _ in range(1, num_blocks): + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + else: # downsample_first=False is for HourglassModule + for i in range(0, num_blocks - 1): + layers.append( + block( + in_channels=in_channels, + out_channels=in_channels, + expansion=self.expansion, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + layers.append( + block( + in_channels=in_channels, + out_channels=out_channels, + expansion=self.expansion, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=kernel_size, + groups=groups, + attention=attention, + **kwargs)) + + super().__init__(*layers) + + +@BACKBONES.register_module() +class ViPNAS_ResNet(BaseBackbone): + """ViPNAS_ResNet backbone. + + "ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search" + More details can be found in the `paper + `__ . + + Args: + depth (int): Network depth, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Stages of the network. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: ``(1, 2, 2, 2)``. + dilations (Sequence[int]): Dilation of each stage. + Default: ``(1, 1, 1, 1)``. + out_indices (Sequence[int]): Output from which stages. If only one + stage is specified, a single tensor (feature map) is returned, + otherwise multiple stages are specified, a tuple of tensors will + be returned. Default: ``(3, )``. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. 
+ conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict): The config dict for norm layers. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + wid (list(int)): Searched width config for each stage. + expan (list(int)): Searched expansion ratio config for each stage. + dep (list(int)): Searched depth config for each stage. + ks (list(int)): Searched kernel size config for each stage. + group (list(int)): Searched group number config for each stage. + att (list(bool)): Searched attention config for each stage. + """ + + arch_settings = { + 50: ViPNAS_Bottleneck, + } + + def __init__(self, + depth, + in_channels=3, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(3, ), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=True, + wid=[48, 80, 160, 304, 608], + expan=[None, 1, 1, 1, 1], + dep=[None, 4, 6, 7, 3], + ks=[7, 3, 5, 5, 5], + group=[None, 16, 16, 16, 16], + att=[None, True, False, True, True]): + # Protect mutable default arguments + norm_cfg = copy.deepcopy(norm_cfg) + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.stem_channels = dep[0] + self.num_stages = num_stages + assert 1 <= num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.zero_init_residual = zero_init_residual + self.block = self.arch_settings[depth] + self.stage_blocks = dep[1:1 + num_stages] + + self._make_stem_layer(in_channels, wid[0], ks[0]) + + self.res_layers = [] + _in_channels = wid[0] + for i, num_blocks in enumerate(self.stage_blocks): + expansion = get_expansion(self.block, expan[i + 1]) + _out_channels = wid[i + 1] * expansion + stride = strides[i] + dilation = dilations[i] + res_layer = self.make_res_layer( + block=self.block, + num_blocks=num_blocks, + in_channels=_in_channels, + out_channels=_out_channels, + expansion=expansion, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + kernel_size=ks[i + 1], + groups=group[i + 1], + attention=att[i + 1]) + _in_channels = _out_channels + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = res_layer[-1].out_channels + + def make_res_layer(self, **kwargs): + """Make a ViPNAS ResLayer.""" + return ViPNAS_ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels, 
kernel_size): + """Make stem layer.""" + if self.deep_stem: + self.stem = nn.Sequential( + ConvModule( + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True), + ConvModule( + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=kernel_size, + stride=2, + padding=kernel_size // 2, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize model weights.""" + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + for name, _ in m.named_parameters(): + if name in ['bias']: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/backbones/vit.py b/main/transformer_utils/mmpose/models/backbones/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..09a2294d46716cef6c3d08d37369c9e1f853b0f7 --- /dev/null +++ b/main/transformer_utils/mmpose/models/backbones/vit.py @@ -0,0 +1,327 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +from functools import partial +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from timm.models.layers import drop_path, to_2tuple, trunc_normal_ + +from ..builder import BACKBONES +from .base_backbone import BaseBackbone +from einops import repeat + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self): + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None, ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.dim = dim + + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + + self.scale = qk_scale or head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, + norm_layer=nn.LayerNorm, attn_head_dim=None + ): + super().__init__() + + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, + attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim + ) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, ratio=1): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (ratio ** 2) + self.patch_shape = (int(img_size[0] // patch_size[0] * ratio), int(img_size[1] // patch_size[1] * ratio)) + self.origin_patch_shape = (int(img_size[0] // patch_size[0]), int(img_size[1] // patch_size[1])) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=(patch_size[0] // ratio), + padding=4 + 2 * (ratio // 2 - 1)) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + + x = x.flatten(2).transpose(1, 2) + return x, (Hp, Wp) + + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + """ + + def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + self.img_size = img_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1] + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + feature_dim = self.backbone.feature_info.channels()[-1] + self.num_patches = feature_size[0] * feature_size[1] + self.proj = nn.Linear(feature_dim, embed_dim) + + def forward(self, x): + x = self.backbone(x)[-1] + x = x.flatten(2).transpose(1, 2) + x = self.proj(x) + return x + + +@BACKBONES.register_module() +class ViT(BaseBackbone): + + def __init__(self, + img_size=224, patch_size=16, in_chans=3, num_classes=80, embed_dim=768, depth=12, + num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., hybrid_backbone=None, norm_layer=None, use_checkpoint=False, + frozen_stages=-1, ratio=1, last_norm=True, + patch_padding='pad', freeze_attn=False, freeze_ffn=False, task_tokens_num=1+1+2+2+25 + ): + # Protect mutable default arguments + super(ViT, self).__init__() + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.frozen_stages = frozen_stages + self.use_checkpoint = use_checkpoint + self.patch_padding = patch_padding + self.freeze_attn = freeze_attn + self.freeze_ffn = freeze_ffn + self.depth = depth + self.task_tokens_num = task_tokens_num + + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, ratio=ratio) + num_patches = 
self.patch_embed.num_patches + + # task tokens for HPS estimation + self.task_tokens = nn.Parameter(torch.zeros(1, task_tokens_num, embed_dim)) + trunc_normal_(self.task_tokens, std=.02) + + # since the pretraining model has class token + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + ) + for i in range(depth)]) + + self.last_norm = norm_layer(embed_dim) if last_norm else nn.Identity() + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + + self._freeze_stages() + + def _freeze_stages(self): + """Freeze parameters.""" + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = self.blocks[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + if self.freeze_attn: + for i in range(0, self.depth): + m = self.blocks[i] + m.attn.eval() + m.norm1.eval() + for param in m.attn.parameters(): + param.requires_grad = False + for param in m.norm1.parameters(): + param.requires_grad = False + + if self.freeze_ffn: + self.pos_embed.requires_grad = False + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + for i in range(0, self.depth): + m = self.blocks[i] + m.mlp.eval() + m.norm2.eval() + for param in m.mlp.parameters(): + param.requires_grad = False + for param in m.norm2.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + super().init_weights(pretrained, patch_padding=self.patch_padding) + + if pretrained is None: + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x): + B, C, H, W = x.shape + x, (Hp, Wp) = self.patch_embed(x) + task_tokens = repeat(self.task_tokens, '() n d -> b n d', b=B) + if self.pos_embed is not None: + # fit for multiple GPU training + # since the first element for pos embed (sin-cos manner) is zero, it will cause no difference + x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] + + x = torch.cat((task_tokens, x), dim=1) + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + + x = self.last_norm(x) + + task_tokens = x[:, :self.task_tokens_num] # [N,J,C] + # task_tokens = torch.cat(task_tokens_, dim=-1) + xp = x[:, self.task_tokens_num:] # [N,Hp*Wp,C] + + xp = xp.permute(0, 2, 1).reshape(B, -1, Hp, Wp).contiguous() + + return xp, task_tokens + + def forward(self, x): + x = self.forward_features(x) + return x + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self._freeze_stages() \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/builder.py b/main/transformer_utils/mmpose/models/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..47f0a53121633fb6185a4d514c05a5862a9d74cf --- /dev/null +++ b/main/transformer_utils/mmpose/models/builder.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.cnn import build_model_from_cfg +from mmcv.utils import Registry, build_from_cfg + +MODELS = Registry( + 'models', build_func=build_model_from_cfg, parent=MMCV_MODELS) + +BACKBONES = MODELS +NECKS = MODELS +HEADS = MODELS +LOSSES = MODELS +POSENETS = MODELS +MESH_MODELS = MODELS +TRANSFORMER = Registry('Transformer') + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_posenet(cfg): + """Build posenet.""" + return POSENETS.build(cfg) + + +def build_mesh_model(cfg): + """Build mesh model.""" + return MESH_MODELS.build(cfg) + +def build_transformer(cfg, default_args=None): + """Builder for Transformer.""" + return build_from_cfg(cfg, TRANSFORMER, default_args) \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/detectors/__init__.py b/main/transformer_utils/mmpose/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..823cd5d52c2723c6a537765a7f083a444016e8f7 --- /dev/null +++ b/main/transformer_utils/mmpose/models/detectors/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
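The `builder.py` module above routes backbones, necks, heads, losses and posenets through a single shared mmcv `Registry`, so each component is constructed from a plain config dict with a `type` key. Below is a minimal, self-contained sketch of that pattern; `ToyHead` is a hypothetical class used only for illustration, and the sketch assumes an mmcv 1.x installation (which this repo already requires).

```python
# Stand-alone sketch of the Registry/build pattern used in builder.py above.
# 'ToyHead' is hypothetical and not part of this repository.
import torch
import torch.nn as nn
from mmcv.utils import Registry

HEADS = Registry('toy_head')


@HEADS.register_module()
class ToyHead(nn.Module):
    """Regress an (x, y) coordinate per joint from a pooled feature vector."""

    def __init__(self, in_channels, num_joints):
        super().__init__()
        self.fc = nn.Linear(in_channels, num_joints * 2)

    def forward(self, feat):
        return self.fc(feat).reshape(feat.size(0), -1, 2)


# A dict with a 'type' key is all .build() needs; the remaining keys become
# constructor kwargs. build_backbone/build_head/build_posenet work the same
# way against the shared MODELS registry.
head = HEADS.build(dict(type='ToyHead', in_channels=256, num_joints=17))
print(head(torch.randn(2, 256)).shape)  # torch.Size([2, 17, 2])
```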
+from .top_down import TopDown +from .poseur import Poseur + +__all__ = [ + 'TopDown', 'Poseur' +] diff --git a/main/transformer_utils/mmpose/models/detectors/base.py b/main/transformer_utils/mmpose/models/detectors/base.py new file mode 100644 index 0000000000000000000000000000000000000000..5d459b42de66012c88ff37d7d845265d06efebc7 --- /dev/null +++ b/main/transformer_utils/mmpose/models/detectors/base.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from collections import OrderedDict + +import torch +import torch.distributed as dist +import torch.nn as nn + + +class BasePose(nn.Module, metaclass=ABCMeta): + """Base class for pose detectors. + + All recognizers should subclass it. + All subclass should overwrite: + Methods:`forward_train`, supporting to forward when training. + Methods:`forward_test`, supporting to forward when testing. + + Args: + backbone (dict): Backbone modules to extract feature. + head (dict): Head modules to give output. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + """ + + @abstractmethod + def forward_train(self, img, img_metas, **kwargs): + """Defines the computation performed at training.""" + + @abstractmethod + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at testing.""" + + @abstractmethod + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Forward function.""" + + @staticmethod + def _parse_losses(losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \ + which may be a weighted sum of all losses, log_vars \ + contains all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, float): + log_vars[loss_name] = loss_value + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors or float') + + loss = sum(_value for _key, _value in log_vars.items() + if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if not isinstance(loss_value, float): + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + else: + log_vars[loss_name] = loss_value + + return loss, log_vars + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data_batch (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. 
+ + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self.forward(**data_batch) + + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(next(iter(data_batch.values())))) + + return outputs + + def val_step(self, data_batch, optimizer, **kwargs): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but used + during val epochs. Note that the evaluation after training epochs is + not implemented with this method, but an evaluation hook. + """ + results = self.forward(return_loss=False, **data_batch) + + outputs = dict(results=results) + + return outputs + + @abstractmethod + def show_result(self, **kwargs): + """Visualize the results.""" + raise NotImplementedError diff --git a/main/transformer_utils/mmpose/models/detectors/poseur.py b/main/transformer_utils/mmpose/models/detectors/poseur.py new file mode 100644 index 0000000000000000000000000000000000000000..b5c98ea95af4ee114e2dc731bf1b3e83489b8563 --- /dev/null +++ b/main/transformer_utils/mmpose/models/detectors/poseur.py @@ -0,0 +1,278 @@ +import warnings + +import mmcv +import numpy as np +from mmcv.image import imwrite +from mmcv.visualization.image import imshow + +from mmpose.core import imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose +import torch +from config import cfg + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + +from .top_down import TopDown + + +@POSENETS.register_module() +class Poseur(TopDown): + def __init__(self, *args, **kwargs): + if 'filp_fuse_type' in kwargs: + self.filp_fuse_type = kwargs.pop('filp_fuse_type') + else: + self.filp_fuse_type = 'default' + super().__init__(*args, **kwargs) + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.backbone.init_weights(pretrained) + if self.with_neck: + self.neck.init_weights() + if self.with_keypoint: + self.keypoint_head.init_weights() + + @auto_fp16(apply_to=('img',)) + def forward(self, + img, + coord_target=None, + coord_target_weight=None, + bbox_target=None, + bbox_target_weight=None, + hp_target=None, + hp_target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + coord_init=None, + query_init=None, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + batch_size: N + num_keypoints: K + num_img_channel: C (Default: 3) + img height: imgH + img weight: imgW + heatmaps height: H + heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (torch.Tensor[NxKxHxW]): Target heatmaps. 
+ target_weight (torch.Tensor[NxKx1]): Weights across + different joint types. + img_metas (list(dict)): Information about data augmentation + By default this includes: + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. + Otherwise, return predicted poses, boxes, image paths + and heatmaps. + """ + return self.forward_mesh_recovery(img, coord_init=coord_init, query_init=query_init, + **kwargs) + # if return_loss: + # return self.forward_train(img, + # coord_target, coord_target_weight, + # hp_target, hp_target_weight, img_metas, + # **kwargs) + # return self.forward_test( + # img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, coord_target, coord_target_weight, + hp_target, hp_target_weight, img_metas, **kwargs): + """ + :param img: + :param coord_target: [2, 17, 2] + :param coord_target_weight: [2, 17, 2] + :param hp_target: [2, 4, 17, 64, 48] + :param hp_target_weight: [2, 4, 17, 1] + :param img_metas: + :param kwargs: + :return: + """ + """Defines the computation performed at every call when training.""" + output = self.backbone(img) + img_feat = output[-1] + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + # output = self.keypoint_head(output, img_metas) + enc_output, dec_output = self.keypoint_head(output) + + return img_feat, enc_output, dec_output, None + + def seperate_sigma_from_score(self, score): + if score.shape[2] == 3: + sigma = score[:, :, [1, 2]] + score = score[:, :, [0]] + return score, sigma + elif score.shape[2] == 1: + return score, None + else: + raise + + def forward_mesh_recovery(self, output, coord_init=None, query_init=None, **kwargs): + """ + :param img: + :param coord_target: [2, 17, 2] + :param coord_target_weight: [2, 17, 2] + :param hp_target: [2, 4, 17, 64, 48] + :param hp_target_weight: [2, 4, 17, 1] + :param img_metas: + :param kwargs: + :return: + """ + """Defines the computation performed at every call when training.""" + # output = self.backbone(img) + img_feat = output[-1] + # print(len(output)) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + # output = self.keypoint_head(output, img_metas) + enc_output, dec_output = self.keypoint_head(output, coord_init=coord_init, query_init=query_init) + + return dec_output.feat[-1] + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + + features = self.backbone(img) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output_regression, output_regression_score = self.keypoint_head.inference_model( + features, flip_pairs=None) + output_regression_score, output_regression_sigma = self.seperate_sigma_from_score(output_regression_score) + + if self.test_cfg['flip_test']: + img_flipped = img.flip(3) + features_flipped = self.backbone(img_flipped) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_regression_flipped, 
output_regression_score_flipped = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_regression_score_flipped, output_regression_sigma_flipped = \ + self.seperate_sigma_from_score(output_regression_score_flipped) + if self.filp_fuse_type == 'default': + output_regression = (output_regression + + output_regression_flipped) * 0.5 + + output_regression_score = (output_regression_score + + output_regression_score_flipped) * 0.5 + elif self.filp_fuse_type == 'type1': + # output_regression = (output_regression * output_regression_score + output_regression_flipped * output_regression_score_flipped)\ + # /(output_regression_score + output_regression_score_flipped+1e-9) + output_regression, output_regression_flipped = \ + torch.from_numpy(output_regression), torch.from_numpy(output_regression_flipped) + + output_regression_score, output_regression_score_flipped = \ + torch.from_numpy(output_regression_score), torch.from_numpy(output_regression_score_flipped) + + output_regression = ( + output_regression * output_regression_score + output_regression_flipped * output_regression_score_flipped) \ + / (output_regression_score + output_regression_score_flipped + 1e-9) + + diff = 1 - (output_regression_score - output_regression_score_flipped).abs() + output_regression_score = (output_regression_score * output_regression_score_flipped * diff) ** 2 + + output_regression = output_regression.numpy() + output_regression_score = output_regression_score.numpy() + elif self.filp_fuse_type == 'type2': + # output_regression = (output_regression * output_regression_score + output_regression_flipped * output_regression_score_flipped)\ + # /(output_regression_score + output_regression_score_flipped+1e-9) + output_regression, output_regression_flipped = \ + torch.from_numpy(output_regression), torch.from_numpy(output_regression_flipped) + + output_regression_sigma, output_regression_sigma_flipped = \ + torch.from_numpy(output_regression_sigma), torch.from_numpy(output_regression_sigma_flipped) + + output_regression_p, output_regression_p_flipped = \ + self.get_p(output_regression_sigma), self.get_p(output_regression_sigma_flipped) + + p_to_coord_index = 5 + output_regression = ( + output_regression * output_regression_p ** p_to_coord_index + output_regression_flipped * output_regression_p_flipped ** p_to_coord_index) \ + / ( + output_regression_p ** p_to_coord_index + output_regression_p_flipped ** p_to_coord_index + 1e-10) + + output_regression_score = (output_regression_p + output_regression_p_flipped) * 0.5 + + output_regression = output_regression.numpy() + output_regression_score = output_regression_score.numpy() + else: + NotImplementedError + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode_keypoints( + img_metas, output_regression, output_regression_score, [img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result + + def get_p(self, output_regression_sigma, p_x=0.2): + output_regression_p = (1 - np.exp(-(p_x / output_regression_sigma))) + output_regression_p = output_regression_p[:, :, 0] * output_regression_p[:, :, 1] + output_regression_p = output_regression_p[:, :, None] + return output_regression_p * 0.7 + # 0.2 0.7 7421 + # 0.2 0.7 7610 + # 0.17 0.7 + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. 
+ + Returns: + Tensor: Output heatmaps. + """ + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + img_h, img_w = 256, 192 + img_metas = [{}] + img_metas[0]['batch_input_shape'] = (img_h, img_w) + img_metas[0]['img_shape'] = (img_h, img_w, 3) + # output = self.keypoint_head(output, img_metas) + output = self.keypoint_head(output) + return output diff --git a/main/transformer_utils/mmpose/models/detectors/top_down.py b/main/transformer_utils/mmpose/models/detectors/top_down.py new file mode 100644 index 0000000000000000000000000000000000000000..99215ec70b2381fbc01be6e448e30a09f83cda2b --- /dev/null +++ b/main/transformer_utils/mmpose/models/detectors/top_down.py @@ -0,0 +1,311 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import numpy as np +from mmcv.image import imwrite +from mmcv.utils.misc import deprecated_api_warning +from mmcv.visualization.image import imshow + +from mmpose.core import imshow_bboxes, imshow_keypoints +from .. import builder +from ..builder import POSENETS +from .base import BasePose + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class TopDown(BasePose): + """Top-down pose detectors. + + Args: + backbone (dict): Backbone modules to extract feature. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__() + self.fp16_enabled = False + + self.backbone = builder.build_backbone(backbone) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if neck is not None: + self.neck = builder.build_neck(neck) + + if keypoint_head is not None: + keypoint_head['train_cfg'] = train_cfg + keypoint_head['test_cfg'] = test_cfg + + if 'loss_keypoint' not in keypoint_head and loss_pose is not None: + warnings.warn( + '`loss_pose` for TopDown is deprecated, ' + 'use `loss_keypoint` for heads instead. See ' + 'https://github.com/open-mmlab/mmpose/pull/382' + ' for more information.', DeprecationWarning) + keypoint_head['loss_keypoint'] = loss_pose + + self.keypoint_head = builder.build_head(keypoint_head) + self.pretrained = pretrained + self.init_weights() + + @property + def with_neck(self): + """Check if has neck.""" + return hasattr(self, 'neck') + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + if pretrained is not None: + self.pretrained = pretrained + self.backbone.init_weights(self.pretrained) + if self.with_neck: + self.neck.init_weights() + if self.with_keypoint: + self.keypoint_head.init_weights() + + @auto_fp16(apply_to=('img', )) + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. 
+ When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): Weights across + different joint types. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. \ + Otherwise, return predicted poses, boxes, image paths \ + and heatmaps. + """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, img, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + losses.update(keypoint_losses) + keypoint_accuracy = self.keypoint_head.get_accuracy( + output, target, target_weight) + losses.update(keypoint_accuracy) + + return losses + + def forward_test(self, img, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + + features = self.backbone(img) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output_heatmap = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + img_flipped = img.flip(3) + features_flipped = self.backbone(img_flipped) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_flipped_heatmap = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_heatmap = (output_heatmap + output_flipped_heatmap) + if self.test_cfg.get('regression_flip_shift', False): + output_heatmap[..., 0] -= 1.0 / img_width + output_heatmap = output_heatmap / 2 + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode( + img_metas, output_heatmap, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Output heatmaps. 
+ """ + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + return output + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='TopDown') + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color='green', + pose_kpt_color=None, + pose_link_color=None, + text_color='white', + radius=4, + thickness=1, + font_scale=0.5, + bbox_thickness=1, + win_name='', + show=False, + show_keypoint_weight=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + skeleton (list[list]): The connection of keypoints. + skeleton is 0-based indexing. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M links. + If None, do not draw links. + text_color (str or tuple or :obj:`Color`): Color of texts. + radius (int): Radius of circles. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + show_keypoint_weight (bool): Whether to change the transparency + using the predicted confidence scores of keypoints. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. + """ + img = mmcv.imread(img) + img = img.copy() + + bbox_result = [] + bbox_labels = [] + pose_result = [] + for res in result: + if 'bbox' in res: + bbox_result.append(res['bbox']) + bbox_labels.append(res.get('label', None)) + pose_result.append(res['keypoints']) + + if bbox_result: + bboxes = np.vstack(bbox_result) + # draw bounding boxes + imshow_bboxes( + img, + bboxes, + labels=bbox_labels, + colors=bbox_color, + text_color=text_color, + thickness=bbox_thickness, + font_scale=font_scale, + show=False) + + if pose_result: + imshow_keypoints(img, pose_result, skeleton, kpt_score_thr, + pose_kpt_color, pose_link_color, radius, + thickness) + + if show: + imshow(img, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/main/transformer_utils/mmpose/models/heads/__init__.py b/main/transformer_utils/mmpose/models/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..80c1be50a9bb91eb9e0f97c6bbcca70cf1478a87 --- /dev/null +++ b/main/transformer_utils/mmpose/models/heads/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
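`TopDown.forward_test` above optionally performs flip-testing: the image is mirrored, the flipped prediction is mapped back (mirroring the width axis and swapping left/right joint channels), and the two outputs are averaged; with `regression_flip_shift` an extra shift of `1/img_width` is applied to the summed x-coordinates before halving. Below is a stand-alone sketch of the averaging step, with `model` and `flip_pairs` as placeholder inputs (the real code delegates the flip-back to `keypoint_head.inference_model`).

```python
# Stand-alone sketch of the flip-test averaging in TopDown.forward_test above.
# 'model' is any callable mapping images [N, 3, H, W] -> heatmaps [N, K, h, w];
# 'flip_pairs' lists mirrored joint indices, e.g. [(1, 2), (3, 4), ...].
import torch


def flip_test_average(model, img, flip_pairs):
    heatmaps = model(img)
    flipped = model(img.flip(3))      # predict on the horizontally mirrored image
    flipped = flipped.flip(3)         # mirror the prediction back
    for left, right in flip_pairs:    # swap left/right joint channels
        flipped[:, [left, right]] = flipped[:, [right, left]]
    return (heatmaps + flipped) * 0.5
```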
+from .topdown_heatmap_base_head import TopdownHeatmapBaseHead +from .topdown_heatmap_multi_stage_head import (TopdownHeatmapMSMUHead, + TopdownHeatmapMultiStageHead) +from .topdown_heatmap_simple_head import TopdownHeatmapSimpleHead +from .poseur_head import Poseur_noise_sample + +__all__ = [ + 'TopdownHeatmapSimpleHead', 'TopdownHeatmapMultiStageHead', + 'TopdownHeatmapMSMUHead', 'TopdownHeatmapBaseHead', +] diff --git a/main/transformer_utils/mmpose/models/heads/poseur_head.py b/main/transformer_utils/mmpose/models/heads/poseur_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d01232247db1d687144d8fff2a3b226dd66fdcf5 --- /dev/null +++ b/main/transformer_utils/mmpose/models/heads/poseur_head.py @@ -0,0 +1,759 @@ +import numpy as np +import torch +import torch.nn as nn +import copy +import math +import warnings +from mmcv.cnn import build_upsample_layer, Linear, bias_init_with_prob, constant_init, normal_init +import torch.nn.functional as F +from mmcv.cnn import normal_init + +from mmpose.core.evaluation import (keypoint_pck_accuracy, + keypoints_from_regression) +from mmpose.core.post_processing import fliplr_regression +from mmpose.models.builder import build_loss, HEADS, build_transformer +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.models.utils.transformer import inverse_sigmoid +from mmcv.cnn import Conv2d, build_activation_layer +from mmcv.cnn.bricks.transformer import Linear, FFN, build_positional_encoding +from mmcv.cnn import ConvModule +import torch.distributions as distributions +from .rle_regression_head import nets, nett, RealNVP, nets3d, nett3d +from easydict import EasyDict +from mmpose.models.losses.regression_loss import L1Loss +from mmpose.models.losses.rle_loss import RLELoss_poseur, RLEOHKMLoss +from config import cfg +from utils.human_models import smpl_x +from torch.distributions.utils import lazy_property + +from torch.distributions import MultivariateNormal + + +def fliplr_rle_regression(regression, + regression_score, + flip_pairs, + center_mode='static', + center_x=0.5, + center_index=0): + """Flip human joints horizontally. + + Note: + batch_size: N + num_keypoint: K + Args: + regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K + is the joint number and C is the dimension. Example shapes are: + - [N, K, C]: a batch of keypoints where N is the batch size. + - [N, T, K, C]: a batch of pose sequences, where T is the frame + number. + flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + center_mode (str): The mode to set the center location on the x-axis + to flip around. Options are: + - static: use a static x value (see center_x also) + - root: use a root joint (see center_index also) + center_x (float): Set the x-axis location of the flip center. Only used + when center_mode=static. + center_index (int): Set the index of the root joint, whose x location + will be used as the flip center. Only used when center_mode=root. + + Returns: + tuple: Flipped human joints. + + - regression_flipped (np.ndarray([..., K, C])): Flipped joints. 
+ """ + assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}' + + allowed_center_mode = {'static', 'root'} + assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \ + f'{center_mode}, allowed choices are {allowed_center_mode}' + + if center_mode == 'static': + x_c = center_x + elif center_mode == 'root': + assert regression.shape[-2] > center_index + x_c = regression[..., center_index:center_index + 1, 0] + + regression_flipped = regression.copy() + regression_score_flipped = regression_score.copy() + + # Swap left-right parts + for left, right in flip_pairs: + regression_flipped[..., left, :] = regression[..., right, :] + regression_flipped[..., right, :] = regression[..., left, :] + regression_score_flipped[..., left, :] = regression_score[..., right, :] + regression_score_flipped[..., right, :] = regression_score[..., left, :] + + # Flip horizontally + regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0] + return regression_flipped, regression_score_flipped + + +class Linear_with_norm(nn.Module): + def __init__(self, in_channel, out_channel, bias=True, norm=True): + super(Linear_with_norm, self).__init__() + self.bias = bias + self.norm = norm + self.linear = nn.Linear(in_channel, out_channel, bias) + nn.init.xavier_uniform_(self.linear.weight, gain=0.01) + + def forward(self, x): + y = x.matmul(self.linear.weight.t()) + + if self.norm: + x_norm = torch.norm(x, dim=-1, keepdim=True) + y = y / x_norm + + if self.bias: + y = y + self.linear.bias + return y + +def deepapply(obj, fn): + r"""Applies `fn` to all tensors referenced in `obj`""" + + if torch.is_tensor(obj): + obj = fn(obj) + elif isinstance(obj, dict): + for key, value in obj.items(): + obj[key] = deepapply(value, fn) + elif isinstance(obj, list): + for i, value in enumerate(obj): + obj[i] = deepapply(value, fn) + elif isinstance(obj, tuple): + obj = tuple( + deepapply(value, fn) + for value in obj + ) + elif hasattr(obj, '__dict__'): + deepapply(obj.__dict__, fn) + + return obj + + +__init__ = MultivariateNormal.__init__ + + +def init(self, *args, **kwargs): + __init__(self, *args, **kwargs) + + self.__class__ = type( + self.__class__.__name__, + (self.__class__, nn.Module), + {}, + ) + + nn.Module.__init__(self) + + +MultivariateNormal.__init__ = init +MultivariateNormal._apply = deepapply + + +@HEADS.register_module() +class Poseur_noise_sample(nn.Module): + """ + rle loss for transformer_utils + """ + + def __init__(self, + in_channels, + num_queries=17, + num_reg_fcs=2, + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True), + transformer=None, + with_box_refine=False, + as_two_stage=False, + heatmap_size=[64, 48], + num_joints=17, + loss_coord_enc=None, + loss_coord_dec=None, + loss_hp_keypoint=None, + use_heatmap_loss=True, + train_cfg=None, + test_cfg=None, + use_udp=False, + ): + super().__init__() + self.use_udp = use_udp + self.num_queries = num_queries + self.num_reg_fcs = num_reg_fcs + self.in_channels = in_channels + self.act_cfg = transformer.get('act_cfg', dict(type='ReLU', inplace=True)) + self.activate = build_activation_layer(self.act_cfg) + self.positional_encoding = build_positional_encoding(positional_encoding) + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + self.transformer = build_transformer(transformer) + self.embed_dims = self.transformer.embed_dims + assert 'num_feats' in positional_encoding + num_feats = 
positional_encoding['num_feats'] + assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ + f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ + f' and {num_feats}.' + + self.num_joints = num_joints + # self.num_joints = len(smpl_x.pos_joint_part['rhand']) + self.heatmap_size = heatmap_size + self.loss_coord_enc = build_loss(loss_coord_enc) + self.loss_coord_dec = build_loss(loss_coord_dec) + + self.use_dec_rle_loss = isinstance(self.loss_coord_dec, RLELoss_poseur) or isinstance(self.loss_coord_dec, + RLEOHKMLoss) + self.use_heatmap_loss = use_heatmap_loss + if self.use_heatmap_loss: + self.loss_hp = build_loss(loss_hp_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + + enc_prior = MultivariateNormal(torch.zeros(2), torch.eye(2)) + dec_prior = MultivariateNormal(torch.zeros(2), torch.eye(2)) + masks = torch.from_numpy(np.array([[0, 1], [1, 0]] * 3).astype(np.float32)) + + enc_prior3d = MultivariateNormal(torch.zeros(3), torch.eye(3)) + dec_prior3d = MultivariateNormal(torch.zeros(3), torch.eye(3)) + masks3d = torch.from_numpy(np.array([[0, 0, 1], [1, 1, 0]] * 3).astype(np.float32)) + + self.enc_flow2d = RealNVP(nets, nett, masks, enc_prior) + self.enc_flow3d = RealNVP(nets3d, nett3d, masks3d, enc_prior3d) + + if self.use_dec_rle_loss: + self.dec_flow2d = RealNVP(nets, nett, masks, dec_prior) + self.dec_flow3d = RealNVP(nets3d, nett3d, masks3d, dec_prior3d) + + self._init_layers() + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + + fc_coord_branch = [] + for _ in range(self.num_reg_fcs): + fc_coord_branch.append(Linear(self.embed_dims, self.embed_dims)) + fc_coord_branch.append(nn.ReLU()) + fc_coord_branch.append(Linear(self.embed_dims, 3)) + fc_coord_branch = nn.Sequential(*fc_coord_branch) + + if self.use_dec_rle_loss: + fc_sigma_branch = [] + for _ in range(self.num_reg_fcs): + fc_sigma_branch.append(Linear(self.embed_dims, self.embed_dims)) + fc_sigma_branch.append(Linear_with_norm(self.embed_dims, 3, norm=False)) + fc_sigma_branch = nn.Sequential(*fc_sigma_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + num_pred = self.transformer.decoder.num_layers + + if self.with_box_refine: + self.fc_coord_branches = _get_clones(fc_coord_branch, num_pred) + self.fc_coord_output_branches = _get_clones(fc_coord_branch, num_pred) + if self.use_dec_rle_loss: + self.fc_sigma_branches = _get_clones(fc_sigma_branch, num_pred) + else: + self.fc_coord_branches = nn.ModuleList( + [fc_coord_branch for _ in range(num_pred)]) + if isinstance(self.loss_coord_dec, RLELoss) or isinstance(self.loss_coord_dec, RLEOHKMLoss): + self.fc_sigma_branches = nn.ModuleList([fc_sigma_branch for _ in range(1)]) + + if self.as_two_stage: + self.query_embedding = None + else: + self.query_embedding = nn.Embedding(self.num_queries, + self.embed_dims * 2) + + if self.use_heatmap_loss: + from mmcv.cnn import build_upsample_layer + # simplebaseline style + num_layers = 3 + num_kernels = [4, 4, 4] + num_filters = [256, 256, 256] + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + if i == 0: + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.embed_dims, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + else: + 
layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=planes, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + self.deconv_layer = nn.Sequential(*layers) + self.final_layer = nn.Sequential( + ConvModule( + self.embed_dims, + self.num_joints, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=None, + act_cfg=None, + inplace=False) + ) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + + # for m in [self.fc_coord_branches, self.fc_sigma_branches]: + for m in [self.fc_coord_branches]: + for mm in m: + if isinstance(mm, nn.Linear): + nn.init.xavier_uniform_(mm.weight, gain=0.01) + + for m in [self.fc_coord_output_branches]: + for mm in m: + if isinstance(mm, nn.Linear): + nn.init.xavier_uniform_(mm.weight, gain=0.01) + + if self.use_heatmap_loss: + for _, m in self.deconv_layer.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + + def forward(self, mlvl_feats, coord_init=None, query_init=None): + + batch_size = mlvl_feats[0].size(0) + img_w, img_h = self.train_cfg['image_size'] + img_masks = mlvl_feats[0].new_ones( + (batch_size, img_h, img_w)) + for img_id in range(batch_size): + img_masks[img_id, :img_h, :img_w] = 0 + + mlvl_masks = [] + mlvl_positional_encodings = [] + for feat in mlvl_feats: + mlvl_masks.append(F.interpolate(img_masks[None], size=feat.shape[-2:]).to(torch.bool).squeeze(0)) + mlvl_positional_encodings.append( + self.positional_encoding(mlvl_masks[-1])) + + query_embeds = None + if not self.as_two_stage: + query_embeds = self.query_embedding.weight + + memory, spatial_shapes, level_start_index, hs, init_reference, inter_references, \ + enc_outputs = self.transformer( + mlvl_feats, + mlvl_masks, + query_embeds, + mlvl_positional_encodings, + reg_branches=self.fc_coord_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=None, # noqa:E501 + coord_init=coord_init, + query_init=query_init, + ) + hs = hs.permute(0, 2, 1, 3) + outputs_coords = [] + + dec_outputs = EasyDict(pred_jts=outputs_coords, feat=hs) + + return enc_outputs, dec_outputs + + def get_loss(self, enc_output, dec_output, coord_target, coord_target_weight, hp_target, hp_target_weight): + losses = dict() + if self.as_two_stage and enc_output is not None: + enc_rle_loss = self.get_enc_rle_loss(enc_output, coord_target, coord_target_weight) + losses.update(enc_rle_loss) + + dec_rle_loss = self.get_dec_rle_loss(dec_output, coord_target, coord_target_weight) + losses.update(dec_rle_loss) + + return losses + + def get_enc_rle_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. 
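+
+        The flow-based term is computed per joint as
+        log(sigma) - log_phi((pred_jts - gt_uvd) / sigma), using the 3D
+        RealNVP flow for joints with a 3D target and the 2D flow otherwise
+        (selected by gt_3d_mask below).
+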
+ Note: + batch_size: N + num_keypoints: K + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + + losses = dict() + assert not isinstance(self.loss_coord_enc, nn.Sequential) + assert target.dim() == 3 and target_weight.dim() == 3 + + BATCH_SIZE = output.sigma.size(0) + gt_uvd = target.reshape(output.pred_jts.shape) + gt_uvd_weight = target_weight.reshape(output.pred_jts.shape) + gt_3d_mask = gt_uvd_weight[:, :, 2].reshape(-1) + + assert output.pred_jts.shape == output.sigma.shape, (output.pred_jts.shape, output.sigma.shape) + bar_mu = (output.pred_jts - gt_uvd) / output.sigma + bar_mu = bar_mu.reshape(-1, 3) + bar_mu_3d = bar_mu[gt_3d_mask > 0] + bar_mu_2d = bar_mu[gt_3d_mask < 1][:, :2] + # (B, K, 3) + log_phi_3d = self.enc_flow3d.log_prob(bar_mu_3d) + log_phi_2d = self.enc_flow2d.log_prob(bar_mu_2d) + log_phi = torch.zeros_like(bar_mu[:, 0]) + # print(gt_3d_mask) + log_phi[gt_3d_mask > 0] = log_phi_3d + log_phi[gt_3d_mask < 1] = log_phi_2d + log_phi = log_phi.reshape(BATCH_SIZE, self.num_joints, 1) + + output.nf_loss = torch.log(output.sigma) - log_phi + losses['enc_rle_loss'] = self.loss_coord_enc(output, target, target_weight) + + return losses + + def get_enc_rle_loss_old(self, output, target, target_weight): + """Calculate top-down keypoint loss. + Note: + batch_size: N + num_keypoints: K + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + + losses = dict() + assert not isinstance(self.loss_coord_enc, nn.Sequential) + assert target.dim() == 3 and target_weight.dim() == 3 + + BATCH_SIZE = output.sigma.size(0) + gt_uv = target.reshape(output.pred_jts.shape) + bar_mu = (output.pred_jts - gt_uv) / output.sigma + # (B, K, 1) + log_phi = self.enc_flow.log_prob(bar_mu.reshape(-1, 2)).reshape(BATCH_SIZE, self.num_joints, 1) + output.nf_loss = torch.log(output.sigma) - log_phi + losses['enc_rle_loss'] = self.loss_coord_enc(output, target, target_weight) + + return losses + + def get_dec_rle_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + batch_size: N + num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. 
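+
+        Note that target and target_weight are repeated
+        (num_noise_sample + 1) times along the keypoint axis so that they
+        match the noise-augmented query groups of the decoder.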
+ """ + + losses = dict() + assert not isinstance(self.loss_coord_dec, nn.Sequential) + assert target.dim() == 3 and target_weight.dim() == 3 + target = target.repeat(1, self.transformer.num_noise_sample + 1, 1) + target_weight = target_weight.repeat(1, self.transformer.num_noise_sample + 1, 1) + + if self.with_box_refine: + if self.use_dec_rle_loss: + for i in range(len(output.pred_jts)): + pred_jts, sigma = output.pred_jts[i], output.sigma[i] + output_i = EasyDict( + pred_jts=pred_jts, + sigma=sigma + ) + BATCH_SIZE = output_i.sigma.size(0) + gt_uvd = target.reshape(output_i.pred_jts.shape) + gt_uvd_weight = target_weight.reshape(pred_jts.shape) + gt_3d_mask = gt_uvd_weight[:, :, 2].reshape(-1) + + assert pred_jts.shape == sigma.shape, (pred_jts.shape, sigma.shape) + bar_mu = (output_i.pred_jts - gt_uvd) / output_i.sigma + bar_mu = bar_mu.reshape(-1, 3) + bar_mu_3d = bar_mu[gt_3d_mask > 0] + bar_mu_2d = bar_mu[gt_3d_mask < 1][:, :2] + # (B, K, 3) + log_phi_3d = self.dec_flow3d.log_prob(bar_mu_3d) + log_phi_2d = self.dec_flow2d.log_prob(bar_mu_2d) + log_phi = torch.zeros_like(bar_mu[:, 0]) + log_phi[gt_3d_mask > 0] = log_phi_3d + log_phi[gt_3d_mask < 1] = log_phi_2d + log_phi = log_phi.reshape(BATCH_SIZE, self.num_joints * (self.transformer.num_noise_sample + 1), 1) + output_i.nf_loss = torch.log(output_i.sigma) - log_phi + losses['dec_rle_loss_{}'.format(i)] = self.loss_coord_dec(output_i, target, target_weight) + else: + for i, pred_jts in enumerate(output.pred_jts): + losses['dec_rle_loss_{}'.format(i)] = self.loss_coord_dec(pred_jts, target, target_weight) + else: + if self.use_dec_rle_loss: + BATCH_SIZE = output.sigma.size(0) + gt_uv = target.reshape(output.pred_jts.shape) + bar_mu = (output.pred_jts - gt_uv) / output.sigma + # (B, K, 1) + log_phi = self.dec_flow.log_prob(bar_mu.reshape(-1, 2)).reshape(BATCH_SIZE, self.num_joints, 1) + output.nf_loss = torch.log(output.sigma) - log_phi + losses['dec_rle_loss'] = self.loss_coord_dec(output, target, target_weight) * 0 + else: + losses['dec_rle_loss'] = self.loss_coord_dec(output.pred_jts, target + 0.5, target_weight) * 0 + + return losses + + def get_hp_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + batch_size: N + num_keypoints: K + heatmaps height: H + heatmaps weight: W + + Args: + output (torch.Tensor[NxKxHxW]): Output heatmaps. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): + Weights across different joint types. 
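+
+        When self.loss_hp is an nn.Sequential (multi-stage supervision),
+        5-dimensional heatmap targets of shape [N, O, K, H, W] are expected,
+        one slice per supervised output.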
+ """ + + losses = dict() + + if isinstance(self.loss_hp, nn.Sequential): + if not isinstance(output, dict): + assert len(self.loss_hp) == output.size(0) + assert target.dim() == 5 and target_weight.dim() == 4 + num_hp_layers = output.size(0) + for i in range(num_hp_layers): + target_i = target[:, i, :, :, :] + target_weight_i = target_weight[:, i, :, :] + losses['mse_loss_{}'.format(i)] = self.loss_hp[i](output[i], target_i, target_weight_i) + else: + out_hp_backbone = output['backbone'] + num_hp_layers = out_hp_backbone.size(0) + for i in range(num_hp_layers): + target_i = target[:, i, :, :, :] + target_weight_i = target_weight[:, i, :, :] + losses['mse_loss_backbone_{}'.format(i)] = self.loss_hp[i](out_hp_backbone[i], target_i, + target_weight_i) + + out_hp_enc = output['enc'] + for lvl in range(len(out_hp_enc)): + if lvl == 2 or lvl == 5: + # if lvl == 5: + for i in range(3): + target_i = target[:, i + 1, :, :, :] + target_weight_i = target_weight[:, i + 1, :, :] + # losses['reg_loss'] += self.loss(output[i], target, target_weight).sum() + if lvl == 2: + loss_weight = 0.1 + elif lvl == 5: + loss_weight = 1.0 + + losses['mse_loss_enc_layer{}_c{}'.format(lvl, i + 3)] = loss_weight * self.loss_hp[i + 1]( + out_hp_enc[lvl][i], target_i, target_weight_i) + else: + + assert target.dim() == 4 and target_weight.dim() == 3 + losses['mse_loss'] = self.loss_hp(output, target, target_weight) + + return losses + + def get_accuracy(self, enc_output, dec_output, coord_target, coord_target_weight, hp_target, hp_target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + batch_size: N + num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. 
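+
+        Returns:
+            dict: PCK accuracies ('enc_coord_acc' when two-stage encoder
+                outputs are available, 'dec_coord_acc', and 'hp_acc' when
+                the auxiliary heatmap loss is used).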
+ """ + + accuracy = dict() + # coord_output = output["coord"] + if self.as_two_stage and enc_output is not None: + coord_output = enc_output.pred_jts + N = coord_output.shape[0] + + _, avg_acc, cnt = keypoint_pck_accuracy( + coord_output.detach().cpu().numpy(), + coord_target.detach().cpu().numpy(), + coord_target_weight[:, :, 0].detach().cpu().numpy() > 0, + thr=0.05, + normalize=np.ones((N, 2), dtype=np.float32)) + accuracy['enc_coord_acc'] = avg_acc + + coord_output = dec_output.pred_jts + if coord_output.dim() == 4: + coord_output = coord_output[-1] + N = coord_output.shape[0] + + if not self.use_dec_rle_loss: + coord_target += 0.5 + # self.num_joints + _, avg_acc, cnt = keypoint_pck_accuracy( + coord_output[:, :self.num_joints].detach().cpu().numpy(), + coord_target.detach().cpu().numpy(), + coord_target_weight[:, :, 0].detach().cpu().numpy() > 0, + thr=0.05, + normalize=np.ones((N, 2), dtype=np.float32)) + accuracy['dec_coord_acc'] = avg_acc + + # if self.use_heatmap_loss and self.use_multi_stage_memory: + # assert hp_target.dim() == 5 and hp_target_weight.dim() == 4 + # _, avg_acc, _ = pose_pck_accuracy( + # hp_output_backbone[0].detach().cpu().numpy(), + # hp_target[:, 0, ...].detach().cpu().numpy(), + # hp_target_weight[:, 0, + # ...].detach().cpu().numpy().squeeze(-1) > 0) + # accuracy['hp_acc_backbone'] = float(avg_acc) + + # _, avg_acc, _ = pose_pck_accuracy( + # hp_output_enc[-1][0].detach().cpu().numpy(), + # hp_target[:, 1, ...].detach().cpu().numpy(), + # hp_target_weight[:, 1, + # ...].detach().cpu().numpy().squeeze(-1) > 0) + # accuracy['hp_acc_enc'] = float(avg_acc) + + # else: + if self.use_heatmap_loss: + hp_output = dec_output["hp"] + _, avg_acc, _ = pose_pck_accuracy( + hp_output.detach().cpu().numpy(), + hp_target.detach().cpu().numpy(), + hp_target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['hp_acc'] = float(avg_acc) + + return accuracy + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_regression (np.ndarray): Output regression. + + Args: + x (torch.Tensor[N, K, 2]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + output_enc, output_dec = self.forward(x) + output_regression, output_regression_score = output_dec.pred_jts.detach().cpu().numpy(), output_dec.maxvals.detach().cpu().numpy() + output_sigma = output_dec.sigma.detach().cpu().numpy() + output_sigma = output_sigma[-1] + output_regression_score = np.concatenate([output_regression_score, output_sigma], axis=2) + + if output_regression.ndim == 4: + output_regression = output_regression[-1] + + if flip_pairs is not None: + + output_regression, output_regression_score = fliplr_rle_regression( + output_regression, output_regression_score, flip_pairs) + + return output_regression, output_regression_score + + def decode_keypoints(self, img_metas, output_regression, output_regression_score, img_size): + """Decode keypoints from output regression. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output_regression (np.ndarray[N, K, 2]): model + predicted regression vector. + img_size (tuple(img_width, img_height)): model input image size. 
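+
+        Returns:
+            dict: Decoded results with keys 'preds', 'boxes', 'image_paths'
+                and 'bbox_ids'.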
+ """ + batch_size = len(img_metas) + + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['center'] + s[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + preds, maxvals = keypoints_from_regression(output_regression, c, s, + img_size) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + # all_preds[:, :, 2:3] = maxvals + all_preds[:, :, 2:3] = output_regression_score + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result diff --git a/main/transformer_utils/mmpose/models/heads/rle_regression_head.py b/main/transformer_utils/mmpose/models/heads/rle_regression_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b96a19155f6ec13f86e069d75d15ea4b70f133fa --- /dev/null +++ b/main/transformer_utils/mmpose/models/heads/rle_regression_head.py @@ -0,0 +1,426 @@ +import numpy as np +import torch.nn as nn +from mmcv.cnn import normal_init + +from mmpose.core.evaluation import (keypoint_pck_accuracy, + keypoints_from_regression) +from mmpose.core.post_processing import fliplr_regression +from mmpose.models.builder import HEADS, build_loss + +import torch +import torch.nn as nn +import torch.distributions as distributions +from easydict import EasyDict + +def rle_fliplr_regression(regression, + regression_score, + flip_pairs, + center_mode='static', + center_x=0.5, + center_index=0, + shift=True): + """Flip human joints horizontally. + + Note: + batch_size: N + num_keypoint: K + Args: + regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K + is the joint number and C is the dimension. Example shapes are: + - [N, K, C]: a batch of keypoints where N is the batch size. + - [N, T, K, C]: a batch of pose sequences, where T is the frame + number. + flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + center_mode (str): The mode to set the center location on the x-axis + to flip around. Options are: + - static: use a static x value (see center_x also) + - root: use a root joint (see center_index also) + center_x (float): Set the x-axis location of the flip center. Only used + when center_mode=static. + center_index (int): Set the index of the root joint, whose x location + will be used as the flip center. Only used when center_mode=root. + + Returns: + tuple: Flipped human joints. + + - regression_flipped (np.ndarray([..., K, C])): Flipped joints. 
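+        - regression_score_flipped (np.ndarray([..., K, C])): Flipped
+          keypoint scores, mirrored together with the coordinates.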
+ """ + assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}' + + # flip + # width_dim = 48 + # if shift: + # regression[:, :, 0] = - regression[:, :, 0] - 1 / (width_dim * 4) + # else: + # regression[:, :, 0] = -1 / width_dim - regression[:, :, 0] + + allowed_center_mode = {'static', 'root'} + assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \ + f'{center_mode}, allowed choices are {allowed_center_mode}' + + if center_mode == 'static': + x_c = center_x + elif center_mode == 'root': + assert regression.shape[-2] > center_index + x_c = regression[..., center_index:center_index + 1, 0] + + regression_flipped = regression.copy() + regression_score_flipped = regression_score.copy() + + # Swap left-right parts + for left, right in flip_pairs: + regression_flipped[..., left, :] = regression[..., right, :] + regression_flipped[..., right, :] = regression[..., left, :] + regression_score_flipped[..., left, :] = regression_score[..., right, :] + regression_score_flipped[..., right, :] = regression_score[..., left, :] + + # Flip horizontally + regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0] + return regression_flipped, regression_score_flipped + + +def nets(): + return nn.Sequential(nn.Linear(2, 64), nn.LeakyReLU(), nn.Linear(64, 64), nn.LeakyReLU(), nn.Linear(64, 2), nn.Tanh()) + +def nets3d(): + return nn.Sequential(nn.Linear(3, 64), nn.LeakyReLU(), nn.Linear(64, 64), nn.LeakyReLU(), nn.Linear(64, 3), nn.Tanh()) + # return nn.Sequential(nn.Linear(3, 256), nn.LeakyReLU(), nn.Linear(256, 2), nn.Tanh()) + +def nett(): + return nn.Sequential(nn.Linear(2, 64), nn.LeakyReLU(), nn.Linear(64, 64), nn.LeakyReLU(), nn.Linear(64, 2)) + +def nett3d(): + return nn.Sequential(nn.Linear(3, 64), nn.LeakyReLU(), nn.Linear(64, 64), nn.LeakyReLU(), nn.Linear(64, 3)) + # return nn.Sequential(nn.Linear(3, 256), nn.LeakyReLU(), nn.Linear(256, 2)) + + +class Linear(nn.Module): + def __init__(self, in_channel, out_channel, bias=True, norm=True): + super(Linear, self).__init__() + self.bias = bias + self.norm = norm + self.linear = nn.Linear(in_channel, out_channel, bias) + nn.init.xavier_uniform_(self.linear.weight, gain=0.01) + + def forward(self, x): + y = x.matmul(self.linear.weight.t()) + + if self.norm: + x_norm = torch.norm(x, dim=1, keepdim=True) + y = y / x_norm + + if self.bias: + y = y + self.linear.bias + return y + + +class RealNVP(nn.Module): + def __init__(self, nets, nett, mask, prior): + super(RealNVP, self).__init__() + + self.prior = prior + self.register_buffer('mask', mask) + self.t = torch.nn.ModuleList([nett() for _ in range(len(mask))]) + self.s = torch.nn.ModuleList([nets() for _ in range(len(mask))]) + + def _init(self): + for m in self.t: + for mm in m.modules(): + if isinstance(mm, nn.Linear): + nn.init.xavier_uniform_(mm.weight, gain=0.01) + for m in self.s: + for mm in m.modules(): + if isinstance(mm, nn.Linear): + nn.init.xavier_uniform_(mm.weight, gain=0.01) + + def forward_p(self, z): + x = z + for i in range(len(self.t)): + x_ = x * self.mask[i] + s = self.s[i](x_) * (1 - self.mask[i]) + t = self.t[i](x_) * (1 - self.mask[i]) + x = x_ + (1 - self.mask[i]) * (x * torch.exp(s) + t) + return x + + def backward_p(self, x): + log_det_J, z = x.new_zeros(x.shape[0]), x + for i in reversed(range(len(self.t))): + z_ = self.mask[i] * z + s = self.s[i](z_) * (1 - self.mask[i]) + t = self.t[i](z_) * (1 - self.mask[i]) + z = (1 - self.mask[i]) * (z - t) * torch.exp(-s) + z_ + log_det_J -= s.sum(dim=1) + return z, log_det_J + + def log_prob(self, 
x): + DEVICE = x.device + if self.prior.loc.device != DEVICE: + self.prior.loc = self.prior.loc.to(DEVICE) + self.prior.scale_tril = self.prior.scale_tril.to(DEVICE) + self.prior._unbroadcasted_scale_tril = self.prior._unbroadcasted_scale_tril.to(DEVICE) + self.prior.covariance_matrix = self.prior.covariance_matrix.to(DEVICE) + self.prior.precision_matrix = self.prior.precision_matrix.to(DEVICE) + + z, logp = self.backward_p(x) + return self.prior.log_prob(z) + logp + + def sample(self, batchSize): + z = self.prior.sample((batchSize, 1)) + x = self.forward_p(z) + return x + + def forward(self, x): + return self.log_prob(x) + + +@HEADS.register_module() +class RLERegressionHead(nn.Module): + """Deeppose regression head with fully connected layers. + + paper ref: Alexander Toshev and Christian Szegedy, + ``DeepPose: Human Pose Estimation via Deep Neural Networks.''. + + Args: + in_channels (int): Number of input channels + num_joints (int): Number of joints + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels, + num_joints, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.num_joints = num_joints + + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + + # self.fc = nn.Linear(self.in_channels, self.num_joints * 2) + # self.avg_pool = nn.AdaptiveAvgPool2d(1) + # self.fcs, out_channel = self._make_fc_layer() + + # self.fc_coord = Linear(self.in_channels, self.num_joints * 2) + # self.fc_sigma = Linear(self.in_channels, self.num_joints * 2, norm=False) + self.fc_coord = Linear(self.in_channels, self.num_joints * 3) + self.fc_sigma = Linear(self.in_channels, self.num_joints * 3, norm=False) + + self.fc_layers = [self.fc_coord, self.fc_sigma] + + self.share_flow = True + + prior = distributions.MultivariateNormal(torch.zeros(2), torch.eye(2)) + masks = torch.from_numpy(np.array([[0, 1], [1, 0]] * 3).astype(np.float32)) + + prior3d = distributions.MultivariateNormal(torch.zeros(3), torch.eye(3)) + masks3d = torch.from_numpy(np.array([[0, 0, 1], [1, 1, 0]] * 3).astype(np.float32)) + + self.flow2d = RealNVP(nets, nett, masks, prior) + self.flow3d = RealNVP(nets3d, nett3d, masks3d, prior3d) + + + # def _make_fc_layer(self): + # fc_layers = [] + # num_deconv = len(self.fc_dim) + # input_channel = self.feature_channel + # for i in range(num_deconv): + # if self.fc_dim[i] > 0: + # fc = nn.Linear(input_channel, self.fc_dim[i]) + # bn = nn.BatchNorm1d(self.fc_dim[i]) + # fc_layers.append(fc) + # fc_layers.append(bn) + # fc_layers.append(nn.ReLU(inplace=True)) + # input_channel = self.fc_dim[i] + # else: + # fc_layers.append(nn.Identity()) + # + # return nn.Sequential(*fc_layers), input_channel + + + def forward(self, x): + """Forward function.""" + # output = self.fc(x) + # N, C = output.shape + # return output.reshape([N, C // 2, 2]) + BATCH_SIZE = x.shape[0] + out_coord = self.fc_coord(x).reshape(BATCH_SIZE, self.num_joints, 3) + assert out_coord.shape[2] == 3 + + out_sigma = self.fc_sigma(x).reshape(BATCH_SIZE, self.num_joints, -1) + + # (B, N, 3) + pred_jts = out_coord.reshape(BATCH_SIZE, self.num_joints, 3) + sigma = out_sigma.reshape(BATCH_SIZE, self.num_joints, -1).sigmoid() + 1e-9 + scores = 1 - sigma + # (B, N, 1) + scores = torch.mean(scores, dim=2, keepdim=True) + + output = EasyDict( + pred_jts=pred_jts, + sigma=sigma, + maxvals=scores.float(), + ) + return output + + 
def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + batch_size: N + num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + + losses = dict() + assert not isinstance(self.loss, nn.Sequential) + assert target.dim() == 3 and target_weight.dim() == 3 + + BATCH_SIZE = output.sigma.size(0) + gt_uvd = target.reshape(output.pred_jts.shape) + bar_mu = (output.pred_jts - gt_uvd) / output.sigma + # (B, K, 1) + log_phi = self.flow.log_prob(bar_mu.reshape(-1, 2)).reshape(BATCH_SIZE, self.num_joints, 1) + output.nf_loss = torch.log(output.sigma) - log_phi + losses['reg_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + batch_size: N + num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output keypoints. + target (torch.Tensor[N, K, 2]): Target keypoints. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + + accuracy = dict() + + N = output.pred_jts.shape[0] + + _, avg_acc, cnt = keypoint_pck_accuracy( + output.pred_jts.detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight[:, :, 0].detach().cpu().numpy() > 0, + thr=0.05, + normalize=np.ones((N, 2), dtype=np.float32)) + accuracy['acc_pose'] = avg_acc + + return accuracy + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_regression (np.ndarray): Output regression. + + Args: + x (torch.Tensor[N, K, 2]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_regression, output_regression_score = rle_fliplr_regression( + output.pred_jts.detach().cpu().numpy(), output.maxvals.detach().cpu().numpy(), flip_pairs, center_x=0.0) + else: + output_regression = output.pred_jts.detach().cpu().numpy() + output_regression_score = output.maxvals.detach().cpu().numpy() + + output_regression += 0.5 + # output = EasyDict( + # preds=output_regression, + # maxvals=output_regression_score, + # ) + return output_regression + + def decode(self, img_metas, output, pixel_std=200.0, **kwargs): + """Decode the keypoints from output regression. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output (np.ndarray[N, K, 2]): predicted regression vector. + kwargs: dict contains 'img_size'. + img_size (tuple(img_width, img_height)): input image size. 
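+
+        Returns:
+            dict: Decoded pose results with keys 'preds', 'boxes',
+                'image_paths' and 'bbox_ids'.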
+ """ + batch_size = len(img_metas) + + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['center'] + s[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + preds, maxvals = keypoints_from_regression(output, c, s, kwargs['img_size'], pixel_std) + # maxvals = output.maxvals + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = maxvals + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * pixel_std, axis=1) + all_boxes[:, 5] = score + + result = {} + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result + + def init_weights(self): + for m in self.fc_layers: + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight, gain=0.01) + + + # for m in self.flow.t: + # for mm in m.modules(): + # if isinstance(mm, nn.Linear): + # nn.init.xavier_uniform_(mm.weight, gain=0.01) + + # for m in self.flow.s: + # for mm in m.modules(): + # if isinstance(mm, nn.Linear): + # nn.init.xavier_uniform_(mm.weight, gain=0.01) + # normal_init(self.fc, mean=0, std=0.01, bias=0) diff --git a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_base_head.py b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_base_head.py new file mode 100644 index 0000000000000000000000000000000000000000..09646ead353fb054f066b9fc6816748a43287e2c --- /dev/null +++ b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_base_head.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import numpy as np +import torch.nn as nn + +from mmpose.core.evaluation.top_down_eval import keypoints_from_heatmaps + + +class TopdownHeatmapBaseHead(nn.Module): + """Base class for top-down heatmap heads. + + All top-down heatmap heads should subclass it. + All subclass should overwrite: + + Methods:`get_loss`, supporting to calculate loss. + Methods:`get_accuracy`, supporting to calculate accuracy. + Methods:`forward`, supporting to forward model. + Methods:`inference_model`, supporting to inference model. + """ + + __metaclass__ = ABCMeta + + @abstractmethod + def get_loss(self, **kwargs): + """Gets the loss.""" + + @abstractmethod + def get_accuracy(self, **kwargs): + """Gets the accuracy.""" + + @abstractmethod + def forward(self, **kwargs): + """Forward function.""" + + @abstractmethod + def inference_model(self, **kwargs): + """Inference function.""" + + def decode(self, img_metas, output, **kwargs): + """Decode keypoints from heatmaps. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output (np.ndarray[N, K, H, W]): model predicted heatmaps. 
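+
+        Returns:
+            dict: Decoded keypoint results with keys 'preds', 'boxes',
+                'image_paths' and 'bbox_ids'.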
+ """ + batch_size = len(img_metas) + + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['center'] + s[i, :] = img_metas[i]['scale'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + preds, maxvals = keypoints_from_heatmaps( + output, + c, + s, + unbiased=self.test_cfg.get('unbiased_decoding', False), + post_process=self.test_cfg.get('post_process', 'default'), + kernel=self.test_cfg.get('modulate_kernel', 11), + valid_radius_factor=self.test_cfg.get('valid_radius_factor', + 0.0546875), + use_udp=self.test_cfg.get('use_udp', False), + target_type=self.test_cfg.get('target_type', 'GaussianHeatmap')) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = maxvals + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding diff --git a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c439f5b6332d72a66db75bf599035411c4e1e0d1 --- /dev/null +++ b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_multi_stage_head.py @@ -0,0 +1,572 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy as cp + +import torch.nn as nn +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, Linear, + build_activation_layer, build_conv_layer, + build_norm_layer, build_upsample_layer, constant_init, + kaiming_init, normal_init) + +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from ..builder import HEADS +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead + + +@HEADS.register_module() +class TopdownHeatmapMultiStageHead(TopdownHeatmapBaseHead): + """Top-down heatmap multi-stage head. + + TopdownHeatmapMultiStageHead is consisted of multiple branches, + each of which has num_deconv_layers(>=0) number of deconv layers + and a simple conv2d layer. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_stages (int): Number of stages. + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. 
+ If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + loss_keypoint (dict): Config for keypoint loss. Default: None. + """ + + def __init__(self, + in_channels=512, + out_channels=17, + num_stages=1, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.num_stages = num_stages + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + # build multi-stage deconv layers + self.multi_deconv_layers = nn.ModuleList([]) + for _ in range(self.num_stages): + if num_deconv_layers > 0: + deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + self.multi_deconv_layers.append(deconv_layers) + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + # build multi-stage final layers + self.multi_final_layers = nn.ModuleList([]) + for i in range(self.num_stages): + if identity_final_layer: + final_layer = nn.Identity() + else: + final_layer = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=num_deconv_filters[-1] + if num_deconv_layers > 0 else in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding) + self.multi_final_layers.append(final_layer) + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): + Output heatmaps. + target (torch.Tensor[N,K,H,W]): + Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert isinstance(output, list) + assert target.dim() == 4 and target_weight.dim() == 3 + + if isinstance(self.loss, nn.Sequential): + assert len(self.loss) == len(output) + for i in range(len(output)): + target_i = target + target_weight_i = target_weight + if isinstance(self.loss, nn.Sequential): + loss_func = self.loss[i] + else: + loss_func = self.loss + loss_i = loss_func(output[i], target_i, target_weight_i) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = loss_i + else: + losses['heatmap_loss'] += loss_i + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. 
+ """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + _, avg_acc, _ = pose_pck_accuracy( + output[-1].detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function. + + Returns: + out (list[Tensor]): a list of heatmaps from multiple stages. + """ + out = [] + assert isinstance(x, list) + for i in range(self.num_stages): + y = self.multi_deconv_layers[i](x[i]) + y = self.multi_final_layers[i](y) + out.append(y) + return out + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (List[torch.Tensor[NxKxHxW]]): Input features. + flip_pairs (None | list[tuple()): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + assert isinstance(output, list) + output = output[-1] + + if flip_pairs is not None: + # perform flip + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + + return output_heatmap + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.multi_deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.multi_final_layers.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + + +class PredictHeatmap(nn.Module): + """Predict the heat map for an input feature. + + Args: + unit_channels (int): Number of input channels. + out_channels (int): Number of output channels. + out_shape (tuple): Shape of the output heatmap. + use_prm (bool): Whether to use pose refine machine. Default: False. + norm_cfg (dict): dictionary to construct and config norm layer. 
+ Default: dict(type='BN') + """ + + def __init__(self, + unit_channels, + out_channels, + out_shape, + use_prm=False, + norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.unit_channels = unit_channels + self.out_channels = out_channels + self.out_shape = out_shape + self.use_prm = use_prm + if use_prm: + self.prm = PRM(out_channels, norm_cfg=norm_cfg) + self.conv_layers = nn.Sequential( + ConvModule( + unit_channels, + unit_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=norm_cfg, + inplace=False), + ConvModule( + unit_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=None, + inplace=False)) + + def forward(self, feature): + feature = self.conv_layers(feature) + output = nn.functional.interpolate( + feature, size=self.out_shape, mode='bilinear', align_corners=True) + if self.use_prm: + output = self.prm(output) + return output + + +class PRM(nn.Module): + """Pose Refine Machine. + + Please refer to "Learning Delicate Local Representations + for Multi-Person Pose Estimation" (ECCV 2020). + + Args: + out_channels (int): Channel number of the output. Equals to + the number of key points. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + """ + + def __init__(self, out_channels, norm_cfg=dict(type='BN')): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + self.out_channels = out_channels + self.global_pooling = nn.AdaptiveAvgPool2d((1, 1)) + self.middle_path = nn.Sequential( + Linear(self.out_channels, self.out_channels), + build_norm_layer(dict(type='BN1d'), out_channels)[1], + build_activation_layer(dict(type='ReLU')), + Linear(self.out_channels, self.out_channels), + build_norm_layer(dict(type='BN1d'), out_channels)[1], + build_activation_layer(dict(type='ReLU')), + build_activation_layer(dict(type='Sigmoid'))) + + self.bottom_path = nn.Sequential( + ConvModule( + self.out_channels, + self.out_channels, + kernel_size=1, + stride=1, + padding=0, + norm_cfg=norm_cfg, + inplace=False), + DepthwiseSeparableConvModule( + self.out_channels, + 1, + kernel_size=9, + stride=1, + padding=4, + norm_cfg=norm_cfg, + inplace=False), build_activation_layer(dict(type='Sigmoid'))) + self.conv_bn_relu_prm_1 = ConvModule( + self.out_channels, + self.out_channels, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + inplace=False) + + def forward(self, x): + out = self.conv_bn_relu_prm_1(x) + out_1 = out + + out_2 = self.global_pooling(out_1) + out_2 = out_2.view(out_2.size(0), -1) + out_2 = self.middle_path(out_2) + out_2 = out_2.unsqueeze(2) + out_2 = out_2.unsqueeze(3) + + out_3 = self.bottom_path(out_1) + out = out_1 * (1 + out_2 * out_3) + + return out + + +@HEADS.register_module() +class TopdownHeatmapMSMUHead(TopdownHeatmapBaseHead): + """Heads for multi-stage multi-unit heads used in Multi-Stage Pose + estimation Network (MSPN), and Residual Steps Networks (RSN). + + Args: + unit_channels (int): Number of input channels. + out_channels (int): Number of output channels. + out_shape (tuple): Shape of the output heatmap. + num_stages (int): Number of stages. + num_units (int): Number of units in each stage. + use_prm (bool): Whether to use pose refine machine (PRM). + Default: False. + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + loss_keypoint (dict): Config for keypoint loss. Default: None. 
+ """ + + def __init__(self, + out_shape, + unit_channels=256, + out_channels=17, + num_stages=4, + num_units=4, + use_prm=False, + norm_cfg=dict(type='BN'), + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + # Protect mutable default arguments + norm_cfg = cp.deepcopy(norm_cfg) + super().__init__() + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + self.out_shape = out_shape + self.unit_channels = unit_channels + self.out_channels = out_channels + self.num_stages = num_stages + self.num_units = num_units + + self.loss = build_loss(loss_keypoint) + + self.predict_layers = nn.ModuleList([]) + for i in range(self.num_stages): + for j in range(self.num_units): + self.predict_layers.append( + PredictHeatmap( + unit_channels, + out_channels, + out_shape, + use_prm, + norm_cfg=norm_cfg)) + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - num_outputs: O + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,O,K,H,W]): Output heatmaps. + target (torch.Tensor[N,O,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,O,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert isinstance(output, list) + assert target.dim() == 5 and target_weight.dim() == 4 + assert target.size(1) == len(output) + + if isinstance(self.loss, nn.Sequential): + assert len(self.loss) == len(output) + for i in range(len(output)): + target_i = target[:, i, :, :, :] + target_weight_i = target_weight[:, i, :, :] + + if isinstance(self.loss, nn.Sequential): + loss_func = self.loss[i] + else: + loss_func = self.loss + + loss_i = loss_func(output[i], target_i, target_weight_i) + if 'heatmap_loss' not in losses: + losses['heatmap_loss'] = loss_i + else: + losses['heatmap_loss'] += loss_i + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + assert isinstance(output, list) + assert target.dim() == 5 and target_weight.dim() == 4 + _, avg_acc, _ = pose_pck_accuracy( + output[-1].detach().cpu().numpy(), + target[:, -1, ...].detach().cpu().numpy(), + target_weight[:, -1, + ...].detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function. + + Returns: + out (list[Tensor]): a list of heatmaps from multiple stages + and units. + """ + out = [] + assert isinstance(x, list) + assert len(x) == self.num_stages + assert isinstance(x[0], list) + assert len(x[0]) == self.num_units + assert x[0][0].shape[1] == self.unit_channels + for i in range(self.num_stages): + for j in range(self.num_units): + y = self.predict_layers[i * self.num_units + j](x[i][j]) + out.append(y) + + return out + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (list[torch.Tensor[N,K,H,W]]): Input features. 
+ flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + assert isinstance(output, list) + output = output[-1] + if flip_pairs is not None: + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + return output_heatmap + + def init_weights(self): + """Initialize model weights.""" + for m in self.predict_layers.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) diff --git a/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5ddc058d5634a5c63970a1efb8eaa66b158da1ec --- /dev/null +++ b/main/transformer_utils/mmpose/models/heads/topdown_heatmap_simple_head.py @@ -0,0 +1,339 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer, + constant_init, normal_init) + +from mmpose.core.evaluation import pose_pck_accuracy +from mmpose.core.post_processing import flip_back +from mmpose.models.builder import build_loss +from mmpose.models.utils.ops import resize +from ..builder import HEADS +from .topdown_heatmap_base_head import TopdownHeatmapBaseHead + + +@HEADS.register_module() +class TopdownHeatmapSimpleHead(TopdownHeatmapBaseHead): + """Top-down heatmap simple head. paper ref: Bin Xiao et al. ``Simple + Baselines for Human Pose Estimation and Tracking``. + + TopdownHeatmapSimpleHead is consisted of (>=0) number of deconv layers + and a simple conv2d layer. + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means + no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + If num_deconv_layers > 0, the length of + num_deconv_kernels (list|tuple): Kernel sizes. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resized to the + same size as the first one and then concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_keypoint (dict): Config for keypoint loss. Default: None. 
+ """ + + def __init__(self, + in_channels, + out_channels, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None, + in_index=0, + input_transform=None, + align_corners=False, + loss_keypoint=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.in_channels = in_channels + self.loss = build_loss(loss_keypoint) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatmap') + + self._init_inputs(in_channels, in_index, input_transform) + self.in_index = in_index + self.align_corners = align_corners + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0, 1, 3] + if extra['final_conv_kernel'] == 3: + padding = 1 + elif extra['final_conv_kernel'] == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_final_layer = True + kernel_size = extra['final_conv_kernel'] + else: + kernel_size = 1 + padding = 0 + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + conv_channels = num_deconv_filters[ + -1] if num_deconv_layers > 0 else self.in_channels + + layers = [] + if extra is not None: + num_conv_layers = extra.get('num_conv_layers', 0) + num_conv_kernels = extra.get('num_conv_kernels', + [1] * num_conv_layers) + + for i in range(num_conv_layers): + layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=conv_channels, + kernel_size=num_conv_kernels[i], + stride=1, + padding=(num_conv_kernels[i] - 1) // 2)) + layers.append( + build_norm_layer(dict(type='BN'), conv_channels)[1]) + layers.append(nn.ReLU(inplace=True)) + + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=conv_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding)) + + if len(layers) > 1: + self.final_layer = nn.Sequential(*layers) + else: + self.final_layer = layers[0] + + def get_loss(self, output, target, target_weight): + """Calculate top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. + """ + + losses = dict() + + assert not isinstance(self.loss, nn.Sequential) + assert target.dim() == 4 and target_weight.dim() == 3 + losses['heatmap_loss'] = self.loss(output, target, target_weight) + + return losses + + def get_accuracy(self, output, target, target_weight): + """Calculate accuracy for top-down keypoint loss. + + Note: + - batch_size: N + - num_keypoints: K + - heatmaps height: H + - heatmaps weight: W + + Args: + output (torch.Tensor[N,K,H,W]): Output heatmaps. + target (torch.Tensor[N,K,H,W]): Target heatmaps. + target_weight (torch.Tensor[N,K,1]): + Weights across different joint types. 
+ """ + + accuracy = dict() + + if self.target_type == 'GaussianHeatmap': + _, avg_acc, _ = pose_pck_accuracy( + output.detach().cpu().numpy(), + target.detach().cpu().numpy(), + target_weight.detach().cpu().numpy().squeeze(-1) > 0) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def forward(self, x): + """Forward function.""" + x = self._transform_inputs(x) + x = self.deconv_layers(x) + x = self.final_layer(x) + return x + + def inference_model(self, x, flip_pairs=None): + """Inference function. + + Returns: + output_heatmap (np.ndarray): Output heatmaps. + + Args: + x (torch.Tensor[N,K,H,W]): Input features. + flip_pairs (None | list[tuple]): + Pairs of keypoints which are mirrored. + """ + output = self.forward(x) + + if flip_pairs is not None: + output_heatmap = flip_back( + output.detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + else: + output_heatmap = output.detach().cpu().numpy() + return output_heatmap + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform is not None, in_channels and in_index must be + list or tuple, with the same length. + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + + - 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + - None: Only one select feature map is allowed. + """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. 
+ + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) diff --git a/main/transformer_utils/mmpose/models/losses/__init__.py b/main/transformer_utils/mmpose/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6502f7b19e8ab71cbdca028cd8b14bffde24cf20 --- /dev/null +++ b/main/transformer_utils/mmpose/models/losses/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .classfication_loss import BCELoss +from .heatmap_loss import AdaptiveWingLoss +from .mesh_loss import GANLoss, MeshLoss +from .mse_loss import JointsMSELoss, JointsOHKMMSELoss +from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory +from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, RLELoss, + SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss, + WingLoss) +from .rle_loss import RLELoss_poseur + +__all__ = [ + 'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss', + 'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss', + 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss', + 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss' +] diff --git a/main/transformer_utils/mmpose/models/losses/classfication_loss.py b/main/transformer_utils/mmpose/models/losses/classfication_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b79b69d035611f75f10e8722aaea4362659509e2 --- /dev/null +++ b/main/transformer_utils/mmpose/models/losses/classfication_loss.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +@LOSSES.register_module() +class BCELoss(nn.Module): + """Binary Cross Entropy loss.""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.binary_cross_entropy + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_labels: K + + Args: + output (torch.Tensor[N, K]): Output classification. + target (torch.Tensor[N, K]): Target classification. + target_weight (torch.Tensor[N, K] or torch.Tensor[N]): + Weights across different labels. + """ + + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output, target, reduction='none') + if target_weight.dim() == 1: + target_weight = target_weight[:, None] + loss = (loss * target_weight).mean() + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight diff --git a/main/transformer_utils/mmpose/models/losses/heatmap_loss.py b/main/transformer_utils/mmpose/models/losses/heatmap_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9471457ca0da2d43441da1d394bc45b3e8ca3ee7 --- /dev/null +++ b/main/transformer_utils/mmpose/models/losses/heatmap_loss.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +@LOSSES.register_module() +class AdaptiveWingLoss(nn.Module): + """Adaptive wing loss. paper ref: 'Adaptive Wing Loss for Robust Face + Alignment via Heatmap Regression' Wang et al. ICCV'2019. + + Args: + alpha (float), omega (float), epsilon (float), theta (float) + are hyper-parameters. + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + alpha=2.1, + omega=14, + epsilon=1, + theta=0.5, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.alpha = float(alpha) + self.omega = float(omega) + self.epsilon = float(epsilon) + self.theta = float(theta) + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def criterion(self, pred, target): + """Criterion of wingloss. + + Note: + batch_size: N + num_keypoints: K + + Args: + pred (torch.Tensor[NxKxHxW]): Predicted heatmaps. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + """ + H, W = pred.shape[2:4] + delta = (target - pred).abs() + + A = self.omega * ( + 1 / (1 + torch.pow(self.theta / self.epsilon, self.alpha - target)) + ) * (self.alpha - target) * (torch.pow( + self.theta / self.epsilon, + self.alpha - target - 1)) * (1 / self.epsilon) + C = self.theta * A - self.omega * torch.log( + 1 + torch.pow(self.theta / self.epsilon, self.alpha - target)) + + losses = torch.where( + delta < self.theta, + self.omega * + torch.log(1 + + torch.pow(delta / self.epsilon, self.alpha - target)), + A * delta - C) + + return torch.mean(losses) + + def forward(self, output, target, target_weight): + """Forward function. + + Note: + batch_size: N + num_keypoints: K + + Args: + output (torch.Tensor[NxKxHxW]): Output heatmaps. + target (torch.Tensor[NxKxHxW]): Target heatmaps. + target_weight (torch.Tensor[NxKx1]): + Weights across different joint types. 
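+
+ Example (a minimal sketch; the heatmap shapes below are assumed
+ purely for illustration)::
+
+ >>> import torch
+ >>> criterion = AdaptiveWingLoss(use_target_weight=True)
+ >>> output = torch.rand(2, 17, 64, 48)
+ >>> target = torch.rand(2, 17, 64, 48)
+ >>> target_weight = torch.ones(2, 17, 1)
+ >>> loss = criterion(output, target, target_weight)
+ >>> loss.dim()  # a scalar averaged over all elements
+ 0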
+ """ + if self.use_target_weight: + loss = self.criterion(output * target_weight.unsqueeze(-1), + target * target_weight.unsqueeze(-1)) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight diff --git a/main/transformer_utils/mmpose/models/losses/mesh_loss.py b/main/transformer_utils/mmpose/models/losses/mesh_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..f9d18bd7296a189ec2f24c422cc05a19035d3224 --- /dev/null +++ b/main/transformer_utils/mmpose/models/losses/mesh_loss.py @@ -0,0 +1,340 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import LOSSES +from ..utils.geometry import batch_rodrigues + + +def perspective_projection(points, rotation, translation, focal_length, + camera_center): + """This function computes the perspective projection of a set of 3D points. + + Note: + - batch size: B + - point number: N + + Args: + points (Tensor([B, N, 3])): A set of 3D points + rotation (Tensor([B, 3, 3])): Camera rotation matrix + translation (Tensor([B, 3])): Camera translation + focal_length (Tensor([B,])): Focal length + camera_center (Tensor([B, 2])): Camera center + + Returns: + projected_points (Tensor([B, N, 2])): Projected 2D + points in image space. + """ + + batch_size = points.shape[0] + K = torch.zeros([batch_size, 3, 3], device=points.device) + K[:, 0, 0] = focal_length + K[:, 1, 1] = focal_length + K[:, 2, 2] = 1. + K[:, :-1, -1] = camera_center + + # Transform points + points = torch.einsum('bij,bkj->bki', rotation, points) + points = points + translation.unsqueeze(1) + + # Apply perspective distortion + projected_points = points / points[:, :, -1].unsqueeze(-1) + + # Apply camera intrinsics + projected_points = torch.einsum('bij,bkj->bki', K, projected_points) + projected_points = projected_points[:, :, :-1] + return projected_points + + +@LOSSES.register_module() +class MeshLoss(nn.Module): + """Mix loss for 3D human mesh. It is composed of loss on 2D joints, 3D + joints, mesh vertices and smpl parameters (if any). + + Args: + joints_2d_loss_weight (float): Weight for loss on 2D joints. + joints_3d_loss_weight (float): Weight for loss on 3D joints. + vertex_loss_weight (float): Weight for loss on 3D verteices. + smpl_pose_loss_weight (float): Weight for loss on SMPL + pose parameters. + smpl_beta_loss_weight (float): Weight for loss on SMPL + shape parameters. + img_res (int): Input image resolution. + focal_length (float): Focal length of camera model. Default=5000. + """ + + def __init__(self, + joints_2d_loss_weight, + joints_3d_loss_weight, + vertex_loss_weight, + smpl_pose_loss_weight, + smpl_beta_loss_weight, + img_res, + focal_length=5000): + + super().__init__() + # Per-vertex loss on the mesh + self.criterion_vertex = nn.L1Loss(reduction='none') + + # Joints (2D and 3D) loss + self.criterion_joints_2d = nn.SmoothL1Loss(reduction='none') + self.criterion_joints_3d = nn.SmoothL1Loss(reduction='none') + + # Loss for SMPL parameter regression + self.criterion_regr = nn.MSELoss(reduction='none') + + self.joints_2d_loss_weight = joints_2d_loss_weight + self.joints_3d_loss_weight = joints_3d_loss_weight + self.vertex_loss_weight = vertex_loss_weight + self.smpl_pose_loss_weight = smpl_pose_loss_weight + self.smpl_beta_loss_weight = smpl_beta_loss_weight + self.focal_length = focal_length + self.img_res = img_res + + def joints_2d_loss(self, pred_joints_2d, gt_joints_2d, joints_2d_visible): + """Compute 2D reprojection loss on the joints. 
+ + The loss is weighted by joints_2d_visible. + """ + conf = joints_2d_visible.float() + loss = (conf * + self.criterion_joints_2d(pred_joints_2d, gt_joints_2d)).mean() + return loss + + def joints_3d_loss(self, pred_joints_3d, gt_joints_3d, joints_3d_visible): + """Compute 3D joints loss for the examples that 3D joint annotations + are available. + + The loss is weighted by joints_3d_visible. + """ + conf = joints_3d_visible.float() + if len(gt_joints_3d) > 0: + gt_pelvis = (gt_joints_3d[:, 2, :] + gt_joints_3d[:, 3, :]) / 2 + gt_joints_3d = gt_joints_3d - gt_pelvis[:, None, :] + pred_pelvis = (pred_joints_3d[:, 2, :] + + pred_joints_3d[:, 3, :]) / 2 + pred_joints_3d = pred_joints_3d - pred_pelvis[:, None, :] + return ( + conf * + self.criterion_joints_3d(pred_joints_3d, gt_joints_3d)).mean() + return pred_joints_3d.sum() * 0 + + def vertex_loss(self, pred_vertices, gt_vertices, has_smpl): + """Compute 3D vertex loss for the examples that 3D human mesh + annotations are available. + + The loss is weighted by the has_smpl. + """ + conf = has_smpl.float() + loss_vertex = self.criterion_vertex(pred_vertices, gt_vertices) + loss_vertex = (conf[:, None, None] * loss_vertex).mean() + return loss_vertex + + def smpl_losses(self, pred_rotmat, pred_betas, gt_pose, gt_betas, + has_smpl): + """Compute SMPL parameters loss for the examples that SMPL parameter + annotations are available. + + The loss is weighted by has_smpl. + """ + conf = has_smpl.float() + gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3) + loss_regr_pose = self.criterion_regr(pred_rotmat, gt_rotmat) + loss_regr_betas = self.criterion_regr(pred_betas, gt_betas) + loss_regr_pose = (conf[:, None, None, None] * loss_regr_pose).mean() + loss_regr_betas = (conf[:, None] * loss_regr_betas).mean() + return loss_regr_pose, loss_regr_betas + + def project_points(self, points_3d, camera): + """Perform orthographic projection of 3D points using the camera + parameters, return projected 2D points in image plane. + + Note: + - batch size: B + - point number: N + + Args: + points_3d (Tensor([B, N, 3])): 3D points. + camera (Tensor([B, 3])): camera parameters with the + 3 channel as (scale, translation_x, translation_y) + + Returns: + Tensor([B, N, 2]): projected 2D points \ + in image space. + """ + batch_size = points_3d.shape[0] + device = points_3d.device + cam_t = torch.stack([ + camera[:, 1], camera[:, 2], 2 * self.focal_length / + (self.img_res * camera[:, 0] + 1e-9) + ], + dim=-1) + camera_center = camera.new_zeros([batch_size, 2]) + rot_t = torch.eye( + 3, device=device, + dtype=points_3d.dtype).unsqueeze(0).expand(batch_size, -1, -1) + joints_2d = perspective_projection( + points_3d, + rotation=rot_t, + translation=cam_t, + focal_length=self.focal_length, + camera_center=camera_center) + return joints_2d + + def forward(self, output, target): + """Forward function. + + Args: + output (dict): dict of network predicted results. + Keys: 'vertices', 'joints_3d', 'camera', + 'pose'(optional), 'beta'(optional) + target (dict): dict of ground-truth labels. + Keys: 'vertices', 'joints_3d', 'joints_3d_visible', + 'joints_2d', 'joints_2d_visible', 'pose', 'beta', + 'has_smpl' + + Returns: + dict: dict of losses. 
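+
+ Example (an illustrative sketch; every shape and value below is a
+ dummy assumption, with the optional 'pose'/'beta' keys omitted)::
+
+ >>> import torch
+ >>> criterion = MeshLoss(
+ ...     joints_2d_loss_weight=1.,
+ ...     joints_3d_loss_weight=1.,
+ ...     vertex_loss_weight=1.,
+ ...     smpl_pose_loss_weight=1.,
+ ...     smpl_beta_loss_weight=1.,
+ ...     img_res=224)
+ >>> B, V, K = 2, 6890, 24
+ >>> output = dict(
+ ...     vertices=torch.rand(B, V, 3),
+ ...     joints_3d=torch.rand(B, K, 3),
+ ...     camera=torch.tensor([[0.9, 0., 0.]] * B))
+ >>> target = dict(
+ ...     vertices=torch.rand(B, V, 3),
+ ...     has_smpl=torch.ones(B),
+ ...     joints_3d=torch.rand(B, K, 3),
+ ...     joints_3d_visible=torch.ones(B, K, 1),
+ ...     joints_2d=torch.rand(B, K, 2) * 224,
+ ...     joints_2d_visible=torch.ones(B, K, 1))
+ >>> losses = criterion(output, target)
+ >>> sorted(losses.keys())
+ ['joints_2d_loss', 'joints_3d_loss', 'vertex_loss']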
+ """ + losses = {} + + # Per-vertex loss for the shape + pred_vertices = output['vertices'] + + gt_vertices = target['vertices'] + has_smpl = target['has_smpl'] + loss_vertex = self.vertex_loss(pred_vertices, gt_vertices, has_smpl) + losses['vertex_loss'] = loss_vertex * self.vertex_loss_weight + + # Compute loss on SMPL parameters, if available + if 'pose' in output.keys() and 'beta' in output.keys(): + pred_rotmat = output['pose'] + pred_betas = output['beta'] + gt_pose = target['pose'] + gt_betas = target['beta'] + loss_regr_pose, loss_regr_betas = self.smpl_losses( + pred_rotmat, pred_betas, gt_pose, gt_betas, has_smpl) + losses['smpl_pose_loss'] = \ + loss_regr_pose * self.smpl_pose_loss_weight + losses['smpl_beta_loss'] = \ + loss_regr_betas * self.smpl_beta_loss_weight + + # Compute 3D joints loss + pred_joints_3d = output['joints_3d'] + gt_joints_3d = target['joints_3d'] + joints_3d_visible = target['joints_3d_visible'] + loss_joints_3d = self.joints_3d_loss(pred_joints_3d, gt_joints_3d, + joints_3d_visible) + losses['joints_3d_loss'] = loss_joints_3d * self.joints_3d_loss_weight + + # Compute 2D reprojection loss for the 2D joints + pred_camera = output['camera'] + gt_joints_2d = target['joints_2d'] + joints_2d_visible = target['joints_2d_visible'] + pred_joints_2d = self.project_points(pred_joints_3d, pred_camera) + + # Normalize keypoints to [-1,1] + # The coordinate origin of pred_joints_2d is + # the center of the input image. + pred_joints_2d = 2 * pred_joints_2d / (self.img_res - 1) + # The coordinate origin of gt_joints_2d is + # the top left corner of the input image. + gt_joints_2d = 2 * gt_joints_2d / (self.img_res - 1) - 1 + loss_joints_2d = self.joints_2d_loss(pred_joints_2d, gt_joints_2d, + joints_2d_visible) + losses['joints_2d_loss'] = loss_joints_2d * self.joints_2d_loss_weight + + return losses + + +@LOSSES.register_module() +class GANLoss(nn.Module): + """Define GAN loss. + + Args: + gan_type (str): Support 'vanilla', 'lsgan', 'wgan', 'hinge'. + real_label_val (float): The value for real label. Default: 1.0. + fake_label_val (float): The value for fake label. Default: 0.0. + loss_weight (float): Loss weight. Default: 1.0. + Note that loss_weight is only for generators; and it is always 1.0 + for discriminators. + """ + + def __init__(self, + gan_type, + real_label_val=1.0, + fake_label_val=0.0, + loss_weight=1.0): + super().__init__() + self.gan_type = gan_type + self.loss_weight = loss_weight + self.real_label_val = real_label_val + self.fake_label_val = fake_label_val + + if self.gan_type == 'vanilla': + self.loss = nn.BCEWithLogitsLoss() + elif self.gan_type == 'lsgan': + self.loss = nn.MSELoss() + elif self.gan_type == 'wgan': + self.loss = self._wgan_loss + elif self.gan_type == 'hinge': + self.loss = nn.ReLU() + else: + raise NotImplementedError( + f'GAN type {self.gan_type} is not implemented.') + + @staticmethod + def _wgan_loss(input, target): + """wgan loss. + + Args: + input (Tensor): Input tensor. + target (bool): Target label. + + Returns: + Tensor: wgan loss. + """ + return -input.mean() if target else input.mean() + + def get_target_label(self, input, target_is_real): + """Get target label. + + Args: + input (Tensor): Input tensor. + target_is_real (bool): Whether the target is real or fake. + + Returns: + (bool | Tensor): Target tensor. Return bool for wgan, \ + otherwise, return Tensor. 
+ """ + + if self.gan_type == 'wgan': + return target_is_real + target_val = ( + self.real_label_val if target_is_real else self.fake_label_val) + return input.new_ones(input.size()) * target_val + + def forward(self, input, target_is_real, is_disc=False): + """ + Args: + input (Tensor): The input for the loss module, i.e., the network + prediction. + target_is_real (bool): Whether the targe is real or fake. + is_disc (bool): Whether the loss for discriminators or not. + Default: False. + + Returns: + Tensor: GAN loss value. + """ + target_label = self.get_target_label(input, target_is_real) + if self.gan_type == 'hinge': + if is_disc: # for discriminators in hinge-gan + input = -input if target_is_real else input + loss = self.loss(1 + input).mean() + else: # for generators in hinge-gan + loss = -input.mean() + else: # other gan types + loss = self.loss(input, target_label) + + # loss_weight is always 1.0 for discriminators + return loss if is_disc else loss * self.loss_weight diff --git a/main/transformer_utils/mmpose/models/losses/mse_loss.py b/main/transformer_utils/mmpose/models/losses/mse_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..f972efadfdfe0093c9ae1b308c6f82a9ccd72f73 --- /dev/null +++ b/main/transformer_utils/mmpose/models/losses/mse_loss.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +@LOSSES.register_module() +class JointsMSELoss(nn.Module): + """MSE loss for heatmaps. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = nn.MSELoss() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight): + """Forward function.""" + batch_size = output.size(0) + num_joints = output.size(1) + + heatmaps_pred = output.reshape( + (batch_size, num_joints, -1)).split(1, 1) + heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) + + loss = 0. + + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx].squeeze(1) + heatmap_gt = heatmaps_gt[idx].squeeze(1) + if self.use_target_weight: + loss += self.criterion(heatmap_pred * target_weight[:, idx], + heatmap_gt * target_weight[:, idx]) + else: + loss += self.criterion(heatmap_pred, heatmap_gt) + + return loss / num_joints * self.loss_weight + + +@LOSSES.register_module() +class CombinedTargetMSELoss(nn.Module): + """MSE loss for combined target. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. 
+ """ + + def __init__(self, use_target_weight, loss_weight=1.): + super().__init__() + self.criterion = nn.MSELoss(reduction='mean') + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight): + batch_size = output.size(0) + num_channels = output.size(1) + heatmaps_pred = output.reshape( + (batch_size, num_channels, -1)).split(1, 1) + heatmaps_gt = target.reshape( + (batch_size, num_channels, -1)).split(1, 1) + loss = 0. + num_joints = num_channels // 3 + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx * 3].squeeze() + heatmap_gt = heatmaps_gt[idx * 3].squeeze() + offset_x_pred = heatmaps_pred[idx * 3 + 1].squeeze() + offset_x_gt = heatmaps_gt[idx * 3 + 1].squeeze() + offset_y_pred = heatmaps_pred[idx * 3 + 2].squeeze() + offset_y_gt = heatmaps_gt[idx * 3 + 2].squeeze() + if self.use_target_weight: + heatmap_pred = heatmap_pred * target_weight[:, idx] + heatmap_gt = heatmap_gt * target_weight[:, idx] + # classification loss + loss += 0.5 * self.criterion(heatmap_pred, heatmap_gt) + # regression loss + loss += 0.5 * self.criterion(heatmap_gt * offset_x_pred, + heatmap_gt * offset_x_gt) + loss += 0.5 * self.criterion(heatmap_gt * offset_y_pred, + heatmap_gt * offset_y_gt) + return loss / num_joints * self.loss_weight + + +@LOSSES.register_module() +class JointsOHKMMSELoss(nn.Module): + """MSE loss with online hard keypoint mining. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + topk (int): Only top k joint losses are kept. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, topk=8, loss_weight=1.): + super().__init__() + assert topk > 0 + self.criterion = nn.MSELoss(reduction='none') + self.use_target_weight = use_target_weight + self.topk = topk + self.loss_weight = loss_weight + + def _ohkm(self, loss): + """Online hard keypoint mining.""" + ohkm_loss = 0. 
+ N = len(loss) + for i in range(N): + sub_loss = loss[i] + _, topk_idx = torch.topk( + sub_loss, k=self.topk, dim=0, sorted=False) + tmp_loss = torch.gather(sub_loss, 0, topk_idx) + ohkm_loss += torch.sum(tmp_loss) / self.topk + ohkm_loss /= N + return ohkm_loss + + def forward(self, output, target, target_weight): + """Forward function.""" + batch_size = output.size(0) + num_joints = output.size(1) + if num_joints < self.topk: + raise ValueError(f'topk ({self.topk}) should not ' + f'larger than num_joints ({num_joints}).') + heatmaps_pred = output.reshape( + (batch_size, num_joints, -1)).split(1, 1) + heatmaps_gt = target.reshape((batch_size, num_joints, -1)).split(1, 1) + + losses = [] + for idx in range(num_joints): + heatmap_pred = heatmaps_pred[idx].squeeze(1) + heatmap_gt = heatmaps_gt[idx].squeeze(1) + if self.use_target_weight: + losses.append( + self.criterion(heatmap_pred * target_weight[:, idx], + heatmap_gt * target_weight[:, idx])) + else: + losses.append(self.criterion(heatmap_pred, heatmap_gt)) + + losses = [loss.mean(dim=1).unsqueeze(dim=1) for loss in losses] + losses = torch.cat(losses, dim=1) + + return self._ohkm(losses) * self.loss_weight diff --git a/main/transformer_utils/mmpose/models/losses/multi_loss_factory.py b/main/transformer_utils/mmpose/models/losses/multi_loss_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..65f90a761d0e5f94309023288f0d3ec848ec82dd --- /dev/null +++ b/main/transformer_utils/mmpose/models/losses/multi_loss_factory.py @@ -0,0 +1,281 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import torch +import torch.nn as nn + +from ..builder import LOSSES + + +def _make_input(t, requires_grad=False, device=torch.device('cpu')): + """Make zero inputs for AE loss. + + Args: + t (torch.Tensor): input + requires_grad (bool): Option to use requires_grad. + device: torch device + + Returns: + torch.Tensor: zero input. + """ + inp = torch.autograd.Variable(t, requires_grad=requires_grad) + inp = inp.sum() + inp = inp.to(device) + return inp + + +@LOSSES.register_module() +class HeatmapLoss(nn.Module): + """Accumulate the heatmap loss for each image in the batch. + + Args: + supervise_empty (bool): Whether to supervise empty channels. + """ + + def __init__(self, supervise_empty=True): + super().__init__() + self.supervise_empty = supervise_empty + + def forward(self, pred, gt, mask): + """Forward function. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + pred (torch.Tensor[N,K,H,W]):heatmap of output. + gt (torch.Tensor[N,K,H,W]): target heatmap. + mask (torch.Tensor[N,H,W]): mask of target. + """ + assert pred.size() == gt.size( + ), f'pred.size() is {pred.size()}, gt.size() is {gt.size()}' + + if not self.supervise_empty: + empty_mask = (gt.sum(dim=[2, 3], keepdim=True) > 0).float() + loss = ((pred - gt)**2) * empty_mask.expand_as( + pred) * mask[:, None, :, :].expand_as(pred) + else: + loss = ((pred - gt)**2) * mask[:, None, :, :].expand_as(pred) + loss = loss.mean(dim=3).mean(dim=2).mean(dim=1) + return loss + + +@LOSSES.register_module() +class AELoss(nn.Module): + """Associative Embedding loss. + + `Associative Embedding: End-to-End Learning for Joint Detection and + Grouping `_. 
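+
+ Example (illustrative sketch; the dummy ``joints`` below mark every
+ keypoint as invisible, so both returned loss terms are zero)::
+
+ >>> import torch
+ >>> ae_loss = AELoss(loss_type='exp')
+ >>> tags = torch.rand(2, 17 * 64 * 64, 1)
+ >>> joints = torch.zeros(2, 30, 17, 2, dtype=torch.long)
+ >>> push, pull = ae_loss(tags, joints)
+ >>> push.shape, pull.shape
+ (torch.Size([2]), torch.Size([2]))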
+ """ + + def __init__(self, loss_type): + super().__init__() + self.loss_type = loss_type + + def singleTagLoss(self, pred_tag, joints): + """Associative embedding loss for one image. + + Note: + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + pred_tag (torch.Tensor[KxHxW,1]): tag of output for one image. + joints (torch.Tensor[M,K,2]): joints information for one image. + """ + tags = [] + pull = 0 + for joints_per_person in joints: + tmp = [] + for joint in joints_per_person: + if joint[1] > 0: + tmp.append(pred_tag[joint[0]]) + if len(tmp) == 0: + continue + tmp = torch.stack(tmp) + tags.append(torch.mean(tmp, dim=0)) + pull = pull + torch.mean((tmp - tags[-1].expand_as(tmp))**2) + + num_tags = len(tags) + if num_tags == 0: + return ( + _make_input(torch.zeros(1).float(), device=pred_tag.device), + _make_input(torch.zeros(1).float(), device=pred_tag.device)) + elif num_tags == 1: + return (_make_input( + torch.zeros(1).float(), device=pred_tag.device), pull) + + tags = torch.stack(tags) + + size = (num_tags, num_tags) + A = tags.expand(*size) + B = A.permute(1, 0) + + diff = A - B + + if self.loss_type == 'exp': + diff = torch.pow(diff, 2) + push = torch.exp(-diff) + push = torch.sum(push) - num_tags + elif self.loss_type == 'max': + diff = 1 - torch.abs(diff) + push = torch.clamp(diff, min=0).sum() - num_tags + else: + raise ValueError('Unknown ae loss type') + + push_loss = push / ((num_tags - 1) * num_tags) * 0.5 + pull_loss = pull / (num_tags) + + return push_loss, pull_loss + + def forward(self, tags, joints): + """Accumulate the tag loss for each image in the batch. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + + Args: + tags (torch.Tensor[N,KxHxW,1]): tag channels of output. + joints (torch.Tensor[N,M,K,2]): joints information. + """ + pushes, pulls = [], [] + joints = joints.cpu().data.numpy() + batch_size = tags.size(0) + for i in range(batch_size): + push, pull = self.singleTagLoss(tags[i], joints[i]) + pushes.append(push) + pulls.append(pull) + return torch.stack(pushes), torch.stack(pulls) + + +@LOSSES.register_module() +class MultiLossFactory(nn.Module): + """Loss for bottom-up models. + + Args: + num_joints (int): Number of keypoints. + num_stages (int): Number of stages. + ae_loss_type (str): Type of ae loss. + with_ae_loss (list[bool]): Use ae loss or not in multi-heatmap. + push_loss_factor (list[float]): + Parameter of push loss in multi-heatmap. + pull_loss_factor (list[float]): + Parameter of pull loss in multi-heatmap. + with_heatmap_loss (list[bool]): + Use heatmap loss or not in multi-heatmap. + heatmaps_loss_factor (list[float]): + Parameter of heatmap loss in multi-heatmap. + supervise_empty (bool): Whether to supervise empty channels. 
+ """ + + def __init__(self, + num_joints, + num_stages, + ae_loss_type, + with_ae_loss, + push_loss_factor, + pull_loss_factor, + with_heatmaps_loss, + heatmaps_loss_factor, + supervise_empty=True): + super().__init__() + + assert isinstance(with_heatmaps_loss, (list, tuple)), \ + 'with_heatmaps_loss should be a list or tuple' + assert isinstance(heatmaps_loss_factor, (list, tuple)), \ + 'heatmaps_loss_factor should be a list or tuple' + assert isinstance(with_ae_loss, (list, tuple)), \ + 'with_ae_loss should be a list or tuple' + assert isinstance(push_loss_factor, (list, tuple)), \ + 'push_loss_factor should be a list or tuple' + assert isinstance(pull_loss_factor, (list, tuple)), \ + 'pull_loss_factor should be a list or tuple' + + self.num_joints = num_joints + self.num_stages = num_stages + self.ae_loss_type = ae_loss_type + self.with_ae_loss = with_ae_loss + self.push_loss_factor = push_loss_factor + self.pull_loss_factor = pull_loss_factor + self.with_heatmaps_loss = with_heatmaps_loss + self.heatmaps_loss_factor = heatmaps_loss_factor + + self.heatmaps_loss = \ + nn.ModuleList( + [ + HeatmapLoss(supervise_empty) + if with_heatmaps_loss else None + for with_heatmaps_loss in self.with_heatmaps_loss + ] + ) + + self.ae_loss = \ + nn.ModuleList( + [ + AELoss(self.ae_loss_type) if with_ae_loss else None + for with_ae_loss in self.with_ae_loss + ] + ) + + def forward(self, outputs, heatmaps, masks, joints): + """Forward function to calculate losses. + + Note: + - batch_size: N + - heatmaps weight: W + - heatmaps height: H + - max_num_people: M + - num_keypoints: K + - output_channel: C C=2K if use ae loss else K + + Args: + outputs (list(torch.Tensor[N,C,H,W])): outputs of stages. + heatmaps (list(torch.Tensor[N,K,H,W])): target of heatmaps. + masks (list(torch.Tensor[N,H,W])): masks of heatmaps. + joints (list(torch.Tensor[N,M,K,2])): joints of ae loss. + """ + heatmaps_losses = [] + push_losses = [] + pull_losses = [] + for idx in range(len(outputs)): + offset_feat = 0 + if self.heatmaps_loss[idx]: + heatmaps_pred = outputs[idx][:, :self.num_joints] + offset_feat = self.num_joints + heatmaps_loss = self.heatmaps_loss[idx](heatmaps_pred, + heatmaps[idx], + masks[idx]) + heatmaps_loss = heatmaps_loss * self.heatmaps_loss_factor[idx] + heatmaps_losses.append(heatmaps_loss) + else: + heatmaps_losses.append(None) + + if self.ae_loss[idx]: + tags_pred = outputs[idx][:, offset_feat:] + batch_size = tags_pred.size()[0] + tags_pred = tags_pred.contiguous().view(batch_size, -1, 1) + + push_loss, pull_loss = self.ae_loss[idx](tags_pred, + joints[idx]) + push_loss = push_loss * self.push_loss_factor[idx] + pull_loss = pull_loss * self.pull_loss_factor[idx] + + push_losses.append(push_loss) + pull_losses.append(pull_loss) + else: + push_losses.append(None) + pull_losses.append(None) + + return heatmaps_losses, push_losses, pull_losses diff --git a/main/transformer_utils/mmpose/models/losses/regression_loss.py b/main/transformer_utils/mmpose/models/losses/regression_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..fc7aa33847d8fdc8c6e096b7e3467759024af053 --- /dev/null +++ b/main/transformer_utils/mmpose/models/losses/regression_loss.py @@ -0,0 +1,530 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES +from ..utils.realnvp import RealNVP + + +@LOSSES.register_module() +class RLELoss(nn.Module): + """RLE Loss. 
+ + `Human Pose Regression With Residual Log-Likelihood Estimation + arXiv: `_. + + Code is modified from `the official implementation + `_. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + size_average (bool): Option to average the loss by the batch_size. + residual (bool): Option to add L1 loss and let the flow + learn the residual error distribution. + q_dis (string): Option for the identity Q(error) distribution, + Options: "laplace" or "gaussian" + """ + + def __init__(self, + use_target_weight=False, + size_average=True, + residual=True, + q_dis='laplace'): + super(RLELoss, self).__init__() + self.size_average = size_average + self.use_target_weight = use_target_weight + self.residual = residual + self.q_dis = q_dis + + self.flow_model = RealNVP() + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D*2]): Output regression, + including coords and sigmas. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K, D]): + Weights across different joint types. + """ + pred = output[:, :, :2] + sigma = output[:, :, 2:4].sigmoid() + + error = (pred - target) / (sigma + 1e-9) + # (B, K, 2) + log_phi = self.flow_model.log_prob(error.reshape(-1, 2)) + log_phi = log_phi.reshape(target.shape[0], target.shape[1], 1) + log_sigma = torch.log(sigma).reshape(target.shape[0], target.shape[1], + 2) + nf_loss = log_sigma - log_phi + + if self.residual: + assert self.q_dis in ['laplace', 'gaussian', 'strict'] + if self.q_dis == 'laplace': + loss_q = torch.log(sigma * 2) + torch.abs(error) + else: + loss_q = torch.log( + sigma * math.sqrt(2 * math.pi)) + 0.5 * error**2 + + loss = nf_loss + loss_q + else: + loss = nf_loss + + if self.use_target_weight: + assert target_weight is not None + loss *= target_weight + + if self.size_average: + loss /= len(loss) + + return loss.sum() + + +@LOSSES.register_module() +class SmoothL1Loss(nn.Module): + """SmoothL1Loss loss. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.smooth_l1_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K, D]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class WingLoss(nn.Module): + """Wing Loss. paper ref: 'Wing Loss for Robust Facial Landmark Localisation + with Convolutional Neural Networks' Feng et al. CVPR'2018. + + Args: + omega (float): Also referred to as width. + epsilon (float): Also referred to as curvature. + use_target_weight (bool): Option to use weighted MSE loss. 
+ Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + omega=10.0, + epsilon=2.0, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.omega = omega + self.epsilon = epsilon + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + # constant that smoothly links the piecewise-defined linear + # and nonlinear parts + self.C = self.omega * (1.0 - math.log(1.0 + self.omega / self.epsilon)) + + def criterion(self, pred, target): + """Criterion of wingloss. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + pred (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + """ + delta = (target - pred).abs() + losses = torch.where( + delta < self.omega, + self.omega * torch.log(1.0 + delta / self.epsilon), delta - self.C) + return torch.mean(torch.sum(losses, dim=[1, 2]), dim=0) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N,K,D]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class SoftWingLoss(nn.Module): + """Soft Wing Loss 'Structure-Coherent Deep Feature Learning for Robust Face + Alignment' Lin et al. TIP'2021. + + loss = + 1. |x| , if |x| < omega1 + 2. omega2*ln(1+|x|/epsilon) + B, if |x| >= omega1 + + Args: + omega1 (float): The first threshold. + omega2 (float): The second threshold. + epsilon (float): Also referred to as curvature. + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, + omega1=2.0, + omega2=20.0, + epsilon=0.5, + use_target_weight=False, + loss_weight=1.): + super().__init__() + self.omega1 = omega1 + self.omega2 = omega2 + self.epsilon = epsilon + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + # constant that smoothly links the piecewise-defined linear + # and nonlinear parts + self.B = self.omega1 - self.omega2 * math.log(1.0 + self.omega1 / + self.epsilon) + + def criterion(self, pred, target): + """Criterion of wingloss. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (D=2 or D=3) + + Args: + pred (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + """ + delta = (target - pred).abs() + losses = torch.where( + delta < self.omega1, delta, + self.omega2 * torch.log(1.0 + delta / self.epsilon) + self.B) + return torch.mean(torch.sum(losses, dim=[1, 2]), dim=0) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + batch_size: N + num_keypoints: K + dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K, D]): + Weights across different joint types. 
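+
+ Example (a minimal sketch; the 2D keypoint shapes are assumed)::
+
+ >>> import torch
+ >>> criterion = SoftWingLoss()
+ >>> output = torch.rand(2, 17, 2)
+ >>> target = torch.rand(2, 17, 2)
+ >>> loss = criterion(output, target)
+ >>> loss.dim()
+ 0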
+ """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class MPJPELoss(nn.Module): + """MPJPE (Mean Per Joint Position Error) loss. + + Args: + use_target_weight (bool): Option to use weighted MSE loss. + Different joint types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. + """ + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N,K,D]): + Weights across different joint types. + """ + + if self.use_target_weight: + assert target_weight is not None + loss = torch.mean( + torch.norm((output - target) * target_weight, dim=-1)) + else: + loss = torch.mean(torch.norm(output - target, dim=-1)) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class L1Loss(nn.Module): + """L1Loss loss .""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.l1_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output regression. + target (torch.Tensor[N, K, 2]): Target regression. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class MSELoss(nn.Module): + """MSE loss for coordinate regression.""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.criterion = F.mse_loss + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 2]): Output regression. + target (torch.Tensor[N, K, 2]): Target regression. + target_weight (torch.Tensor[N, K, 2]): + Weights across different joint types. + """ + if self.use_target_weight: + assert target_weight is not None + loss = self.criterion(output * target_weight, + target * target_weight) + else: + loss = self.criterion(output, target) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class BoneLoss(nn.Module): + """Bone length loss. + + Args: + joint_parents (list): Indices of each joint's parent joint. + use_target_weight (bool): Option to use weighted bone loss. + Different bone types may have different target weights. + loss_weight (float): Weight of the loss. Default: 1.0. 
+ """ + + def __init__(self, joint_parents, use_target_weight=False, loss_weight=1.): + super().__init__() + self.joint_parents = joint_parents + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + self.non_root_indices = [] + for i in range(len(self.joint_parents)): + if i != self.joint_parents[i]: + self.non_root_indices.append(i) + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + - dimension of keypoints: D (D=2 or D=3) + + Args: + output (torch.Tensor[N, K, D]): Output regression. + target (torch.Tensor[N, K, D]): Target regression. + target_weight (torch.Tensor[N, K-1]): + Weights across different bone types. + """ + output_bone = torch.norm( + output - output[:, self.joint_parents, :], + dim=-1)[:, self.non_root_indices] + target_bone = torch.norm( + target - target[:, self.joint_parents, :], + dim=-1)[:, self.non_root_indices] + if self.use_target_weight: + assert target_weight is not None + loss = torch.mean( + torch.abs((output_bone * target_weight).mean(dim=0) - + (target_bone * target_weight).mean(dim=0))) + else: + loss = torch.mean( + torch.abs(output_bone.mean(dim=0) - target_bone.mean(dim=0))) + + return loss * self.loss_weight + + +@LOSSES.register_module() +class SemiSupervisionLoss(nn.Module): + """Semi-supervision loss for unlabeled data. It is composed of projection + loss and bone loss. + + Paper ref: `3D human pose estimation in video with temporal convolutions + and semi-supervised training` Dario Pavllo et al. CVPR'2019. + + Args: + joint_parents (list): Indices of each joint's parent joint. + projection_loss_weight (float): Weight for projection loss. + bone_loss_weight (float): Weight for bone loss. + warmup_iterations (int): Number of warmup iterations. In the first + `warmup_iterations` iterations, the model is trained only on + labeled data, and semi-supervision loss will be 0. + This is a workaround since currently we cannot access + epoch number in loss functions. Note that the iteration number in + an epoch can be changed due to different GPU numbers in multi-GPU + settings. So please set this parameter carefully. + warmup_iterations = dataset_size // samples_per_gpu // gpu_num + * warmup_epochs + """ + + def __init__(self, + joint_parents, + projection_loss_weight=1., + bone_loss_weight=1., + warmup_iterations=0): + super().__init__() + self.criterion_projection = MPJPELoss( + loss_weight=projection_loss_weight) + self.criterion_bone = BoneLoss( + joint_parents, loss_weight=bone_loss_weight) + self.warmup_iterations = warmup_iterations + self.num_iterations = 0 + + @staticmethod + def project_joints(x, intrinsics): + """Project 3D joint coordinates to 2D image plane using camera + intrinsic parameters. + + Args: + x (torch.Tensor[N, K, 3]): 3D joint coordinates. + intrinsics (torch.Tensor[N, 4] | torch.Tensor[N, 9]): Camera + intrinsics: f (2), c (2), k (3), p (2). 
+ """ + while intrinsics.dim() < x.dim(): + intrinsics.unsqueeze_(1) + f = intrinsics[..., :2] + c = intrinsics[..., 2:4] + _x = torch.clamp(x[:, :, :2] / x[:, :, 2:], -1, 1) + if intrinsics.shape[-1] == 9: + k = intrinsics[..., 4:7] + p = intrinsics[..., 7:9] + + r2 = torch.sum(_x[:, :, :2]**2, dim=-1, keepdim=True) + radial = 1 + torch.sum( + k * torch.cat((r2, r2**2, r2**3), dim=-1), + dim=-1, + keepdim=True) + tan = torch.sum(p * _x, dim=-1, keepdim=True) + _x = _x * (radial + tan) + p * r2 + _x = f * _x + c + return _x + + def forward(self, output, target): + losses = dict() + + self.num_iterations += 1 + if self.num_iterations <= self.warmup_iterations: + return losses + + labeled_pose = output['labeled_pose'] + unlabeled_pose = output['unlabeled_pose'] + unlabeled_traj = output['unlabeled_traj'] + unlabeled_target_2d = target['unlabeled_target_2d'] + intrinsics = target['intrinsics'] + + # projection loss + unlabeled_output = unlabeled_pose + unlabeled_traj + unlabeled_output_2d = self.project_joints(unlabeled_output, intrinsics) + loss_proj = self.criterion_projection(unlabeled_output_2d, + unlabeled_target_2d, None) + losses['proj_loss'] = loss_proj + + # bone loss + loss_bone = self.criterion_bone(unlabeled_pose, labeled_pose, None) + losses['bone_loss'] = loss_bone + + return losses diff --git a/main/transformer_utils/mmpose/models/losses/rle_loss.py b/main/transformer_utils/mmpose/models/losses/rle_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..5973da8df59dd4804af746bd7fb83a23fbb78c35 --- /dev/null +++ b/main/transformer_utils/mmpose/models/losses/rle_loss.py @@ -0,0 +1,180 @@ +import math +import mmcv +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +@LOSSES.register_module() +class RLELoss_poseur_old(nn.Module): + ''' RLE Regression Loss + ''' + + def __init__(self, OUTPUT_3D=False, use_target_weight=True, size_average=True): + super(RLELoss_poseur_old, self).__init__() + self.size_average = size_average + self.amp = 1 / math.sqrt(2 * math.pi) + + def logQ(self, gt_uv, pred_jts, sigma): + return torch.log(sigma / self.amp) + torch.abs(gt_uv - pred_jts) / (math.sqrt(2) * sigma + 1e-9) + + def forward(self, output, target_uv, target_uv_weight): + + pred_jts = output.pred_jts + sigma = output.sigma + gt_uv = target_uv.reshape(pred_jts.shape) + gt_uv_weight = target_uv_weight.reshape(pred_jts.shape) + + + + nf_loss = output.nf_loss * gt_uv_weight[:, :, :1] + # print(gt_uv.min(), gt_uv.max()) + + residual = True + if residual: + Q_logprob = self.logQ(gt_uv, pred_jts, sigma) * gt_uv_weight + loss = nf_loss + Q_logprob + + if self.size_average and gt_uv_weight.sum() > 0: + return loss.sum() / len(loss) + else: + return loss.sum() + +@LOSSES.register_module() +class RLELoss_poseur(nn.Module): + ''' RLE Regression Loss + ''' + + def __init__(self, OUTPUT_3D=False, use_target_weight=True, size_average=True): + super(RLELoss_poseur, self).__init__() + self.size_average = size_average + self.amp = 1 / math.sqrt(2 * math.pi) + + def logQ(self, gt_uv, pred_jts, sigma): + return torch.log(sigma / self.amp) + torch.abs(gt_uv - pred_jts) / (math.sqrt(2) * sigma + 1e-9) + + def forward(self, output, target_uvd, target_uvd_weight): + + pred_jts = output.pred_jts + sigma = output.sigma + gt_uv = target_uvd.reshape(pred_jts.shape) + gt_uv_weight = target_uvd_weight.reshape(pred_jts.shape) + + # nf_loss = output.nf_loss * gt_uv_weight[:, :, :1] + nf_loss = output.nf_loss * gt_uv_weight + + residual = True + if 
residual: + Q_logprob = self.logQ(gt_uv, pred_jts, sigma) * gt_uv_weight + loss = nf_loss + Q_logprob + + if self.size_average and gt_uv_weight.sum() > 0: + return loss.sum() / len(loss) + else: + return loss.sum() + +@LOSSES.register_module() +class RLEOHKMLoss(nn.Module): + ''' RLE Regression Loss + ''' + + def __init__(self, OUTPUT_3D=False, use_target_weight=True, size_average=True, topk=8, + ori_weight = 1.0, ohkm_weight = 0.0): + super(RLEOHKMLoss, self).__init__() + self.size_average = size_average + self.amp = 1 / math.sqrt(2 * math.pi) + self.topk = topk + self.ori_weight = ori_weight + self.ohkm_weight = ohkm_weight + self.neg_inf = -float("Inf") + + def logQ(self, gt_uv, pred_jts, sigma): + return torch.log(sigma / self.amp) + torch.abs(gt_uv - pred_jts) / (math.sqrt(2) * sigma + 1e-9) + + def ohkm(self, loss, weight): + # mask = weight == 0 + loss_value = loss.clone().detach() + loss_value[weight == 0] = self.neg_inf + _, topk_idx = torch.topk( + loss_value, k=self.topk, dim=1, sorted=False) + tmp_loss = torch.gather(loss, 1, topk_idx) + tmp_weight = torch.gather(weight, 1, topk_idx) + # tmp_loss[tmp_loss==-float("Inf")] = 0 + tmp_loss = tmp_loss * tmp_weight + tmp_loss = tmp_loss.flatten(start_dim=1).sum(dim = 1) + # tmp_weight = tmp_weight.flatten(start_dim=1).sum(dim = 1) + # tmp_loss = tmp_loss / tmp_weight + + return tmp_loss.mean() + + def ori(self, loss, weight): + # mask = weight == 0 + loss = loss * weight + loss = loss.flatten(start_dim=1).sum(dim = 1) + # weight = weight.flatten(start_dim=1).sum(dim = 1) + + return loss.mean() + + def forward(self, output, target_uv, target_uv_weight): + + pred_jts = output.pred_jts + sigma = output.sigma + gt_uv = target_uv.reshape(pred_jts.shape) + gt_uv_weight = target_uv_weight.reshape(pred_jts.shape) + + # gt_uv_weight = gt_uv_weight[:, :, :1] + nf_loss = output.nf_loss + q_loss = self.logQ(gt_uv, pred_jts, sigma) + + # nf_loss_ohkm = self.ohkm(nf_loss, gt_uv_weight) + # q_loss_ohkm = self.ohkm(q_loss, gt_uv_weight) + + ori_loss = nf_loss + q_loss + ohkm_loss = self.ohkm(ori_loss, gt_uv_weight) + ori_loss = self.ori(ori_loss, gt_uv_weight) + + loss = self.ori_weight * ori_loss + self.ohkm_weight * ohkm_loss + return loss #TODO mean? 
+ + + # nf_loss = output.nf_loss * gt_uv_weight + + + # Q_logprob = self.logQ(gt_uv, pred_jts, sigma) * gt_uv_weight + # loss = nf_loss + Q_logprob + + # return loss.sum() / len(loss) + + +@LOSSES.register_module() +class RLELoss3D(nn.Module): + ''' RLE Regression Loss 3D + ''' + + def __init__(self, OUTPUT_3D=False, size_average=True): + super(RLELoss3D, self).__init__() + self.size_average = size_average + self.amp = 1 / math.sqrt(2 * math.pi) + + def logQ(self, gt_uv, pred_jts, sigma): + return torch.log(sigma / self.amp) + torch.abs(gt_uv - pred_jts) / (math.sqrt(2) * sigma + 1e-9) + + def forward(self, output, labels): + nf_loss = output.nf_loss + pred_jts = output.pred_jts + sigma = output.sigma + gt_uv = labels['target_uvd'].reshape(pred_jts.shape) + gt_uv_weight = labels['target_uvd_weight'].reshape(pred_jts.shape) + nf_loss = nf_loss * gt_uv_weight + + residual = True + if residual: + Q_logprob = self.logQ(gt_uv, pred_jts, sigma) * gt_uv_weight + loss = nf_loss + Q_logprob + + if self.size_average and gt_uv_weight.sum() > 0: + return loss.sum() / len(loss) + else: + return loss.sum() \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/misc/__init__.py b/main/transformer_utils/mmpose/models/misc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/main/transformer_utils/mmpose/models/misc/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/main/transformer_utils/mmpose/models/misc/discriminator.py b/main/transformer_utils/mmpose/models/misc/discriminator.py new file mode 100644 index 0000000000000000000000000000000000000000..712f0a8b566e3dcbc0cd13206610d3c750b942ab --- /dev/null +++ b/main/transformer_utils/mmpose/models/misc/discriminator.py @@ -0,0 +1,307 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/akanazawa/hmr +# Original licence: Copyright (c) 2018 akanazawa, under the MIT License. +# ------------------------------------------------------------------------------ + +from abc import abstractmethod + +import torch +import torch.nn as nn +from mmcv.cnn import normal_init, xavier_init + +from mmpose.models.utils.geometry import batch_rodrigues + + +class BaseDiscriminator(nn.Module): + """Base linear module for SMPL parameter discriminator. 
+ + Args: + fc_layers (Tuple): Tuple of neuron count, + such as (9, 32, 32, 1) + use_dropout (Tuple): Tuple of bool define use dropout or not + for each layer, such as (True, True, False) + drop_prob (Tuple): Tuple of float defined the drop prob, + such as (0.5, 0.5, 0) + use_activation(Tuple): Tuple of bool define use active function + or not, such as (True, True, False) + """ + + def __init__(self, fc_layers, use_dropout, drop_prob, use_activation): + super().__init__() + self.fc_layers = fc_layers + self.use_dropout = use_dropout + self.drop_prob = drop_prob + self.use_activation = use_activation + self._check() + self.create_layers() + + def _check(self): + """Check input to avoid ValueError.""" + if not isinstance(self.fc_layers, tuple): + raise TypeError(f'fc_layers require tuple, ' + f'get {type(self.fc_layers)}') + + if not isinstance(self.use_dropout, tuple): + raise TypeError(f'use_dropout require tuple, ' + f'get {type(self.use_dropout)}') + + if not isinstance(self.drop_prob, tuple): + raise TypeError(f'drop_prob require tuple, ' + f'get {type(self.drop_prob)}') + + if not isinstance(self.use_activation, tuple): + raise TypeError(f'use_activation require tuple, ' + f'get {type(self.use_activation)}') + + l_fc_layer = len(self.fc_layers) + l_use_drop = len(self.use_dropout) + l_drop_prob = len(self.drop_prob) + l_use_activation = len(self.use_activation) + + pass_check = ( + l_fc_layer >= 2 and l_use_drop < l_fc_layer + and l_drop_prob < l_fc_layer and l_use_activation < l_fc_layer + and l_drop_prob == l_use_drop) + + if not pass_check: + msg = 'Wrong BaseDiscriminator parameters!' + raise ValueError(msg) + + def create_layers(self): + """Create layers.""" + l_fc_layer = len(self.fc_layers) + l_use_drop = len(self.use_dropout) + l_use_activation = len(self.use_activation) + + self.fc_blocks = nn.Sequential() + + for i in range(l_fc_layer - 1): + self.fc_blocks.add_module( + name=f'regressor_fc_{i}', + module=nn.Linear( + in_features=self.fc_layers[i], + out_features=self.fc_layers[i + 1])) + + if i < l_use_activation and self.use_activation[i]: + self.fc_blocks.add_module( + name=f'regressor_af_{i}', module=nn.ReLU()) + + if i < l_use_drop and self.use_dropout[i]: + self.fc_blocks.add_module( + name=f'regressor_fc_dropout_{i}', + module=nn.Dropout(p=self.drop_prob[i])) + + @abstractmethod + def forward(self, inputs): + """Forward function.""" + msg = 'the base class [BaseDiscriminator] is not callable!' 
+        raise NotImplementedError(msg)
+
+    def init_weights(self):
+        """Initialize model weights."""
+        # Iterate over modules (not (name, module) tuples) so the isinstance
+        # check actually matches the linear layers.
+        for m in self.fc_blocks.modules():
+            if isinstance(m, nn.Linear):
+                xavier_init(m, gain=0.01)
+
+
+class ShapeDiscriminator(BaseDiscriminator):
+    """Discriminator for SMPL shape parameters. The input is (batch_size x 10).
+
+    Args:
+        fc_layers (Tuple): Tuple of neuron counts, such as (10, 5, 1)
+        use_dropout (Tuple): Tuple of bools, whether to use dropout for
+            each layer, such as (True, True, False)
+        drop_prob (Tuple): Tuple of floats, the drop probability of each
+            layer, such as (0.5, 0)
+        use_activation (Tuple): Tuple of bools, whether to use an activation
+            function for each layer, such as (True, False)
+    """
+
+    def __init__(self, fc_layers, use_dropout, drop_prob, use_activation):
+        if fc_layers[-1] != 1:
+            msg = f'the neuron count of the last layer ' \
+                  f'must be 1, but got {fc_layers[-1]}'
+            raise ValueError(msg)
+
+        super().__init__(fc_layers, use_dropout, drop_prob, use_activation)
+
+    def forward(self, inputs):
+        """Forward function."""
+        return self.fc_blocks(inputs)
+
+
+class PoseDiscriminator(nn.Module):
+    """Discriminator for SMPL pose parameters of each joint. It is composed of
+    one discriminator per joint. The input is (batch_size x joint_count x 9).
+
+    Args:
+        channels (Tuple): Tuple of channel numbers,
+            such as (9, 32, 32, 1)
+        joint_count (int): Joint number, such as 23
+    """
+
+    def __init__(self, channels, joint_count):
+        super().__init__()
+        if channels[-1] != 1:
+            msg = f'the neuron count of the last layer ' \
+                  f'must be 1, but got {channels[-1]}'
+            raise ValueError(msg)
+        self.joint_count = joint_count
+
+        self.conv_blocks = nn.Sequential()
+        len_channels = len(channels)
+        for idx in range(len_channels - 2):
+            self.conv_blocks.add_module(
+                name=f'conv_{idx}',
+                module=nn.Conv2d(
+                    in_channels=channels[idx],
+                    out_channels=channels[idx + 1],
+                    kernel_size=1,
+                    stride=1))
+
+        self.fc_layer = nn.ModuleList()
+        for idx in range(joint_count):
+            self.fc_layer.append(
+                nn.Linear(
+                    in_features=channels[len_channels - 2], out_features=1))
+
+    def forward(self, inputs):
+        """Forward function.
+
+        The input is (batch_size x joint_count x 9).
+        """
+        # shape: batch_size x 9 x 1 x joint_count
+        inputs = inputs.transpose(1, 2).unsqueeze(2).contiguous()
+        # shape: batch_size x c x 1 x joint_count
+        internal_outputs = self.conv_blocks(inputs)
+        outputs = []
+        for idx in range(self.joint_count):
+            outputs.append(self.fc_layer[idx](internal_outputs[:, :, 0, idx]))
+
+        return torch.cat(outputs, 1), internal_outputs
+
+    def init_weights(self):
+        """Initialize model weights."""
+        for m in self.conv_blocks:
+            if isinstance(m, nn.Conv2d):
+                normal_init(m, std=0.001, bias=0)
+        # Iterate over modules (not (name, module) tuples) so the isinstance
+        # check actually matches the linear layers.
+        for m in self.fc_layer.modules():
+            if isinstance(m, nn.Linear):
+                xavier_init(m, gain=0.01)
+
+
+class FullPoseDiscriminator(BaseDiscriminator):
+    """Discriminator for SMPL pose parameters of all joints.
+ + Args: + fc_layers (Tuple): Tuple of neuron count, + such as (736, 1024, 1024, 1) + use_dropout (Tuple): Tuple of bool define use dropout or not + for each layer, such as (True, True, False) + drop_prob (Tuple): Tuple of float defined the drop prob, + such as (0.5, 0.5, 0) + use_activation(Tuple): Tuple of bool define use active + function or not, such as (True, True, False) + """ + + def __init__(self, fc_layers, use_dropout, drop_prob, use_activation): + if fc_layers[-1] != 1: + msg = f'the neuron count of the last layer must be 1,' \ + f' but got {fc_layers[-1]}' + raise ValueError(msg) + + super().__init__(fc_layers, use_dropout, drop_prob, use_activation) + + def forward(self, inputs): + """Forward function.""" + return self.fc_blocks(inputs) + + +class SMPLDiscriminator(nn.Module): + """Discriminator for SMPL pose and shape parameters. It is composed of a + discriminator for SMPL shape parameters, a discriminator for SMPL pose + parameters of all joints and a discriminator for SMPL pose parameters of + each joint. + + Args: + beta_channel (tuple of int): Tuple of neuron count of the + discriminator of shape parameters. Defaults to (10, 5, 1) + per_joint_channel (tuple of int): Tuple of neuron count of the + discriminator of each joint. Defaults to (9, 32, 32, 1) + full_pose_channel (tuple of int): Tuple of neuron count of the + discriminator of full pose. Defaults to (23*32, 1024, 1024, 1) + """ + + def __init__(self, + beta_channel=(10, 5, 1), + per_joint_channel=(9, 32, 32, 1), + full_pose_channel=(23 * 32, 1024, 1024, 1)): + super().__init__() + self.joint_count = 23 + # The count of SMPL shape parameter is 10. + assert beta_channel[0] == 10 + # Use 3 x 3 rotation matrix as the pose parameters + # of each joint, so the input channel is 9. + assert per_joint_channel[0] == 9 + assert self.joint_count * per_joint_channel[-2] \ + == full_pose_channel[0] + + self.beta_channel = beta_channel + self.per_joint_channel = per_joint_channel + self.full_pose_channel = full_pose_channel + self._create_sub_modules() + + def _create_sub_modules(self): + """Create sub discriminators.""" + + # create theta discriminator for each joint + self.pose_discriminator = PoseDiscriminator(self.per_joint_channel, + self.joint_count) + + # create full pose discriminator for total joints + fc_layers = self.full_pose_channel + use_dropout = tuple([False] * (len(fc_layers) - 1)) + drop_prob = tuple([0.5] * (len(fc_layers) - 1)) + use_activation = tuple([True] * (len(fc_layers) - 2) + [False]) + + self.full_pose_discriminator = FullPoseDiscriminator( + fc_layers, use_dropout, drop_prob, use_activation) + + # create shape discriminator for betas + fc_layers = self.beta_channel + use_dropout = tuple([False] * (len(fc_layers) - 1)) + drop_prob = tuple([0.5] * (len(fc_layers) - 1)) + use_activation = tuple([True] * (len(fc_layers) - 2) + [False]) + self.shape_discriminator = ShapeDiscriminator(fc_layers, use_dropout, + drop_prob, + use_activation) + + def forward(self, thetas): + """Forward function.""" + _, poses, shapes = thetas + + batch_size = poses.shape[0] + shape_disc_value = self.shape_discriminator(shapes) + + # The first rotation matrix is global rotation + # and is NOT used in discriminator. 
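+        # `poses` arrives either as axis-angle vectors of shape (B, 24 * 3),
+        # which the branch below converts with `batch_rodrigues` into
+        # (B, 24, 9) rotation matrices, or as already-flattened rotation
+        # matrices of shape (B, 24 * 9). Either way the root rotation at
+        # index 0 is sliced off, and the remaining 23 joint rotations feed
+        # the per-joint and full-pose discriminators.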
+ if poses.dim() == 2: + rotate_matrixs = \ + batch_rodrigues(poses.contiguous().view(-1, 3) + ).view(batch_size, 24, 9)[:, 1:, :] + else: + rotate_matrixs = poses.contiguous().view(batch_size, 24, + 9)[:, 1:, :].contiguous() + pose_disc_value, pose_inter_disc_value \ + = self.pose_discriminator(rotate_matrixs) + full_pose_disc_value = self.full_pose_discriminator( + pose_inter_disc_value.contiguous().view(batch_size, -1)) + return torch.cat( + (pose_disc_value, full_pose_disc_value, shape_disc_value), 1) + + def init_weights(self): + """Initialize model weights.""" + self.full_pose_discriminator.init_weights() + self.pose_discriminator.init_weights() + self.shape_discriminator.init_weights() diff --git a/main/transformer_utils/mmpose/models/necks/__init__.py b/main/transformer_utils/mmpose/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0593f61c01fa9968260b939f7ccd50311c058595 --- /dev/null +++ b/main/transformer_utils/mmpose/models/necks/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .fpn import FPN +from .gap_neck import GlobalAveragePooling +from .posewarper_neck import PoseWarperNeck +from .tcformer_mta_neck import MTA +from .channel_mapper import ChannelMapper + +__all__ = ['GlobalAveragePooling', 'PoseWarperNeck', 'FPN', 'MTA'] diff --git a/main/transformer_utils/mmpose/models/necks/channel_mapper.py b/main/transformer_utils/mmpose/models/necks/channel_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..113d170e9d55b9e2d3984c6838a86e4c659fa75c --- /dev/null +++ b/main/transformer_utils/mmpose/models/necks/channel_mapper.py @@ -0,0 +1,76 @@ +import torch.nn as nn +from mmcv.cnn import ConvModule, xavier_init + +from ..builder import NECKS + + +@NECKS.register_module() +class ChannelMapper(nn.Module): + r"""Channel Mapper to reduce/increase channels of backbone features. + + This is used to reduce/increase channels of backbone features. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + kernel_size (int, optional): kernel_size for reducing channels (used + at each scale). Default: 3. + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + act_cfg (dict, optional): Config dict for activation layer in + ConvModule. Default: dict(type='ReLU'). + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = ChannelMapper(in_channels, 11, 3).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU')): + super(ChannelMapper, self).__init__() + assert isinstance(in_channels, list) + + self.convs = nn.ModuleList() + for in_channel in in_channels: + self.convs.append( + ConvModule( + in_channel, + out_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + """Initialize the weights of ChannelMapper module.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + def forward(self, inputs): + """Forward function.""" + + + assert len(inputs) == len(self.convs) + outs = [self.convs[i](inputs[i]) for i in range(len(inputs))] + return tuple(outs) diff --git a/main/transformer_utils/mmpose/models/necks/fpn.py b/main/transformer_utils/mmpose/models/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..795a8af0b6904153a9b4e1a41d7b803381874162 --- /dev/null +++ b/main/transformer_utils/mmpose/models/necks/fpn.py @@ -0,0 +1,207 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, xavier_init +from mmcv.runner import auto_fp16 + +from ..builder import NECKS + + +@NECKS.register_module() +class FPN(nn.Module): + r"""Feature Pyramid Network. + + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (list[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: dict(mode='nearest'). + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... 
for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest')): + super().__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + self.add_extra_convs = 'on_input' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + def init_weights(self): + """Initialize model weights.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + @auto_fp16() + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 
2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return outs diff --git a/main/transformer_utils/mmpose/models/necks/gap_neck.py b/main/transformer_utils/mmpose/models/necks/gap_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..5e6ad68ec11110daaad3a66e09d67efb355c4b93 --- /dev/null +++ b/main/transformer_utils/mmpose/models/necks/gap_neck.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from ..builder import NECKS + + +@NECKS.register_module() +class GlobalAveragePooling(nn.Module): + """Global Average Pooling neck. + + Note that we use `view` to remove extra channel after pooling. We do not + use `squeeze` as it will also remove the batch dimension when the tensor + has a batch dimension of size 1, which can lead to unexpected errors. + """ + + def __init__(self): + super().__init__() + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + + def init_weights(self): + pass + + def forward(self, inputs): + if isinstance(inputs, tuple): + outs = tuple([self.gap(x) for x in inputs]) + outs = tuple( + [out.view(x.size(0), -1) for out, x in zip(outs, inputs)]) + elif isinstance(inputs, list): + outs = [self.gap(x) for x in inputs] + outs = [out.view(x.size(0), -1) for out, x in zip(outs, inputs)] + elif isinstance(inputs, torch.Tensor): + outs = self.gap(inputs) + outs = outs.view(inputs.size(0), -1) + else: + raise TypeError('neck inputs should be tuple or torch.tensor') + return outs diff --git a/main/transformer_utils/mmpose/models/necks/posewarper_neck.py b/main/transformer_utils/mmpose/models/necks/posewarper_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..dd4ddfbf8984857a6110f19b0a7d703b53f1c433 --- /dev/null +++ b/main/transformer_utils/mmpose/models/necks/posewarper_neck.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import mmcv +import torch +import torch.nn as nn +from mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + normal_init) +from mmcv.utils import digit_version +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.models.utils.ops import resize +from ..backbones.resnet import BasicBlock, Bottleneck +from ..builder import NECKS + +try: + from mmcv.ops import DeformConv2d + has_mmcv_full = True +except (ImportError, ModuleNotFoundError): + has_mmcv_full = False + + +@NECKS.register_module() +class PoseWarperNeck(nn.Module): + """PoseWarper neck. + + `"Learning temporal pose estimation from sparsely-labeled videos" + `_. + + Args: + in_channels (int): Number of input channels from backbone + out_channels (int): Number of output channels + inner_channels (int): Number of intermediate channels of the res block + deform_groups (int): Number of groups in the deformable conv + dilations (list|tuple): different dilations of the offset conv layers + trans_conv_kernel (int): the kernel of the trans conv layer, which is + used to get heatmap from the output of backbone. Default: 1 + res_blocks_cfg (dict|None): config of residual blocks. If None, + use the default values. If not None, it should contain the + following keys: + + - block (str): the type of residual block, Default: 'BASIC'. + - num_blocks (int): the number of blocks, Default: 20. + + offsets_kernel (int): the kernel of offset conv layer. + deform_conv_kernel (int): the kernel of defomrable conv layer. + in_index (int|Sequence[int]): Input feature index. Default: 0 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + Default: None. + + - 'resize_concat': Multiple feature maps will be resize to \ + the same size as first one and than concat together. \ + Usually used in FCN head of HRNet. + - 'multiple_select': Multiple feature maps will be bundle into \ + a list and passed into decode head. + - None: Only one select feature map is allowed. + + freeze_trans_layer (bool): Whether to freeze the transition layer + (stop grad and set eval mode). Default: True. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + im2col_step (int): the argument `im2col_step` in deformable conv, + Default: 80. 
+ """ + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + minimum_mmcv_version = '1.3.17' + + def __init__(self, + in_channels, + out_channels, + inner_channels, + deform_groups=17, + dilations=(3, 6, 12, 18, 24), + trans_conv_kernel=1, + res_blocks_cfg=None, + offsets_kernel=3, + deform_conv_kernel=3, + in_index=0, + input_transform=None, + freeze_trans_layer=True, + norm_eval=False, + im2col_step=80): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.inner_channels = inner_channels + self.deform_groups = deform_groups + self.dilations = dilations + self.trans_conv_kernel = trans_conv_kernel + self.res_blocks_cfg = res_blocks_cfg + self.offsets_kernel = offsets_kernel + self.deform_conv_kernel = deform_conv_kernel + self.in_index = in_index + self.input_transform = input_transform + self.freeze_trans_layer = freeze_trans_layer + self.norm_eval = norm_eval + self.im2col_step = im2col_step + + identity_trans_layer = False + + assert trans_conv_kernel in [0, 1, 3] + kernel_size = trans_conv_kernel + if kernel_size == 3: + padding = 1 + elif kernel_size == 1: + padding = 0 + else: + # 0 for Identity mapping. + identity_trans_layer = True + + if identity_trans_layer: + self.trans_layer = nn.Identity() + else: + self.trans_layer = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding) + + # build chain of residual blocks + if res_blocks_cfg is not None and not isinstance(res_blocks_cfg, dict): + raise TypeError('res_blocks_cfg should be dict or None.') + + if res_blocks_cfg is None: + block_type = 'BASIC' + num_blocks = 20 + else: + block_type = res_blocks_cfg.get('block', 'BASIC') + num_blocks = res_blocks_cfg.get('num_blocks', 20) + + block = self.blocks_dict[block_type] + + res_layers = [] + downsample = nn.Sequential( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels, + out_channels=inner_channels, + kernel_size=1, + stride=1, + bias=False), + build_norm_layer(dict(type='BN'), inner_channels)[1]) + res_layers.append( + block( + in_channels=out_channels, + out_channels=inner_channels, + downsample=downsample)) + + for _ in range(1, num_blocks): + res_layers.append(block(inner_channels, inner_channels)) + self.offset_feats = nn.Sequential(*res_layers) + + # build offset layers + self.num_offset_layers = len(dilations) + assert self.num_offset_layers > 0, 'Number of offset layers ' \ + 'should be larger than 0.' + + target_offset_channels = 2 * offsets_kernel**2 * deform_groups + + offset_layers = [ + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=inner_channels, + out_channels=target_offset_channels, + kernel_size=offsets_kernel, + stride=1, + dilation=dilations[i], + padding=dilations[i], + bias=False, + ) for i in range(self.num_offset_layers) + ] + self.offset_layers = nn.ModuleList(offset_layers) + + # build deformable conv layers + assert digit_version(mmcv.__version__) >= \ + digit_version(self.minimum_mmcv_version), \ + f'Current MMCV version: {mmcv.__version__}, ' \ + f'but MMCV >= {self.minimum_mmcv_version} is required, see ' \ + f'https://github.com/open-mmlab/mmcv/issues/1440, ' \ + f'Please install the latest MMCV.' 
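+        # One DeformConv2d is created per dilation rate below; each consumes
+        # the offset field predicted by the matching entry in
+        # `self.offset_layers` and warps the input heatmaps accordingly.
+        # `init_weights` later fills these kernels with an identity-like
+        # filter so warping starts from a near-identity mapping.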
+ + if has_mmcv_full: + deform_conv_layers = [ + DeformConv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=deform_conv_kernel, + stride=1, + padding=int(deform_conv_kernel / 2) * dilations[i], + dilation=dilations[i], + deform_groups=deform_groups, + im2col_step=self.im2col_step, + ) for i in range(self.num_offset_layers) + ] + else: + raise ImportError('Please install the full version of mmcv ' + 'to use `DeformConv2d`.') + + self.deform_conv_layers = nn.ModuleList(deform_conv_layers) + + self.freeze_layers() + + def freeze_layers(self): + if self.freeze_trans_layer: + self.trans_layer.eval() + + for param in self.trans_layer.parameters(): + param.requires_grad = False + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + elif isinstance(m, DeformConv2d): + filler = torch.zeros([ + m.weight.size(0), + m.weight.size(1), + m.weight.size(2), + m.weight.size(3) + ], + dtype=torch.float32, + device=m.weight.device) + for k in range(m.weight.size(0)): + filler[k, k, + int(m.weight.size(2) / 2), + int(m.weight.size(3) / 2)] = 1.0 + m.weight = torch.nn.Parameter(filler) + m.weight.requires_grad = True + + # posewarper offset layer weight initialization + for m in self.offset_layers.modules(): + constant_init(m, 0) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor] | Tensor): multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + if not isinstance(inputs, list): + return inputs + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + def forward(self, inputs, frame_weight): + assert isinstance(inputs, (list, tuple)), 'PoseWarperNeck inputs ' \ + 'should be list or tuple, even though the length is 1, ' \ + 'for unified processing.' 
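+        # Two paths follow: (1) `inputs` is a list with one feature tensor
+        # per frame, so each supporting frame is warped towards the first
+        # (reference) frame and blended with `frame_weight`; (2) `inputs`
+        # holds a single tensor stacking all frames along the batch axis, so
+        # the reference slice is tiled, difference features are computed in
+        # one pass, and the per-frame slices are again combined with
+        # `frame_weight`.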
+ + output_heatmap = 0 + if len(inputs) > 1: + inputs = [self._transform_inputs(input) for input in inputs] + inputs = [self.trans_layer(input) for input in inputs] + + # calculate difference features + diff_features = [ + self.offset_feats(inputs[0] - input) for input in inputs + ] + + for i in range(len(inputs)): + if frame_weight[i] == 0: + continue + warped_heatmap = 0 + for j in range(self.num_offset_layers): + offset = (self.offset_layers[j](diff_features[i])) + warped_heatmap_tmp = self.deform_conv_layers[j](inputs[i], + offset) + warped_heatmap += warped_heatmap_tmp / \ + self.num_offset_layers + + output_heatmap += warped_heatmap * frame_weight[i] + + else: + inputs = inputs[0] + inputs = self._transform_inputs(inputs) + inputs = self.trans_layer(inputs) + + num_frames = len(frame_weight) + batch_size = inputs.size(0) // num_frames + ref_x = inputs[:batch_size] + ref_x_tiled = ref_x.repeat(num_frames, 1, 1, 1) + + offset_features = self.offset_feats(ref_x_tiled - inputs) + + warped_heatmap = 0 + for j in range(self.num_offset_layers): + offset = self.offset_layers[j](offset_features) + + warped_heatmap_tmp = self.deform_conv_layers[j](inputs, offset) + warped_heatmap += warped_heatmap_tmp / self.num_offset_layers + + for i in range(num_frames): + if frame_weight[i] == 0: + continue + output_heatmap += warped_heatmap[i * batch_size:(i + 1) * + batch_size] * frame_weight[i] + + return output_heatmap + + def train(self, mode=True): + """Convert the model into training mode.""" + super().train(mode) + self.freeze_layers() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py b/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..6723fb018e7799c1c0104868b1ca87c56cd28351 --- /dev/null +++ b/main/transformer_utils/mmpose/models/necks/tcformer_mta_neck.py @@ -0,0 +1,224 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, constant_init, normal_init, trunc_normal_init +from mmcv.runner import BaseModule + +from ..builder import NECKS +from ..utils import TCFormerDynamicBlock, token2map, token_interp + + +@NECKS.register_module() +class MTA(BaseModule): + """Multi-stage Token feature Aggregation (MTA) module in TCFormer. + + Args: + in_channels (list[int]): Number of input channels per stage. + Default: [64, 128, 256, 512]. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. Default: 4. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_output': The last output feature map after fpn convs. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. 
+ Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer in ConvModule. + num_heads (Sequence[int]): The attention heads of each transformer + block. Default: [2, 2, 2, 2]. + mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the + embedding dim of each transformer block. + sr_ratios (Sequence[int]): The spatial reduction rate of each + transformer block. Default: [8, 4, 2, 1]. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0. + drop_path_rate (float): stochastic depth rate. Default 0. + transformer_norm_cfg (dict): Config dict for normalization layer + in transformer blocks. Default: dict(type='LN'). + use_sr_conv (bool): If True, use a conv layer for spatial reduction. + If False, use a pooling process for spatial reduction. Defaults: + False. + """ + + def __init__( + self, + in_channels=[64, 128, 256, 512], + out_channels=128, + num_outs=4, + start_level=0, + end_level=-1, + add_extra_convs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + num_heads=[2, 2, 2, 2], + mlp_ratios=[4, 4, 4, 4], + sr_ratios=[8, 4, 2, 1], + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + transformer_norm_cfg=dict(type='LN'), + use_sr_conv=False, + ): + super().__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.act_cfg = act_cfg + self.mlp_ratios = mlp_ratios + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + + self.lateral_convs = nn.ModuleList() + self.merge_blocks = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + self.lateral_convs.append(l_conv) + + for i in range(self.start_level, self.backbone_end_level - 1): + merge_block = TCFormerDynamicBlock( + dim=out_channels, + num_heads=num_heads[i], + mlp_ratio=mlp_ratios[i], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=drop_path_rate, + norm_cfg=transformer_norm_cfg, + sr_ratio=sr_ratios[i], + use_sr_conv=use_sr_conv) + self.merge_blocks.append(merge_block) + + # add extra conv layers (e.g., RetinaNet) + self.relu_before_extra_convs = relu_before_extra_convs + + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_output' 
+ assert add_extra_convs in ('on_input', 'on_output') + elif add_extra_convs: # True + self.add_extra_convs = 'on_input' + + self.extra_convs = nn.ModuleList() + extra_levels = num_outs - (self.end_level + 1 - self.start_level) + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.end_level] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.extra_convs.append(extra_fpn_conv) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + normal_init(m, 0, math.sqrt(2.0 / fan_out)) + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build lateral tokens + input_dicts = [] + for i, lateral_conv in enumerate(self.lateral_convs): + tmp = inputs[i + self.start_level].copy() + tmp['x'] = lateral_conv(tmp['x'].unsqueeze(2).permute( + 0, 3, 1, 2)).permute(0, 2, 3, 1).squeeze(2) + input_dicts.append(tmp) + + # merge from high level to low level + for i in range(len(input_dicts) - 2, -1, -1): + input_dicts[i]['x'] = input_dicts[i]['x'] + token_interp( + input_dicts[i], input_dicts[i + 1]) + input_dicts[i] = self.merge_blocks[i](input_dicts[i]) + + # transform to feature map + outs = [token2map(token_dict) for token_dict in input_dicts] + + # part 2: add extra levels + used_backbone_levels = len(outs) + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps + else: + if self.add_extra_convs == 'on_input': + tmp = inputs[self.backbone_end_level - 1] + extra_source = token2map(tmp) + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + + outs.append(self.extra_convs[0](extra_source)) + for i in range(1, self.num_outs - used_backbone_levels): + if self.relu_before_extra_convs: + outs.append(self.extra_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.extra_convs[i](outs[-1])) + return outs diff --git a/main/transformer_utils/mmpose/models/registry.py b/main/transformer_utils/mmpose/models/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..f354ae9e137262e2f375a64aef74c3af20baae63 --- /dev/null +++ b/main/transformer_utils/mmpose/models/registry.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from .builder import BACKBONES, HEADS, LOSSES, NECKS, POSENETS + +__all__ = ['BACKBONES', 'HEADS', 'LOSSES', 'NECKS', 'POSENETS'] + +warnings.simplefilter('once', DeprecationWarning) +warnings.warn( + 'Registries (BACKBONES, NECKS, HEADS, LOSSES, POSENETS) have ' + 'been moved to mmpose.models.builder. 
Importing from ' + 'mmpose.models.registry will be deprecated in the future.', + DeprecationWarning) diff --git a/main/transformer_utils/mmpose/models/utils/__init__.py b/main/transformer_utils/mmpose/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d7e8f6482ce3e2c06229a578f22536bd75e5260 --- /dev/null +++ b/main/transformer_utils/mmpose/models/utils/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ckpt_convert import pvt_convert, tcformer_convert +from .geometry import batch_rodrigues, quat_to_rotmat, rot6d_to_rotmat +from .misc import torch_meshgrid_ij +from .ops import resize +from .realnvp import RealNVP +from .smpl import SMPL +from .tcformer_utils import (TCFormerDynamicBlock, TCFormerRegularBlock, + TokenConv, cluster_dpc_knn, merge_tokens, + token2map, token_interp) +from .transformer import (PatchEmbed, PatchMerging, nchw_to_nlc, nlc_to_nchw, + PoseurTransformer_v3, DetrTransformerEncoder_zero_layer, + DeformableDetrTransformerDecoder, DetrTransformerDecoderLayer_grouped) + +from .positional_encoding import (LearnedPositionalEncoding, + SinePositionalEncoding) + +__all__ = [ + 'SMPL', 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'pvt_convert', + 'PatchMerging', 'batch_rodrigues', 'quat_to_rotmat', 'rot6d_to_rotmat', + 'resize', 'RealNVP', 'torch_meshgrid_ij', 'token2map', 'TokenConv', + 'TCFormerRegularBlock', 'TCFormerDynamicBlock', 'cluster_dpc_knn', + 'merge_tokens', 'token_interp', 'tcformer_convert' +] diff --git a/main/transformer_utils/mmpose/models/utils/ckpt_convert.py b/main/transformer_utils/mmpose/models/utils/ckpt_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..f5213937db3641bf7300156a2be3f2225326f02b --- /dev/null +++ b/main/transformer_utils/mmpose/models/utils/ckpt_convert.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# This script consists of several convert functions which +# can modify the weights of model in original repo to be +# pre-trained weights. + +from collections import OrderedDict + +import torch + + +def pvt_convert(ckpt): + new_ckpt = OrderedDict() + # Process the concat between q linear weights and kv linear weights + use_abs_pos_embed = False + use_conv_ffn = False + for k in ckpt.keys(): + if k.startswith('pos_embed'): + use_abs_pos_embed = True + if k.find('dwconv') >= 0: + use_conv_ffn = True + for k, v in ckpt.items(): + if k.startswith('head'): + continue + if k.startswith('norm.'): + continue + if k.startswith('cls_token'): + continue + if k.startswith('pos_embed'): + stage_i = int(k.replace('pos_embed', '')) + new_k = k.replace(f'pos_embed{stage_i}', + f'layers.{stage_i - 1}.1.0.pos_embed') + if stage_i == 4 and v.size(1) == 50: # 1 (cls token) + 7 * 7 + new_v = v[:, 1:, :] # remove cls token + else: + new_v = v + elif k.startswith('patch_embed'): + stage_i = int(k.split('.')[0].replace('patch_embed', '')) + new_k = k.replace(f'patch_embed{stage_i}', + f'layers.{stage_i - 1}.0') + new_v = v + if 'proj.' in new_k: + new_k = new_k.replace('proj.', 'projection.') + elif k.startswith('block'): + stage_i = int(k.split('.')[0].replace('block', '')) + layer_i = int(k.split('.')[1]) + new_layer_i = layer_i + use_abs_pos_embed + new_k = k.replace(f'block{stage_i}.{layer_i}', + f'layers.{stage_i - 1}.1.{new_layer_i}') + new_v = v + if 'attn.q.' in new_k: + sub_item_k = k.replace('q.', 'kv.') + new_k = new_k.replace('q.', 'attn.in_proj_') + new_v = torch.cat([v, ckpt[sub_item_k]], dim=0) + elif 'attn.kv.' 
in new_k: + continue + elif 'attn.proj.' in new_k: + new_k = new_k.replace('proj.', 'attn.out_proj.') + elif 'attn.sr.' in new_k: + new_k = new_k.replace('sr.', 'sr.') + elif 'mlp.' in new_k: + string = f'{new_k}-' + new_k = new_k.replace('mlp.', 'ffn.layers.') + if 'fc1.weight' in new_k or 'fc2.weight' in new_k: + new_v = v.reshape((*v.shape, 1, 1)) + new_k = new_k.replace('fc1.', '0.') + new_k = new_k.replace('dwconv.dwconv.', '1.') + if use_conv_ffn: + new_k = new_k.replace('fc2.', '4.') + else: + new_k = new_k.replace('fc2.', '3.') + string += f'{new_k} {v.shape}-{new_v.shape}' + elif k.startswith('norm'): + stage_i = int(k[4]) + new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i - 1}.2') + new_v = v + else: + new_k = k + new_v = v + new_ckpt[new_k] = new_v + + return new_ckpt + + +def tcformer_convert(ckpt): + new_ckpt = OrderedDict() + # Process the concat between q linear weights and kv linear weights + for k, v in ckpt.items(): + if 'patch_embed' in k: + new_k = k.replace('.proj.', '.projection.') + else: + new_k = k + new_ckpt[new_k] = v + return new_ckpt diff --git a/main/transformer_utils/mmpose/models/utils/geometry.py b/main/transformer_utils/mmpose/models/utils/geometry.py new file mode 100644 index 0000000000000000000000000000000000000000..0ceadaec30cd2c9bb3fbada132e1ea674f2e8754 --- /dev/null +++ b/main/transformer_utils/mmpose/models/utils/geometry.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn import functional as F + + +def rot6d_to_rotmat(x): + """Convert 6D rotation representation to 3x3 rotation matrix. + + Based on Zhou et al., "On the Continuity of Rotation + Representations in Neural Networks", CVPR 2019 + Input: + (B,6) Batch of 6-D rotation representations + Output: + (B,3,3) Batch of corresponding rotation matrices + """ + x = x.view(-1, 3, 2) + a1 = x[:, :, 0] + a2 = x[:, :, 1] + b1 = F.normalize(a1) + b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1) + b3 = torch.cross(b1, b2) + return torch.stack((b1, b2, b3), dim=-1) + + +def batch_rodrigues(theta): + """Convert axis-angle representation to rotation matrix. + Args: + theta: size = [B, 3] + Returns: + Rotation matrix corresponding to the quaternion + -- size = [B, 3, 3] + """ + l2norm = torch.norm(theta + 1e-8, p=2, dim=1) + angle = torch.unsqueeze(l2norm, -1) + normalized = torch.div(theta, angle) + angle = angle * 0.5 + v_cos = torch.cos(angle) + v_sin = torch.sin(angle) + quat = torch.cat([v_cos, v_sin * normalized], dim=1) + return quat_to_rotmat(quat) + + +def quat_to_rotmat(quat): + """Convert quaternion coefficients to rotation matrix. 
+ Args: + quat: size = [B, 4] 4 <===>(w, x, y, z) + Returns: + Rotation matrix corresponding to the quaternion + -- size = [B, 3, 3] + """ + norm_quat = quat + norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True) + w, x, y, z = norm_quat[:, 0], norm_quat[:, 1],\ + norm_quat[:, 2], norm_quat[:, 3] + + B = quat.size(0) + + w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2) + wx, wy, wz = w * x, w * y, w * z + xy, xz, yz = x * y, x * z, y * z + + rotMat = torch.stack([ + w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz, 2 * wz + 2 * xy, + w2 - x2 + y2 - z2, 2 * yz - 2 * wx, 2 * xz - 2 * wy, 2 * wx + 2 * yz, + w2 - x2 - y2 + z2 + ], + dim=1).view(B, 3, 3) + return rotMat diff --git a/main/transformer_utils/mmpose/models/utils/misc.py b/main/transformer_utils/mmpose/models/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..8c784588ef0c0ef58badf5c68d0a9602e14d6079 --- /dev/null +++ b/main/transformer_utils/mmpose/models/utils/misc.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from packaging import version + +_torch_version_meshgrid_indexing = version.parse( + torch.__version__) >= version.parse('1.10.0a0') + + +def torch_meshgrid_ij(*tensors): + if _torch_version_meshgrid_indexing: + return torch.meshgrid(*tensors, indexing='ij') + else: + return torch.meshgrid(*tensors) # Uses indexing='ij' by default diff --git a/main/transformer_utils/mmpose/models/utils/ops.py b/main/transformer_utils/mmpose/models/utils/ops.py new file mode 100644 index 0000000000000000000000000000000000000000..858d0a92148a591d235e58bfce8990207632fb39 --- /dev/null +++ b/main/transformer_utils/mmpose/models/utils/ops.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn.functional as F + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + if isinstance(size, torch.Size): + size = tuple(int(x) for x in size) + return F.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/main/transformer_utils/mmpose/models/utils/positional_encoding.py b/main/transformer_utils/mmpose/models/utils/positional_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..3c7e6bab9f5b3a1a71895f068bcbee47a891de68 --- /dev/null +++ b/main/transformer_utils/mmpose/models/utils/positional_encoding.py @@ -0,0 +1,155 @@ +import math + +import torch +import torch.nn as nn +from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING +from mmcv.runner import BaseModule + + +@POSITIONAL_ENCODING.register_module(force=True) +class SinePositionalEncoding(BaseModule): + """Position encoding with sine and cosine functions. + See `End-to-End Object Detection with Transformers + `_ for details. + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. 
+ temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + num_feats, + temperature=10000, + normalize=False, + scale=2 * math.pi, + eps=1e-6, + offset=0., + init_cfg=None): + super(SinePositionalEncoding, self).__init__(init_cfg) + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask): + """Forward function for `SinePositionalEncoding`. + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. + mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = (y_embed + self.offset) / \ + (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, H, W = mask.size() + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str + + +@POSITIONAL_ENCODING.register_module(force=True) +class LearnedPositionalEncoding(BaseModule): + """Position embedding with learnable embedding weights. + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Default 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Default 50. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ + + def __init__(self, + num_feats, + row_num_embed=50, + col_num_embed=50, + init_cfg=dict(type='Uniform', layer='Embedding')): + super(LearnedPositionalEncoding, self).__init__(init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask): + """Forward function for `LearnedPositionalEncoding`. + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = torch.cat( + (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat( + 1, w, 1)), + dim=-1).permute(2, 0, + 1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'row_num_embed={self.row_num_embed}, ' + repr_str += f'col_num_embed={self.col_num_embed})' + return repr_str \ No newline at end of file diff --git a/main/transformer_utils/mmpose/models/utils/realnvp.py b/main/transformer_utils/mmpose/models/utils/realnvp.py new file mode 100644 index 0000000000000000000000000000000000000000..911953e8f9d1056d44a2d3538d750e89b9bd6a7a --- /dev/null +++ b/main/transformer_utils/mmpose/models/utils/realnvp.py @@ -0,0 +1,76 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from torch import distributions + + +class RealNVP(nn.Module): + """RealNVP: a flow-based generative model + + `Density estimation using Real NVP + arXiv: `_. + + Code is modified from `the official implementation of RLE + `_. + + See also `real-nvp-pytorch + `_. 
+    """
+
+    @staticmethod
+    def get_scale_net():
+        """Get the scale model in a single invertible mapping."""
+        return nn.Sequential(
+            nn.Linear(2, 64), nn.LeakyReLU(), nn.Linear(64, 64),
+            nn.LeakyReLU(), nn.Linear(64, 2), nn.Tanh())
+
+    @staticmethod
+    def get_trans_net():
+        """Get the translation model in a single invertible mapping."""
+        return nn.Sequential(
+            nn.Linear(2, 64), nn.LeakyReLU(), nn.Linear(64, 64),
+            nn.LeakyReLU(), nn.Linear(64, 2))
+
+    @property
+    def prior(self):
+        """The prior distribution."""
+        return distributions.MultivariateNormal(self.loc, self.cov)
+
+    def __init__(self):
+        super(RealNVP, self).__init__()
+
+        self.register_buffer('loc', torch.zeros(2))
+        self.register_buffer('cov', torch.eye(2))
+        self.register_buffer(
+            'mask', torch.tensor([[0, 1], [1, 0]] * 3, dtype=torch.float32))
+
+        self.s = torch.nn.ModuleList(
+            [self.get_scale_net() for _ in range(len(self.mask))])
+        self.t = torch.nn.ModuleList(
+            [self.get_trans_net() for _ in range(len(self.mask))])
+        self.init_weights()
+
+    def init_weights(self):
+        """Initialize model weights."""
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight, gain=0.01)
+
+    def backward_p(self, x):
+        """Apply the mapping from the data space to the latent space and
+        calculate the log determinant of the Jacobian matrix."""
+
+        log_det_jacob, z = x.new_zeros(x.shape[0]), x
+        for i in reversed(range(len(self.t))):
+            z_ = self.mask[i] * z
+            s = self.s[i](z_) * (1 - self.mask[i])  # torch.exp(s): betas
+            t = self.t[i](z_) * (1 - self.mask[i])  # gammas
+            z = (1 - self.mask[i]) * (z - t) * torch.exp(-s) + z_
+            log_det_jacob -= s.sum(dim=1)
+        return z, log_det_jacob
+
+    def log_prob(self, x):
+        """Calculate the log probability of a given sample in data space."""
+
+        z, log_det = self.backward_p(x)
+        return self.prior.log_prob(z) + log_det
diff --git a/main/transformer_utils/mmpose/models/utils/smpl.py b/main/transformer_utils/mmpose/models/utils/smpl.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe723d483aadb7ce7e0e9f50ef8da7b10e7529e5
--- /dev/null
+++ b/main/transformer_utils/mmpose/models/utils/smpl.py
@@ -0,0 +1,184 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ..builder import MESH_MODELS
+
+try:
+    from smplx import SMPL as SMPL_
+    has_smpl = True
+except (ImportError, ModuleNotFoundError):
+    has_smpl = False
+
+
+@MESH_MODELS.register_module()
+class SMPL(nn.Module):
+    """SMPL 3D human mesh model of paper ref: Matthew Loper. ``SMPL: A skinned
+    multi-person linear model''. This module is based on the smplx project
+    (https://github.com/vchoutas/smplx).
+
+    Args:
+        smpl_path (str): The path to the folder where the model weights are
+            stored.
+        joints_regressor (str): The path to the file where the joints
+            regressor weights are stored.
+    """
+
+    def __init__(self, smpl_path, joints_regressor):
+        super().__init__()
+
+        assert has_smpl, 'Please install smplx to use SMPL.'
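+        # The three gender-specific SMPL models below share the same model
+        # folder. The neutral model keeps the default create_betas=True,
+        # while the male/female models disable it; shape and pose parameters
+        # are always passed in explicitly at forward time.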
+ + self.smpl_neutral = SMPL_( + model_path=smpl_path, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='neutral') + + self.smpl_male = SMPL_( + model_path=smpl_path, + create_betas=False, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='male') + + self.smpl_female = SMPL_( + model_path=smpl_path, + create_betas=False, + create_global_orient=False, + create_body_pose=False, + create_transl=False, + gender='female') + + joints_regressor = torch.tensor( + np.load(joints_regressor), dtype=torch.float)[None, ...] + self.register_buffer('joints_regressor', joints_regressor) + + self.num_verts = self.smpl_neutral.get_num_verts() + self.num_joints = self.joints_regressor.shape[1] + + def smpl_forward(self, model, **kwargs): + """Apply a specific SMPL model with given model parameters. + + Note: + B: batch size + V: number of vertices + K: number of joints + + Returns: + outputs (dict): Dict with mesh vertices and joints. + - vertices: Tensor([B, V, 3]), mesh vertices + - joints: Tensor([B, K, 3]), 3d joints regressed + from mesh vertices. + """ + + betas = kwargs['betas'] + batch_size = betas.shape[0] + device = betas.device + output = {} + if batch_size == 0: + output['vertices'] = betas.new_zeros([0, self.num_verts, 3]) + output['joints'] = betas.new_zeros([0, self.num_joints, 3]) + else: + smpl_out = model(**kwargs) + output['vertices'] = smpl_out.vertices + output['joints'] = torch.matmul( + self.joints_regressor.to(device), output['vertices']) + return output + + def get_faces(self): + """Return mesh faces. + + Note: + F: number of faces + + Returns: + faces: np.ndarray([F, 3]), mesh faces + """ + return self.smpl_neutral.faces + + def forward(self, + betas, + body_pose, + global_orient, + transl=None, + gender=None): + """Forward function. + + Note: + B: batch size + J: number of controllable joints of model, for smpl model J=23 + K: number of joints + + Args: + betas: Tensor([B, 10]), human body shape parameters of SMPL model. + body_pose: Tensor([B, J*3] or [B, J, 3, 3]), human body pose + parameters of SMPL model. It should be axis-angle vector + ([B, J*3]) or rotation matrix ([B, J, 3, 3)]. + global_orient: Tensor([B, 3] or [B, 1, 3, 3]), global orientation + of human body. It should be axis-angle vector ([B, 3]) or + rotation matrix ([B, 1, 3, 3)]. + transl: Tensor([B, 3]), global translation of human body. + gender: Tensor([B]), gender parameters of human body. -1 for + neutral, 0 for male , 1 for female. + + Returns: + outputs (dict): Dict with mesh vertices and joints. + - vertices: Tensor([B, V, 3]), mesh vertices + - joints: Tensor([B, K, 3]), 3d joints regressed from + mesh vertices. 
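+
+        Example (illustrative; the two paths below are placeholders for the
+        real SMPL model folder and joints-regressor file):
+            >>> smpl = SMPL('path/to/smpl', 'path/to/joints_regressor.npy')
+            >>> betas = torch.zeros(2, 10)
+            >>> body_pose = torch.zeros(2, 69)
+            >>> global_orient = torch.zeros(2, 3)
+            >>> out = smpl(betas, body_pose, global_orient)
+            >>> out['vertices'].shape
+            torch.Size([2, 6890, 3])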
+        """
+
+        batch_size = betas.shape[0]
+        pose2rot = True if body_pose.dim() == 2 else False
+        if batch_size > 0 and gender is not None:
+            output = {
+                'vertices': betas.new_zeros([batch_size, self.num_verts, 3]),
+                'joints': betas.new_zeros([batch_size, self.num_joints, 3])
+            }
+
+            mask = gender < 0
+            _out = self.smpl_forward(
+                self.smpl_neutral,
+                betas=betas[mask],
+                body_pose=body_pose[mask],
+                global_orient=global_orient[mask],
+                transl=transl[mask] if transl is not None else None,
+                pose2rot=pose2rot)
+            output['vertices'][mask] = _out['vertices']
+            output['joints'][mask] = _out['joints']
+
+            mask = gender == 0
+            _out = self.smpl_forward(
+                self.smpl_male,
+                betas=betas[mask],
+                body_pose=body_pose[mask],
+                global_orient=global_orient[mask],
+                transl=transl[mask] if transl is not None else None,
+                pose2rot=pose2rot)
+            output['vertices'][mask] = _out['vertices']
+            output['joints'][mask] = _out['joints']
+
+            mask = gender == 1
+            _out = self.smpl_forward(
+                self.smpl_female,
+                betas=betas[mask],
+                body_pose=body_pose[mask],
+                global_orient=global_orient[mask],
+                transl=transl[mask] if transl is not None else None,
+                pose2rot=pose2rot)
+            output['vertices'][mask] = _out['vertices']
+            output['joints'][mask] = _out['joints']
+        else:
+            return self.smpl_forward(
+                self.smpl_neutral,
+                betas=betas,
+                body_pose=body_pose,
+                global_orient=global_orient,
+                transl=transl,
+                pose2rot=pose2rot)
+
+        return output
diff --git a/main/transformer_utils/mmpose/models/utils/tcformer_utils.py b/main/transformer_utils/mmpose/models/utils/tcformer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d3a28534c83d60e52ed0382f54a4d9f4902e018
--- /dev/null
+++ b/main/transformer_utils/mmpose/models/utils/tcformer_utils.py
@@ -0,0 +1,995 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_norm_layer, trunc_normal_init
+from mmcv.cnn.bricks.transformer import build_dropout
+
+try:
+    from torch.cuda.amp import autocast
+    WITH_AUTOCAST = True
+except ImportError:
+    WITH_AUTOCAST = False
+
+
+def get_grid_index(init_grid_size, map_size, device):
+    """For every initial grid, get its index in the feature map.
+    Note:
+        [H_init, W_init]: shape of initial grid
+        [H, W]: shape of feature map
+        N_init: numbers of initial token
+
+    Args:
+        init_grid_size (list[int] or tuple[int]): initial grid resolution in
+            format [H_init, W_init].
+        map_size (list[int] or tuple[int]): feature map resolution in format
+            [H, W].
+        device: the device of output
+
+    Returns:
+        idx (torch.LongTensor[B, N_init]): index in flattened feature map.
+    """
+    H_init, W_init = init_grid_size
+    H, W = map_size
+    idx = torch.arange(H * W, device=device).reshape(1, 1, H, W)
+    idx = F.interpolate(idx.float(), [H_init, W_init], mode='nearest').long()
+    return idx.flatten()
+
+
+def index_points(points, idx):
+    """Sample features following the index.
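+    This is a batched gather: for each batch element b, the output is
+    points[b, idx[b], :].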
+ Note: + B: batch size + N: point number + C: channel number of each point + Ns: sampled point number + + Args: + points (torch.Tensor[B, N, C]): input points data + idx (torch.LongTensor[B, S]): sample index + + Returns: + new_points (torch.Tensor[B, Ns, C]):, indexed points data + """ + device = points.device + B = points.shape[0] + view_shape = list(idx.shape) + view_shape[1:] = [1] * (len(view_shape) - 1) + repeat_shape = list(idx.shape) + repeat_shape[0] = 1 + batch_indices = torch.arange( + B, dtype=torch.long).to(device).view(view_shape).repeat(repeat_shape) + new_points = points[batch_indices, idx, :] + return new_points + + +def token2map(token_dict): + """Transform vision tokens to feature map. This function only works when + the resolution of the feature map is not higher than the initial grid + structure. + + Note: + B: batch size + C: channel number of each token + [H, W]: shape of feature map + N_init: numbers of initial token + + Args: + token_dict (dict): dict for token information. + + Returns: + x_out (Tensor[B, C, H, W]): feature map. + """ + + x = token_dict['x'] + H, W = token_dict['map_size'] + H_init, W_init = token_dict['init_grid_size'] + idx_token = token_dict['idx_token'] + B, N, C = x.shape + N_init = H_init * W_init + device = x.device + + if N_init == N and N == H * W: + # for the initial tokens with grid structure, just reshape + return x.reshape(B, H, W, C).permute(0, 3, 1, 2).contiguous() + + # for each initial grid, get the corresponding index in + # the flattened feature map. + idx_hw = get_grid_index([H_init, W_init], [H, W], + device=device)[None, :].expand(B, -1) + idx_batch = torch.arange(B, device=device)[:, None].expand(B, N_init) + value = x.new_ones(B * N_init) + + # choose the way with fewer flops. + if N_init < N * H * W: + # use sparse matrix multiplication + # Flops: B * N_init * (C+2) + idx_hw = idx_hw + idx_batch * H * W + idx_tokens = idx_token + idx_batch * N + coor = torch.stack([idx_hw, idx_tokens], dim=0).reshape(2, B * N_init) + + # torch.sparse do not support gradient for + # sparse tensor, so we detach it + value = value.detach().to(torch.float32) + + # build a sparse matrix with the shape [B * H * W, B * N] + A = torch.sparse.FloatTensor(coor, value, + torch.Size([B * H * W, B * N])) + + # normalize the weight for each row + if WITH_AUTOCAST: + with autocast(enabled=False): + all_weight = A @ x.new_ones(B * N, 1).type( + torch.float32) + 1e-6 + else: + all_weight = A @ x.new_ones(B * N, 1).type(torch.float32) + 1e-6 + value = value / all_weight[idx_hw.reshape(-1), 0] + + # update the matrix with normalize weight + A = torch.sparse.FloatTensor(coor, value, + torch.Size([B * H * W, B * N])) + + # sparse matrix multiplication + if WITH_AUTOCAST: + with autocast(enabled=False): + x_out = A @ x.reshape(B * N, C).to(torch.float32) # [B*H*W, C] + else: + x_out = A @ x.reshape(B * N, C).to(torch.float32) # [B*H*W, C] + + else: + # use dense matrix multiplication + # Flops: B * N * H * W * (C+2) + coor = torch.stack([idx_batch, idx_hw, idx_token], + dim=0).reshape(3, B * N_init) + + # build a matrix with shape [B, H*W, N] + A = torch.sparse.FloatTensor(coor, value, torch.Size([B, H * W, + N])).to_dense() + # normalize the weight + A = A / (A.sum(dim=-1, keepdim=True) + 1e-6) + + x_out = A @ x # [B, H*W, C] + + x_out = x_out.type(x.dtype) + x_out = x_out.reshape(B, H, W, C).permute(0, 3, 1, 2).contiguous() + return x_out + + +def map2token(feature_map, token_dict): + """Transform feature map to vision tokens. 
This function only works when + the resolution of the feature map is not higher than the initial grid + structure. + + Note: + B: batch size + C: channel number + [H, W]: shape of feature map + N_init: numbers of initial token + + Args: + feature_map (Tensor[B, C, H, W]): feature map. + token_dict (dict): dict for token information. + + Returns: + out (Tensor[B, N, C]): token features. + """ + idx_token = token_dict['idx_token'] + N = token_dict['token_num'] + H_init, W_init = token_dict['init_grid_size'] + N_init = H_init * W_init + + B, C, H, W = feature_map.shape + device = feature_map.device + + if N_init == N and N == H * W: + # for the initial tokens with grid structure, just reshape + return feature_map.flatten(2).permute(0, 2, 1).contiguous() + + idx_hw = get_grid_index([H_init, W_init], [H, W], + device=device)[None, :].expand(B, -1) + + idx_batch = torch.arange(B, device=device)[:, None].expand(B, N_init) + value = feature_map.new_ones(B * N_init) + + # choose the way with fewer flops. + if N_init < N * H * W: + # use sparse matrix multiplication + # Flops: B * N_init * (C+2) + idx_token = idx_token + idx_batch * N + idx_hw = idx_hw + idx_batch * H * W + indices = torch.stack([idx_token, idx_hw], dim=0).reshape(2, -1) + + # sparse mm do not support gradient for sparse matrix + value = value.detach().to(torch.float32) + # build a sparse matrix with shape [B*N, B*H*W] + A = torch.sparse_coo_tensor(indices, value, (B * N, B * H * W)) + # normalize the matrix + if WITH_AUTOCAST: + with autocast(enabled=False): + all_weight = A @ torch.ones( + [B * H * W, 1], device=device, dtype=torch.float32) + 1e-6 + else: + all_weight = A @ torch.ones( + [B * H * W, 1], device=device, dtype=torch.float32) + 1e-6 + value = value / all_weight[idx_token.reshape(-1), 0] + + A = torch.sparse_coo_tensor(indices, value, (B * N, B * H * W)) + # out: [B*N, C] + if WITH_AUTOCAST: + with autocast(enabled=False): + out = A @ feature_map.permute(0, 2, 3, 1).contiguous().reshape( + B * H * W, C).float() + else: + out = A @ feature_map.permute(0, 2, 3, 1).contiguous().reshape( + B * H * W, C).float() + else: + # use dense matrix multiplication + # Flops: B * N * H * W * (C+2) + indices = torch.stack([idx_batch, idx_token, idx_hw], + dim=0).reshape(3, -1) + value = value.detach() # To reduce the training time, we detach here. + A = torch.sparse_coo_tensor(indices, value, (B, N, H * W)).to_dense() + # normalize the matrix + A = A / (A.sum(dim=-1, keepdim=True) + 1e-6) + + out = A @ feature_map.permute(0, 2, 3, 1).reshape(B, H * W, + C).contiguous() + + out = out.type(feature_map.dtype) + out = out.reshape(B, N, C) + return out + + +def token_interp(target_dict, source_dict): + """Transform token features between different distribution. + + Note: + B: batch size + N: token number + C: channel number + + Args: + target_dict (dict): dict for target token information + source_dict (dict): dict for source token information. + + Returns: + x_out (Tensor[B, N, C]): token features. + """ + + x_s = source_dict['x'] + idx_token_s = source_dict['idx_token'] + idx_token_t = target_dict['idx_token'] + T = target_dict['token_num'] + B, S, C = x_s.shape + N_init = idx_token_s.shape[1] + + weight = target_dict['agg_weight'] if 'agg_weight' in target_dict.keys( + ) else None + if weight is None: + weight = x_s.new_ones(B, N_init, 1) + weight = weight.reshape(-1) + + # choose the way with fewer flops. 
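+    # Sparse path: build a [B*T, B*S] matrix that maps source tokens to
+    # target tokens; dense path: build a [B, T, S] matrix instead. In both
+    # cases the rows are normalized so that every target token is a weighted
+    # average of its source tokens.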
+ if N_init < T * S: + # use sparse matrix multiplication + # Flops: B * N_init * (C+2) + idx_token_t = idx_token_t + torch.arange( + B, device=x_s.device)[:, None] * T + idx_token_s = idx_token_s + torch.arange( + B, device=x_s.device)[:, None] * S + coor = torch.stack([idx_token_t, idx_token_s], + dim=0).reshape(2, B * N_init) + + # torch.sparse does not support grad for sparse matrix + weight = weight.float().detach().to(torch.float32) + # build a matrix with shape [B*T, B*S] + A = torch.sparse.FloatTensor(coor, weight, torch.Size([B * T, B * S])) + # normalize the matrix + if WITH_AUTOCAST: + with autocast(enabled=False): + all_weight = A.type(torch.float32) @ x_s.new_ones( + B * S, 1).type(torch.float32) + 1e-6 + else: + all_weight = A.type(torch.float32) @ x_s.new_ones(B * S, 1).type( + torch.float32) + 1e-6 + weight = weight / all_weight[idx_token_t.reshape(-1), 0] + A = torch.sparse.FloatTensor(coor, weight, torch.Size([B * T, B * S])) + # sparse matmul + if WITH_AUTOCAST: + with autocast(enabled=False): + x_out = A.type(torch.float32) @ x_s.reshape(B * S, C).type( + torch.float32) + else: + x_out = A.type(torch.float32) @ x_s.reshape(B * S, C).type( + torch.float32) + else: + # use dense matrix multiplication + # Flops: B * T * S * (C+2) + idx_batch = torch.arange( + B, device=x_s.device)[:, None].expand(B, N_init) + coor = torch.stack([idx_batch, idx_token_t, idx_token_s], + dim=0).reshape(3, B * N_init) + weight = weight.detach() # detach to reduce training time + # build a matrix with shape [B, T, S] + A = torch.sparse.FloatTensor(coor, weight, torch.Size([B, T, + S])).to_dense() + # normalize the matrix + A = A / (A.sum(dim=-1, keepdim=True) + 1e-6) + # dense matmul + x_out = A @ x_s + + x_out = x_out.reshape(B, T, C).type(x_s.dtype) + return x_out + + +def cluster_dpc_knn(token_dict, cluster_num, k=5, token_mask=None): + """Cluster tokens with DPC-KNN algorithm. + + Note: + B: batch size + N: token number + C: channel number + + Args: + token_dict (dict): dict for token information + cluster_num (int): cluster number + k (int): number of the nearest neighbor used for local density. + token_mask (Tensor[B, N]): mask indicating which token is the + padded empty token. Non-zero value means the token is meaningful, + zero value means the token is an empty token. If set to None, all + tokens are regarded as meaningful. + + Return: + idx_cluster (Tensor[B, N]): cluster index of each token. + cluster_num (int): actual cluster number. In this function, it equals + to the input cluster number. + """ + + with torch.no_grad(): + x = token_dict['x'] + B, N, C = x.shape + + dist_matrix = torch.cdist(x, x) / (C**0.5) + + if token_mask is not None: + token_mask = token_mask > 0 + # in order to not affect the local density, the + # distance between empty tokens and any other + # tokens should be the maximal distance. + dist_matrix = \ + dist_matrix * token_mask[:, None, :] +\ + (dist_matrix.max() + 1) * (~token_mask[:, None, :]) + + # get local density + dist_nearest, index_nearest = torch.topk( + dist_matrix, k=k, dim=-1, largest=False) + + density = (-(dist_nearest**2).mean(dim=-1)).exp() + # add a little noise to ensure no tokens have the same density. 
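+        # (the noise is scaled by 1e-6, so it only breaks ties and does not
+        # meaningfully change the density ranking)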
+        density = density + torch.rand(
+            density.shape, device=density.device, dtype=density.dtype) * 1e-6
+
+        if token_mask is not None:
+            # the density of empty token should be 0
+            density = density * token_mask
+
+        # get distance indicator
+        mask = density[:, None, :] > density[:, :, None]
+        mask = mask.type(x.dtype)
+        dist_max = dist_matrix.flatten(1).max(dim=-1)[0][:, None, None]
+        dist, index_parent = (dist_matrix * mask + dist_max *
+                              (1 - mask)).min(dim=-1)
+
+        # select clustering center according to score
+        score = dist * density
+        _, index_down = torch.topk(score, k=cluster_num, dim=-1)
+
+        # assign tokens to the nearest center
+        dist_matrix = index_points(dist_matrix, index_down)
+
+        idx_cluster = dist_matrix.argmin(dim=1)
+
+        # make sure each cluster center merges to itself
+        idx_batch = torch.arange(
+            B, device=x.device)[:, None].expand(B, cluster_num)
+        idx_tmp = torch.arange(
+            cluster_num, device=x.device)[None, :].expand(B, cluster_num)
+        idx_cluster[idx_batch.reshape(-1),
+                    index_down.reshape(-1)] = idx_tmp.reshape(-1)
+
+    return idx_cluster, cluster_num
+
+
+def merge_tokens(token_dict, idx_cluster, cluster_num, token_weight=None):
+    """Merge tokens in the same cluster into a single token. Implemented by
+    torch.index_add(). Flops: B*N*(C+2)
+
+    Note:
+        B: batch size
+        N: token number
+        C: channel number
+
+    Args:
+        token_dict (dict): dict for input token information
+        idx_cluster (Tensor[B, N]): cluster index of each token.
+        cluster_num (int): cluster number
+        token_weight (Tensor[B, N, 1]): weight for each token.
+
+    Return:
+        out_dict (dict): dict for output token information
+    """
+
+    x = token_dict['x']
+    idx_token = token_dict['idx_token']
+    agg_weight = token_dict['agg_weight']
+
+    B, N, C = x.shape
+    if token_weight is None:
+        token_weight = x.new_ones(B, N, 1)
+
+    idx_batch = torch.arange(B, device=x.device)[:, None]
+    idx = idx_cluster + idx_batch * cluster_num
+
+    all_weight = token_weight.new_zeros(B * cluster_num, 1)
+    all_weight.index_add_(
+        dim=0, index=idx.reshape(B * N), source=token_weight.reshape(B * N, 1))
+    all_weight = all_weight + 1e-6
+    norm_weight = token_weight / all_weight[idx]
+
+    # average token features
+    x_merged = x.new_zeros(B * cluster_num, C)
+    source = x * norm_weight
+    x_merged.index_add_(
+        dim=0,
+        index=idx.reshape(B * N),
+        source=source.reshape(B * N, C).type(x.dtype))
+    x_merged = x_merged.reshape(B, cluster_num, C)
+
+    idx_token_new = index_points(idx_cluster[..., None], idx_token).squeeze(-1)
+    weight_t = index_points(norm_weight, idx_token)
+    agg_weight_new = agg_weight * weight_t
+    agg_weight_new = agg_weight_new / agg_weight_new.max(
+        dim=1, keepdim=True)[0]
+
+    out_dict = {}
+    out_dict['x'] = x_merged
+    out_dict['token_num'] = cluster_num
+    out_dict['map_size'] = token_dict['map_size']
+    out_dict['init_grid_size'] = token_dict['init_grid_size']
+    out_dict['idx_token'] = idx_token_new
+    out_dict['agg_weight'] = agg_weight_new
+    return out_dict
+
+
+class MLP(nn.Module):
+    """FFN with Depthwise Conv of TCFormer.
+
+    Args:
+        in_features (int): The feature dimension.
+        hidden_features (int, optional): The hidden dimension of FFNs.
+            Defaults: The same as in_features.
+        out_features (int, optional): The output feature dimension.
+            Defaults: The same as in_features.
+        act_layer (nn.Module, optional): The activation config for FFNs.
+            Default: nn.GELU.
+        drop (float, optional): Dropout rate. Default: 0.
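+
+    Example (illustrative):
+        >>> import torch
+        >>> mlp = MLP(in_features=64)
+        >>> x = torch.randn(2, 14 * 14, 64)
+        >>> mlp(x, 14, 14).shape
+        torch.Size([2, 196, 64])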
+ """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def init_weights(self): + """init weights.""" + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = self.fc1(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class DWConv(nn.Module): + """Depthwise Conv for regular grid-based tokens. + + Args: + dim (int): The feature dimension. + """ + + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, H, W) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + return x + + +class TCFormerRegularAttention(nn.Module): + """Spatial Reduction Attention for regular grid-based tokens. + + Args: + dim (int): The feature dimension of tokens, + num_heads (int): Parallel attention heads. + qkv_bias (bool): enable bias for qkv if True. Default: False. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after attention process. + Default: 0.0. + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention. Default: 1. + use_sr_conv (bool): If True, use a conv layer for spatial reduction. + If False, use a pooling process for spatial reduction. Defaults: + True. + """ + + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1, + use_sr_conv=True, + ): + super().__init__() + assert dim % num_heads == 0, \ + f'dim {dim} should be divided by num_heads {num_heads}.' + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + self.use_sr_conv = use_sr_conv + if sr_ratio > 1 and self.use_sr_conv: + self.sr = nn.Conv2d( + dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) 
+ elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape(B, N, self.num_heads, + C // self.num_heads).permute(0, 2, 1, 3) + + if self.sr_ratio > 1: + kv = x.permute(0, 2, 1).reshape(B, C, H, W) + if self.use_sr_conv: + kv = self.sr(kv).reshape(B, C, -1).permute(0, 2, + 1).contiguous() + kv = self.norm(kv) + else: + kv = F.avg_pool2d( + kv, kernel_size=self.sr_ratio, stride=self.sr_ratio) + kv = kv.reshape(B, C, -1).permute(0, 2, 1).contiguous() + else: + kv = x + + kv = self.kv(kv).reshape(B, -1, 2, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, + 4).contiguous() + k, v = kv[0], kv[1] + + attn = (q * self.scale) @ k.transpose(-2, -1) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class TCFormerRegularBlock(nn.Module): + """Transformer block for regular grid-based tokens. + + Args: + dim (int): The feature dimension. + num_heads (int): Parallel attention heads. + mlp_ratio (int): The expansion ratio for the FFNs. + qkv_bias (bool): enable bias for qkv if True. Default: False. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop (float): Dropout layers after attention process and in FFN. + Default: 0.0. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + drop_path (int, optional): The drop path rate of transformer block. + Default: 0.0 + act_layer (nn.Module, optional): The activation config for FFNs. + Default: nn.GELU. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention. Default: 1. + use_sr_conv (bool): If True, use a conv layer for spatial reduction. + If False, use a pooling process for spatial reduction. Defaults: + True. + """ + + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_cfg=dict(type='LN'), + sr_ratio=1, + use_sr_conv=True): + super().__init__() + self.norm1 = build_norm_layer(norm_cfg, dim)[1] + + self.attn = TCFormerRegularAttention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio, + use_sr_conv=use_sr_conv) + self.drop_path = build_dropout( + dict(type='DropPath', drop_prob=drop_path)) + + self.norm2 = build_norm_layer(norm_cfg, dim)[1] + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = MLP( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) 
+ elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class TokenConv(nn.Conv2d): + """Conv layer for dynamic tokens. + + A skip link is added between the input and output tokens to reserve detail + tokens. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + groups = kwargs['groups'] if 'groups' in kwargs.keys() else 1 + self.skip = nn.Conv1d( + in_channels=kwargs['in_channels'], + out_channels=kwargs['out_channels'], + kernel_size=1, + bias=False, + groups=groups) + + def forward(self, token_dict): + x = token_dict['x'] + x = self.skip(x.permute(0, 2, 1)).permute(0, 2, 1) + x_map = token2map(token_dict) + x_map = super().forward(x_map) + x = x + map2token(x_map, token_dict) + return x + + +class TCMLP(nn.Module): + """FFN with Depthwise Conv for dynamic tokens. + + Args: + in_features (int): The feature dimension. + hidden_features (int, optional): The hidden dimension of FFNs. + Defaults: The same as in_features. + out_features (int, optional): The output feature dimension. + Defaults: The same as in_features. + act_layer (nn.Module, optional): The activation config for FFNs. + Default: nn.GELU. + drop (float, optional): drop out rate. Default: 0. + """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = TokenConv( + in_channels=hidden_features, + out_channels=hidden_features, + kernel_size=3, + padding=1, + stride=1, + bias=True, + groups=hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def init_weights(self): + """init weights.""" + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, token_dict): + token_dict['x'] = self.fc1(token_dict['x']) + x = self.dwconv(token_dict) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class TCFormerDynamicAttention(TCFormerRegularAttention): + """Spatial Reduction Attention for dynamic tokens.""" + + def forward(self, q_dict, kv_dict): + """Attention process for dynamic tokens. + Dynamic tokens are represented by a dict with the following keys: + x (torch.Tensor[B, N, C]): token features. + token_num(int): token number. + map_size(list[int] or tuple[int]): feature map resolution in + format [H, W]. + init_grid_size(list[int] or tuple[int]): initial grid resolution + in format [H_init, W_init]. + idx_token(torch.LongTensor[B, N_init]): indicates which token + the initial grid belongs to. + agg_weight(torch.LongTensor[B, N_init] or None): weight for + aggregation. 
Indicates the weight of each token in its + cluster. If set to None, uniform weight is used. + + Note: + B: batch size + N: token number + C: channel number + Ns: sampled point number + [H_init, W_init]: shape of initial grid + [H, W]: shape of feature map + N_init: numbers of initial token + + Args: + q_dict (dict): dict for query token information + kv_dict (dict): dict for key and value token information + + Return: + x (torch.Tensor[B, N, C]): output token features. + """ + + q = q_dict['x'] + kv = kv_dict['x'] + B, Nq, C = q.shape + Nkv = kv.shape[1] + conf_kv = kv_dict['token_score'] if 'token_score' in kv_dict.keys( + ) else kv.new_zeros(B, Nkv, 1) + + q = self.q(q).reshape(B, Nq, self.num_heads, + C // self.num_heads).permute(0, 2, 1, + 3).contiguous() + + if self.sr_ratio > 1: + tmp = torch.cat([kv, conf_kv], dim=-1) + tmp_dict = kv_dict.copy() + tmp_dict['x'] = tmp + tmp_dict['map_size'] = q_dict['map_size'] + tmp = token2map(tmp_dict) + + kv = tmp[:, :C] + conf_kv = tmp[:, C:] + + if self.use_sr_conv: + kv = self.sr(kv) + _, _, h, w = kv.shape + kv = kv.reshape(B, C, -1).permute(0, 2, 1).contiguous() + kv = self.norm(kv) + else: + kv = F.avg_pool2d( + kv, kernel_size=self.sr_ratio, stride=self.sr_ratio) + kv = kv.reshape(B, C, -1).permute(0, 2, 1).contiguous() + + conf_kv = F.avg_pool2d( + conf_kv, kernel_size=self.sr_ratio, stride=self.sr_ratio) + conf_kv = conf_kv.reshape(B, 1, -1).permute(0, 2, 1).contiguous() + + kv = self.kv(kv).reshape(B, -1, 2, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, + 4).contiguous() + k, v = kv[0], kv[1] + + attn = (q * self.scale) @ k.transpose(-2, -1) + + conf_kv = conf_kv.squeeze(-1)[:, None, None, :] + attn = attn + conf_kv + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, Nq, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +# Transformer block for dynamic tokens +class TCFormerDynamicBlock(TCFormerRegularBlock): + """Transformer block for dynamic tokens. + + Args: + dim (int): The feature dimension. + num_heads (int): Parallel attention heads. + mlp_ratio (int): The expansion ratio for the FFNs. + qkv_bias (bool): enable bias for qkv if True. Default: False. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop (float): Dropout layers after attention process and in FFN. + Default: 0.0. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + drop_path (int, optional): The drop path rate of transformer block. + Default: 0.0 + act_layer (nn.Module, optional): The activation config for FFNs. + Default: nn.GELU. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention. Default: 1. + use_sr_conv (bool): If True, use a conv layer for spatial reduction. + If False, use a pooling process for spatial reduction. Defaults: + True. 
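+
+    Note:
+        Unlike `TCFormerRegularBlock`, attention and FFN here operate on
+        dynamic token dicts: the block accepts either a single token dict
+        (used as both query and key/value) or a pair of dicts
+        (q_dict, kv_dict); see `forward` for details.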
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_cfg=dict(type='LN'),
+                 sr_ratio=1,
+                 use_sr_conv=True):
+        super(TCFormerRegularBlock, self).__init__()
+        self.norm1 = build_norm_layer(norm_cfg, dim)[1]
+
+        self.attn = TCFormerDynamicAttention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            sr_ratio=sr_ratio,
+            use_sr_conv=use_sr_conv)
+        self.drop_path = build_dropout(
+            dict(type='DropPath', drop_prob=drop_path))
+
+        self.norm2 = build_norm_layer(norm_cfg, dim)[1]
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = TCMLP(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop)
+
+    def forward(self, inputs):
+        """Forward function.
+
+        Args:
+            inputs (dict or tuple[dict] or list[dict]): input dynamic
+                token information. If a single dict is provided, it is
+                regarded as both the query and the key/value. If a tuple
+                or list of dicts is provided, the first one is regarded
+                as the query and the second one as the key and value.
+
+        Return:
+            q_dict (dict): dict for output token information
+        """
+        if isinstance(inputs, tuple) or isinstance(inputs, list):
+            q_dict, kv_dict = inputs
+        else:
+            q_dict, kv_dict = inputs, None
+
+        x = q_dict['x']
+        # norm1
+        q_dict['x'] = self.norm1(q_dict['x'])
+        if kv_dict is None:
+            kv_dict = q_dict
+        else:
+            kv_dict['x'] = self.norm1(kv_dict['x'])
+
+        # attn
+        x = x + self.drop_path(self.attn(q_dict, kv_dict))
+
+        # mlp
+        q_dict['x'] = self.norm2(x)
+        x = x + self.drop_path(self.mlp(q_dict))
+        q_dict['x'] = x
+
+        return q_dict
diff --git a/main/transformer_utils/mmpose/models/utils/transformer.py b/main/transformer_utils/mmpose/models/utils/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..42205707347e57c433e66eda728cc82a6df7455a
--- /dev/null
+++ b/main/transformer_utils/mmpose/models/utils/transformer.py
@@ -0,0 +1,1138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner.base_module import BaseModule
+from mmcv.utils import to_2tuple
+from mmpose.models.builder import TRANSFORMER
+
+from easydict import EasyDict
+from einops import rearrange, repeat
+from mmcv.runner import force_fp32
+from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
+                                         TransformerLayerSequence,
+                                         build_transformer_layer_sequence)
+from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER,
+                                      TRANSFORMER_LAYER_SEQUENCE)
+import torch.distributions as distributions
+from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention
+from torch.nn.init import normal_
+import copy
+import warnings
+from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init
+
+from utils.human_models import smpl_x
+
+from config import cfg
+
+
+def point_sample(input, point_coords, **kwargs):
+    """
+    A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D point_coords tensors.
+    Unlike :function:`torch.nn.functional.grid_sample` it assumes `point_coords` to lie inside the
+    [0, 1] x [0, 1] square.
+    Args:
+        input (Tensor): A tensor of shape (N, C, H, W) that contains a feature map on an H x W grid.
+        point_coords (Tensor): A tensor of shape (N, P, 2) or (N, Hgrid, Wgrid, 2) that contains
+        [0, 1] x [0, 1] normalized point coordinates.
+    Returns:
+        output (Tensor): A tensor of shape (N, C, P) or (N, C, Hgrid, Wgrid) that contains
+            features for points in `point_coords`. The features are obtained via bilinear
+            interpolation from `input` the same way as :function:`torch.nn.functional.grid_sample`.
+    """
+    add_dim = False
+    if point_coords.dim() == 3:
+        add_dim = True
+        point_coords = point_coords.unsqueeze(2)
+    output = F.grid_sample(input, 2.0 * point_coords - 1.0, **kwargs)
+    if add_dim:
+        output = output.squeeze(3)
+    return output
+
+
+def nlc_to_nchw(x, hw_shape):
+    """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor of shape [N, L, C] before conversion.
+        hw_shape (Sequence[int]): The height and width of output feature map.
+
+    Returns:
+        Tensor: The output tensor of shape [N, C, H, W] after conversion.
+    """
+    H, W = hw_shape
+    assert len(x.shape) == 3
+    B, L, C = x.shape
+    assert L == H * W, 'The seq_len does not match H, W'
+    return x.transpose(1, 2).reshape(B, C, H, W).contiguous()
+
+
+def nchw_to_nlc(x):
+    """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
+
+    Returns:
+        Tensor: The output tensor of shape [N, L, C] after conversion.
+    """
+    assert len(x.shape) == 4
+    return x.flatten(2).transpose(1, 2).contiguous()
+
+
+class AdaptivePadding(nn.Module):
+    """Applies padding to the input (if needed) so that the input can get
+    fully covered by the filter you specified. It supports two modes, "same"
+    and "corner". The "same" mode is the same as the "SAME" padding mode in
+    TensorFlow and pads zeros around the input. The "corner" mode pads zeros
+    to the bottom right.
+
+    Args:
+        kernel_size (int | tuple): Size of the kernel. Default: 1.
+        stride (int | tuple): Stride of the filter. Default: 1.
+        dilation (int | tuple): Spacing between kernel elements.
+            Default: 1.
+        padding (str): Support "same" and "corner", "corner" mode
+            would pad zero to bottom right, and "same" mode would
+            pad zero around input. Default: "corner".
+ Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + + super(AdaptivePadding, self).__init__() + + assert padding in ('same', 'corner') + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + padding = to_2tuple(padding) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ]) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The config dict for embedding + conv layer type selection. Default: "Conv2d. + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: None (Would be set as `kernel_size`). + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only work when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. 
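+
+    Example (illustrative):
+        >>> import torch
+        >>> patch_embed = PatchEmbed(in_channels=3, embed_dims=768,
+        ...                          kernel_size=16, stride=16)
+        >>> x = torch.rand(1, 3, 224, 224)
+        >>> tokens, out_size = patch_embed(x)
+        >>> tokens.shape, out_size
+        (torch.Size([1, 196, 768]), (14, 14))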
+ """ + + def __init__( + self, + in_channels=3, + embed_dims=768, + conv_type='Conv2d', + kernel_size=16, + stride=16, + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None, + ): + super(PatchEmbed, self).__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adap_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adap_padding = None + padding = to_2tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_2tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adap_padding: + pad_h, pad_w = self.adap_padding.get_pad_shape(input_size) + input_h, input_w = input_size + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + h_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + w_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + self.init_out_size = (h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_h, out_w). + """ + + if self.adap_padding: + x = self.adap_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map. Our implementation uses `nn.Unfold` to + merge patch, which is about 25% faster than original implementation. + Instead, we need to modify pretrained models for compatibility. + + Args: + in_channels (int): The num of input channels. + to gets fully covered by filter and stride you specified.. + Default: True. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int | tuple, optional): dilation parameter in the unfold + layer. Default: 1. + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. 
+ norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=2, + stride=None, + padding='corner', + dilation=1, + bias=False, + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if stride: + stride = stride + else: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adap_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of unfold + padding = 0 + else: + self.adap_padding = None + + padding = to_2tuple(padding) + self.sampler = nn.Unfold( + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + stride=stride) + + sample_dim = kernel_size[0] * kernel_size[1] * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x, input_size): + """ + Args: + x (Tensor): Has shape (B, H*W, C_in). + input_size (tuple[int]): The spatial shape of x, arrange as (H, W). + Default: None. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) + - out_size (tuple[int]): Spatial shape of x, arrange as + (Merged_H, Merged_W). + """ + B, L, C = x.shape + assert isinstance(input_size, Sequence), f'Expect ' \ + f'input_size is ' \ + f'`Sequence` ' \ + f'but get {input_size}' + + H, W = input_size + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + + if self.adap_padding: + x = self.adap_padding(x) + H, W = x.shape[-2:] + + x = self.sampler(x) + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + + out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * + (self.sampler.kernel_size[0] - 1) - + 1) // self.sampler.stride[0] + 1 + out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * + (self.sampler.kernel_size[1] - 1) - + 1) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + x = self.norm(x) if self.norm else x + x = self.reduction(x) + return x, output_size + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. 
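+
+    Example (illustrative):
+        >>> x = torch.tensor([0.1, 0.5, 0.9])
+        >>> torch.sigmoid(inverse_sigmoid(x))
+        tensor([0.1000, 0.5000, 0.9000])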
+ """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerEncoder_zero_layer(): + def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs): + pass + + def __call__(self, + query, + key, + value, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + query = query + query_pos + return query + + +@TRANSFORMER_LAYER.register_module() +class DetrTransformerDecoderLayer_grouped(BaseTransformerLayer): + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + num_joints=17, + **kwargs): + super(DetrTransformerDecoderLayer_grouped, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + # assert len(operation_order) == 6 + # assert set(operation_order) == set( + # ['self_attn', 'norm', 'cross_attn', 'ffn']) + self.num_joints = num_joints + # self.num_joints = len(smpl_x.pos_joint_part['rhand']) + # self.num_joints = len(smpl_x.pos_joint_part['body']) + len(smpl_x.pos_joint_part['rhand']) + len(smpl_x.pos_joint_part['lhand']) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + # print(query.shape) + assert query.size(0) % self.num_joints == 0, f'query.shape: {query.shape}, num_joints: {self.num_joints}' + num_group = query.size(0) // self.num_joints + bs = query.size(1) + + temp_query = rearrange(query, '(g k) b c -> k (g b) c', + g=num_group, k=self.num_joints) + temp_identity = rearrange(identity, '(g k) b c -> k (g b) c', + g=num_group, k=self.num_joints) + temp_query_pos = rearrange(query_pos, '(g k) b c -> k (g b) c', + g=num_group, k=self.num_joints) + + temp_key = temp_value = temp_query + query = self.attentions[attn_index]( + temp_query, + temp_key, + temp_value, + temp_identity if self.pre_norm else None, + query_pos=temp_query_pos, + key_pos=temp_query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + + query = rearrange(query, 'k (g b) c -> (g k) b c', + g=num_group, b=bs) + + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + 
elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + if 'cross_attn' not in self.operation_order: + query = query + value.sum() * 0 + + return query + + +@TRANSFORMER_LAYER_SEQUENCE.register_module(force=True) +class DeformableDetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + + super(DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + def forward(self, + query, + *args, + reference_points=None, + valid_ratios=None, + reg_branches=None, + fc_coord=None, + **kwargs): + output = query + intermediate = [] + intermediate_reference_points = [] + + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = reference_points[:, :, None] * \ + torch.cat([valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 3 + # print(reference_points.shape, valid_ratios.shape) # [48,65,3], [48,4,3] + reference_points_input = reference_points[:, :, None, :2] * \ + valid_ratios[:, None] + # assert reference_points.shape[-1] == 2 + # reference_points_input = reference_points[:, :, None] * \ + # valid_ratios[:, None] + # print(output.shape, reference_points_input.shape) + output = layer( + output, + *args, + reference_points=reference_points_input, + **kwargs) + output = output.permute(1, 0, 2) + + # if reg_branches is not None: + # tmp = reg_branches[lid](output) + # + # if fc_coord is not None: + # tmp = fc_coord(tmp) + # + # if reference_points.shape[-1] == 4: + # new_reference_points = tmp + inverse_sigmoid( + # reference_points) + # new_reference_points = new_reference_points.sigmoid() + # else: + # assert reference_points.shape[-1] == 3 + # new_reference_points = tmp + # new_reference_points[..., :3] = tmp[ + # ..., :3] + inverse_sigmoid(reference_points) + # new_reference_points = new_reference_points.sigmoid() + # # else: + # # assert reference_points.shape[-1] == 2 + # # new_reference_points = tmp + # # new_reference_points[..., :2] = tmp[ + # # ..., :2] + inverse_sigmoid(reference_points) + # # new_reference_points = new_reference_points.sigmoid() + # # # reference_points = new_reference_points.detach() + # # reference_points = new_reference_points + # reference_points = new_reference_points + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +class Linear_with_norm(nn.Module): + def __init__(self, in_channel, out_channel, bias=True, norm=True): + super(Linear_with_norm, self).__init__() + self.bias = bias + self.norm = norm + self.linear = nn.Linear(in_channel, out_channel, bias) + nn.init.xavier_uniform_(self.linear.weight, gain=0.01) + + def forward(self, x): + y = x.matmul(self.linear.weight.t()) + + if self.norm: + x_norm = torch.norm(x, dim=1, keepdim=True) + y = y / x_norm + + if self.bias: + y = y + self.linear.bias + return y + + +@TRANSFORMER.register_module() +class Transformer(BaseModule): + """Implements the DETR transformer. 
+ Following the official DETR implementation, this module copy-paste + from torch.nn.Transformer with modifications: + * positional encodings are passed in MultiheadAttention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers + See `paper: End-to-End Object Detection with Transformers + `_ for details. + Args: + encoder (`mmcv.ConfigDict` | Dict): Config of + TransformerEncoder. Defaults to None. + decoder ((`mmcv.ConfigDict` | Dict)): Config of + TransformerDecoder. Defaults to None + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Defaults to None. + """ + + def __init__(self, encoder=None, decoder=None, init_cfg=None): + super(Transformer, self).__init__(init_cfg=init_cfg) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + # self.embed_dims = self.encoder.embed_dims + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + self._is_init = True + + def forward(self, x, mask, query_embed, pos_embed): + """Forward function for `Transformer`. + Args: + x (Tensor): Input query with shape [bs, c, h, w] where + c = embed_dims. + mask (Tensor): The key_padding_mask used for encoder and decoder, + with shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, with shape + [num_query, c]. + pos_embed (Tensor): The positional encoding for encoder and + decoder, with the same shape as `x`. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - out_dec: Output from decoder. If return_intermediate_dec \ + is True output has shape [num_dec_layers, bs, + num_query, embed_dims], else has shape [1, bs, \ + num_query, embed_dims]. + - memory: Output results from encoder, with shape \ + [bs, embed_dims, h, w]. 
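+        Example:
+            A rough shape walk-through; the concrete sizes are illustrative
+            assumptions only (c = embed_dims = 256, an 8x6 feature map):
+
+            - x:           [bs, 256, 8, 6]  -> flattened to [48, bs, 256]
+            - pos_embed:   [bs, 256, 8, 6]  -> flattened to [48, bs, 256]
+            - query_embed: [num_query, 256] -> repeated to [num_query, bs, 256]
+            - mask:        [bs, 8, 6]       -> flattened to [bs, 48]
+            - out_dec:     [num_dec_layers, bs, num_query, 256]
+            - memory:      [bs, 256, 8, 6]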
+ """ + bs, c, h, w = x.shape + # use `view` instead of `flatten` for dynamically exporting to ONNX + x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c] + pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat( + 1, bs, 1) # [num_query, dim] -> [num_query, bs, dim] + mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w] + memory = self.encoder( + query=x, + key=None, + value=None, + query_pos=pos_embed, + query_key_padding_mask=mask) + target = torch.zeros_like(query_embed) + # out_dec: [num_layers, num_query, bs, dim] + out_dec = self.decoder( + query=target, + key=memory, + value=memory, + key_pos=pos_embed, + query_pos=query_embed, + key_padding_mask=mask) + out_dec = out_dec.transpose(1, 2) + memory = memory.permute(1, 2, 0).reshape(bs, c, h, w) + return out_dec, memory + + +@TRANSFORMER.register_module() +class PoseurTransformer_v3(Transformer): + """ add noise training """ + + def __init__(self, + as_two_stage=False, + num_feature_levels=4, + two_stage_num_proposals=300, + num_joints=17, + use_soft_argmax=False, + use_soft_argmax_def=False, + proposal_feature='backbone_s', # or encoder_memory + image_size=[192, 256], + init_q_sigmoid=False, + soft_arg_stride=4, + add_feat_2_query=False, + query_pose_emb=True, + num_noise_sample=3, + num_noise_point=4, + noise_sigma=0.2, + embed_dims=256, + **kwargs): + super(PoseurTransformer_v3, self).__init__(**kwargs) + assert query_pose_emb == True + # self.num_noise_sample = num_noise_sample + self.num_noise_sample = num_noise_sample + self.num_noise_point = num_noise_point + self.noise_sigma = noise_sigma + self.add_feat_2_query = add_feat_2_query + self.as_two_stage = as_two_stage + self.num_feature_levels = num_feature_levels + self.two_stage_num_proposals = two_stage_num_proposals + try: + self.embed_dims = self.encoder.embed_dims + except: + self.embed_dims = embed_dims + self.num_joints = num_joints + # self.num_joints = 17 + # self.num_joints = len(smpl_x.pos_joint_part['rhand']) # body_joints+bboxes + # self.num_joints = len(smpl_x.pos_joint_part['body']) + len(smpl_x.pos_joint_part['rhand']) + len(smpl_x.pos_joint_part['lhand']) + self.use_soft_argmax = use_soft_argmax + self.use_soft_argmax_def = use_soft_argmax_def + assert not (self.use_soft_argmax & self.use_soft_argmax_def) + self.init_q_sigmoid = init_q_sigmoid + self.image_size = image_size + self.soft_arg_stride = soft_arg_stride + self.proposal_feature = proposal_feature + self.query_pose_emb = query_pose_emb + self.prior = distributions.MultivariateNormal(torch.zeros(2), torch.eye(2) * self.noise_sigma) + self.init_layers() + + def init_layers(self): + """Initialize layers of the DeformableDetrTransformer.""" + self.level_embeds = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + + if self.as_two_stage: + self.avg_pool = nn.AdaptiveAvgPool2d(1) + # self.fc_sigma = Linear_with_norm(self.embed_dims, self.num_joints * 2, norm=False) + self.fc_sigma = Linear_with_norm(self.embed_dims, self.num_joints * 3, norm=False) + if self.use_soft_argmax: + self.soft_argmax_coord = Heatmap1DHead(in_channels=self.embed_dims, expand_ratio=2, hidden_dims=(512,), + image_size=self.image_size, stride=self.soft_arg_stride) + self.fc_layers = [self.fc_sigma] + elif self.use_soft_argmax_def: + self.soft_argmax_coord = Heatmap2DHead(in_channels=self.embed_dims, + image_size=self.image_size, stride=self.soft_arg_stride) + self.fc_layers = [self.fc_sigma] + else: + # self.fc_coord = Linear_with_norm(self.embed_dims, 
self.num_joints * 2) + self.fc_coord = Linear_with_norm(self.embed_dims, self.num_joints * 3) + self.fc_layers = [self.fc_coord, self.fc_sigma] + + if self.query_pose_emb: + self.pos_trans = nn.Linear(self.embed_dims * 2, + self.embed_dims) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims) + # self.pos_embed = nn.Embedding(17,self.embed_dims) + self.pos_embed = nn.Embedding(self.num_joints, self.embed_dims) + else: + self.pos_trans = nn.Linear(self.embed_dims * 2, + self.embed_dims * 2) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) + else: + self.reference_points = nn.Linear(self.embed_dims, 2) + self.fp16_enabled = False + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + if not self.as_two_stage: + xavier_init(self.reference_points, distribution='uniform', bias=0.) + normal_(self.level_embeds) + if self.use_soft_argmax: + self.soft_argmax_coord.init_weights() + + if self.as_two_stage: + for m in self.fc_layers: + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight, gain=0.01) + + def gen_encoder_output_proposals(self, memory, memory_padding_mask, + spatial_shapes): + """Generate proposals from encoded memory. + Args: + memory (Tensor) : The output of encoder, + has shape (bs, num_key, embed_dim). num_key is + equal the number of points on feature map from + all level. + memory_padding_mask (Tensor): Padding mask for memory. + has shape (bs, num_key). + spatial_shapes (Tensor): The shape of all feature maps. + has shape (num_level, 2). + Returns: + tuple: A tuple of feature map and bbox prediction. + - output_memory (Tensor): The input of decoder, \ + has shape (bs, num_key, embed_dim). num_key is \ + equal the number of points on feature map from \ + all levels. + - output_proposals (Tensor): The normalized proposal \ + after a inverse sigmoid, has shape \ + (bs, num_keys, 4). 
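+            Note:
+                In this keypoint variant the (w, h) term is computed but not
+                concatenated, so `output_proposals` holds 2-D grid centres of
+                shape (bs, num_keys, 2) rather than 4-D boxes.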
+ """ + + N, S, C = memory.shape + proposals = [] + _cur = 0 + for lvl, (H, W) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].view( + N, H, W, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid( + torch.linspace( + 0, H - 1, H, dtype=torch.float32, device=memory.device), + torch.linspace( + 0, W - 1, W, dtype=torch.float32, device=memory.device)) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_W.unsqueeze(-1), + valid_H.unsqueeze(-1)], 1).view(N, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0 ** lvl) + # proposal = torch.cat((grid, wh), -1).view(N, -1, 4) + proposal = grid.view(N, -1, 2) + proposals.append(proposal) + _cur += (H * W) + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & + (output_proposals < 0.99)).all( + -1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill( + memory_padding_mask.unsqueeze(-1), float('inf')) + output_proposals = output_proposals.masked_fill( + ~output_proposals_valid, float('inf')) + + output_memory = memory + output_memory = output_memory.masked_fill( + memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, + float(0)) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + return output_memory, output_proposals + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """Get the reference points used in decoder. + Args: + spatial_shapes (Tensor): The shape of all + feature maps, has shape (num_level, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + device (obj:`device`): The device where + reference_points should be. + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). 
+ """ + # print(spatial_shapes) + reference_points_list = [] + for lvl, (H, W) in enumerate(spatial_shapes): + # TODO check this 0.5 + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=torch.float32, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 1] * H) + ref_x = ref_x.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 0] * W) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + # print(reference_points_list[-1]) # range:(0,1) + # print(H, W) [8,6] + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def get_valid_ratio(self, mask): + """Get the valid radios of feature maps of all level.""" + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def get_proposal_pos_embed(self, + proposals, + num_pos_feats=128, + temperature=10000): + """Get the position embedding of proposal.""" + num_pos_feats = self.embed_dims // 3 + 1 + scale = 2 * math.pi + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + # N, L, 2 + if self.init_q_sigmoid: + proposals = proposals.sigmoid() * scale + else: + proposals = proposals * scale + + # N, L, 3, 86 + pos = proposals[:, :, :, None] / dim_t + # N, L, 3, 43, 2 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) + return pos[:, :, :self.embed_dims] + + @force_fp32(apply_to=('mlvl_feats', 'query_embed', 'mlvl_pos_embeds')) + def forward(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + reg_branches=None, + fc_coord=None, + cls_branches=None, + coord_init=None, + query_init=None, + **kwargs): + assert self.as_two_stage or query_embed is not None + + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + # [bs, H*W, num_lvls, 2] + # print(spatial_shape) + reference_points = \ + self.get_reference_points(spatial_shapes, + valid_ratios, + device=feat.device) + # print(reference_points.shape, valid_ratios.shape) # [bs, 4080, 4, 2]; [bs, 4, 2] + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( + 1, 0, 2) # (H*W, bs, embed_dims) + + 
memory = self.encoder( + query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs) + + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + + if self.proposal_feature == 'backbone_l': + x = mlvl_feats[0] + elif self.proposal_feature == 'backbone_s': + x = mlvl_feats[-1] + point_sample_feat = mlvl_feats[-1] + elif self.proposal_feature == 'encoder_memory_l': + x = memory.permute(0, 2, 1)[:, :, :int(level_start_index[1])].view_as(mlvl_feats[0]) + point_sample_feat = memory.permute(0, 2, 1)[:, :, :int(level_start_index[1])].view_as(mlvl_feats[0]) + elif self.proposal_feature == 'encoder_memory_s': + x = memory.permute(0, 2, 1)[:, :, int(level_start_index[-1]):].view_as(mlvl_feats[-1]) + else: + raise NotImplementedError + + BATCH_SIZE = x.shape[0] + + if coord_init is not None: + pred_jts = coord_init + enc_outputs = None + else: + if self.use_soft_argmax: + out_coord = self.soft_argmax_coord(x) # bs, 17, 2 + assert out_coord.shape[2] == 2 + x = self.avg_pool(x).reshape(BATCH_SIZE, -1) + out_sigma = self.fc_sigma(x).reshape(BATCH_SIZE, self.num_joints, -1) + elif self.use_soft_argmax_def: + out_coord = self.soft_argmax_coord(x) # bs, 17, 2 + assert out_coord.shape[2] == 2 + x = self.avg_pool(x).reshape(BATCH_SIZE, -1) + out_sigma = self.fc_sigma(x).reshape(BATCH_SIZE, self.num_joints, -1) + else: + x = self.avg_pool(x).reshape(BATCH_SIZE, -1) + out_coord = self.fc_coord(x).reshape(BATCH_SIZE, self.num_joints, 3) + assert out_coord.shape[2] == 3 + out_sigma = self.fc_sigma(x).reshape(BATCH_SIZE, self.num_joints, -1) + + # (B, N, 3) + pred_jts = out_coord.reshape(BATCH_SIZE, self.num_joints, 3) + sigma = out_sigma.reshape(BATCH_SIZE, self.num_joints, -1).sigmoid() + scores = 1 - sigma + + scores = torch.mean(scores, dim=2, keepdim=True) + enc_outputs = EasyDict( + pred_jts=pred_jts, + sigma=sigma, + maxvals=scores.float(), + ) + + reference_points = pred_jts.detach() + reference_points_cliped = reference_points.clip(0, 1) + + init_reference_out = reference_points_cliped + if query_init is not None: + query = query_init + else: + pred_jts_pos_embed = self.get_proposal_pos_embed(reference_points.detach()) + reference_points_pos_embed = self.get_proposal_pos_embed(reference_points_cliped.detach()) # query init here + if self.add_feat_2_query: + query_feat = point_sample(point_sample_feat, init_reference_out, align_corners=False).permute(0, 2, 1) + reference_points_pos_embed = reference_points_pos_embed + query_feat + query_pos_emb = torch.cat([pred_jts_pos_embed, reference_points_pos_embed], dim=2) + pos_trans_out = self.pos_trans_norm(self.pos_trans(query_pos_emb)) + + query = pos_trans_out + + query_pos = self.pos_embed.weight.clone().repeat(bs, 1, 1).contiguous() + + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + query_pos=query_pos, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + fc_coord=fc_coord, + **kwargs) + inter_references_out = inter_references + return memory.permute(1, 0, 2), spatial_shapes, level_start_index, inter_states, init_reference_out, \ + 
inter_references_out, enc_outputs diff --git a/main/transformer_utils/mmpose/ops/__init__.py b/main/transformer_utils/mmpose/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d6af823310ad59c2d1e52274f8af9a0fc0f14a72 --- /dev/null +++ b/main/transformer_utils/mmpose/ops/__init__.py @@ -0,0 +1,9 @@ +from .multi_scale_deform_attn import (MultiScaleDeformableAttention_share_value, + MultiScaleDeformableAttention_bottle_neck_v, + MultiScaleDeformableAttention_post_value, + MultiScaleDeformableAttention_post_v_stirct, + ) + +__all__ = [ + 'MultiScaleDeformableAttention', +] \ No newline at end of file diff --git a/main/transformer_utils/mmpose/ops/csrc/pytorch/info.cpp b/main/transformer_utils/mmpose/ops/csrc/pytorch/info.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d252ba2edf71649c976923f9836da801fd71b48b --- /dev/null +++ b/main/transformer_utils/mmpose/ops/csrc/pytorch/info.cpp @@ -0,0 +1,55 @@ +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp +#include "pytorch_cpp_helper.hpp" + +#ifdef MMCV_WITH_CUDA +#ifndef HIP_DIFF +#include +int get_cudart_version() { return CUDART_VERSION; } +#endif +#endif + +std::string get_compiling_cuda_version() { +#ifdef MMCV_WITH_CUDA +#ifndef HIP_DIFF + std::ostringstream oss; + // copied from + // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 + auto printCudaStyleVersion = [&](int v) { + oss << (v / 1000) << "." << (v / 10 % 100); + if (v % 10 != 0) { + oss << "." << (v % 10); + } + }; + printCudaStyleVersion(get_cudart_version()); + return oss.str(); +#else + return std::string("rocm not vailable"); +#endif +#else + return std::string("not available"); +#endif +} + +// similar to +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp +std::string get_compiler_version() { + std::ostringstream ss; +#if defined(__GNUC__) +#ifndef __clang__ + { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } +#endif +#endif + +#if defined(__clang_major__) + { + ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." + << __clang_patchlevel__; + } +#endif + +#if defined(_MSC_VER) + { ss << "MSVC " << _MSC_FULL_VER; } +#endif + return ss.str(); +} \ No newline at end of file diff --git a/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn.cpp b/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1fda9aeba25b8fd87fd8b9e2d7e27d646271d7b3 --- /dev/null +++ b/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn.cpp @@ -0,0 +1,79 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from +*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include "pytorch_cpp_helper.hpp" + +#ifdef MMCV_WITH_CUDA +Tensor ms_deform_attn_cuda_forward(const Tensor &value, + const Tensor &spatial_shapes, + const Tensor &level_start_index, + const Tensor &sampling_loc, + const Tensor &attn_weight, + const int im2col_step); + +void ms_deform_attn_cuda_backward( + const Tensor &value, const Tensor &spatial_shapes, + const Tensor &level_start_index, const Tensor &sampling_loc, + const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value, + Tensor &grad_sampling_loc, Tensor &grad_attn_weight, const int im2col_step); + +#endif + +Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes, + const Tensor &level_start_index, + const Tensor &sampling_loc, + const Tensor &attn_weight, + const int im2col_step) { + if (value.type().is_cuda()) { +#ifdef MMCV_WITH_CUDA + CHECK_CUDA_INPUT(value) + CHECK_CUDA_INPUT(spatial_shapes) + CHECK_CUDA_INPUT(level_start_index) + CHECK_CUDA_INPUT(sampling_loc) + CHECK_CUDA_INPUT(attn_weight) + return ms_deform_attn_cuda_forward(value, spatial_shapes, level_start_index, + sampling_loc, attn_weight, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes, + const Tensor &level_start_index, + const Tensor &sampling_loc, + const Tensor &attn_weight, + const Tensor &grad_output, Tensor &grad_value, + Tensor &grad_sampling_loc, + Tensor &grad_attn_weight, const int im2col_step) { + if (value.type().is_cuda()) { +#ifdef MMCV_WITH_CUDA + CHECK_CUDA_INPUT(value) + CHECK_CUDA_INPUT(spatial_shapes) + CHECK_CUDA_INPUT(level_start_index) + CHECK_CUDA_INPUT(sampling_loc) + CHECK_CUDA_INPUT(attn_weight) + CHECK_CUDA_INPUT(grad_output) + CHECK_CUDA_INPUT(grad_value) + CHECK_CUDA_INPUT(grad_sampling_loc) + CHECK_CUDA_INPUT(grad_attn_weight) + ms_deform_attn_cuda_backward(value, spatial_shapes, level_start_index, + sampling_loc, attn_weight, grad_output, + grad_value, grad_sampling_loc, + grad_attn_weight, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } else { + AT_ERROR("Not implemented on the CPU"); + } +} \ No newline at end of file diff --git a/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn_cuda.cu b/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..4c2ad396cf68ae470431e2cba362506bc068cc7d --- /dev/null +++ b/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn_cuda.cu @@ -0,0 +1,360 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from +*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include +#include +#include +#include + +#include +#include +#include + +template +void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, + const int num_heads, const int channels, + const int num_levels, const int num_query, + const int num_point, scalar_t *data_col) { + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + const int num_threads = CUDA_NUM_THREADS; + ms_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_value, data_spatial_shapes, data_level_start_index, + data_sampling_loc, data_attn_weight, batch_size, spatial_size, + num_heads, channels, num_levels, num_query, num_point, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } +} + +template +void ms_deformable_col2im_cuda( + cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + const int num_threads = + (channels > CUDA_NUM_THREADS) ? 
CUDA_NUM_THREADS : channels; + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + if (channels > 1024) { + if ((channels & 1023) == 0) { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, + num_query, num_point, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + ms_deformable_col2im_gpu_kernel_gm + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + } + } else { + switch (channels) { + case 1: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 2: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 4: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 8: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 16: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 32: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 64: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 128: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, 
spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 256: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 512: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 1024: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + default: + if (channels < 64) { + ms_deformable_col2im_gpu_kernel_shm_reduce_v1 + <<>>( + num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, + num_query, num_point, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2 + <<>>( + num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, + num_query, num_point, grad_value, grad_sampling_loc, + grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } +} + +at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) { + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), + "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), + "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), + "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), + "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), + "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), + "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), + "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide 
im2col_step(%d)", + batch, im2col_step_); + + auto output = + at::zeros({batch, num_query, num_heads, channels}, value.options()); + + const int batch_n = im2col_step_; + auto output_n = output.view( + {batch / im2col_step_, batch_n, num_query, num_heads, channels}); + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + for (int n = 0; n < batch / im2col_step_; ++n) { + auto columns = output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES( + value.type(), "ms_deform_attn_forward_cuda", ([&] { + ms_deformable_im2col_cuda( + at::cuda::getCurrentCUDAStream(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), level_start_index.data(), + sampling_loc.data() + + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, + num_point, columns.data()); + })); + } + + output = output.view({batch, num_query, num_heads * channels}); + + return output; +} + +void ms_deform_attn_cuda_backward( + const at::Tensor &value, const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, const at::Tensor &grad_output, + at::Tensor &grad_value, at::Tensor &grad_sampling_loc, + at::Tensor &grad_attn_weight, const int im2col_step) { + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), + "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), + "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), + "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), + "attn_weight tensor has to be contiguous"); + AT_ASSERTM(grad_output.is_contiguous(), + "grad_output tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), + "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), + "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), + "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", + batch, im2col_step_); + + const int batch_n = im2col_step_; + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + auto grad_output_n = grad_output.view( + {batch / im2col_step_, batch_n, num_query, num_heads, channels}); + + for (int n = 0; n < batch / im2col_step_; ++n) { + auto grad_output_g = grad_output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES( + value.type(), 
"ms_deform_attn_backward_cuda", ([&] { + ms_deformable_col2im_cuda( + at::cuda::getCurrentCUDAStream(), grad_output_g.data(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), level_start_index.data(), + sampling_loc.data() + + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, + num_point, + grad_value.data() + n * im2col_step_ * per_value_size, + grad_sampling_loc.data() + + n * im2col_step_ * per_sample_loc_size, + grad_attn_weight.data() + + n * im2col_step_ * per_attn_weight_size); + })); + } +} \ No newline at end of file diff --git a/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn_cuda_kernel.cuh b/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..cbdf3820117a5cab28e2d116f3404876efa6a0bd --- /dev/null +++ b/main/transformer_utils/mmpose/ops/csrc/pytorch/ms_deform_attn_cuda_kernel.cuh @@ -0,0 +1,807 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from +*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ +#ifndef DEFORM_ATTN_CUDA_KERNEL +#define DEFORM_ATTN_CUDA_KERNEL + +#include "common_cuda_helper.hpp" +#include "pytorch_cuda_helper.hpp" + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) { + return (N + num_threads - 1) / num_threads; +} + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; 
+} + +template +__device__ void ms_deform_attn_col2im_bilinear( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, + const scalar_t &attn_weight, scalar_t *&grad_value, + scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + +template +__device__ void ms_deform_attn_col2im_bilinear_gm( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, + const scalar_t &attn_weight, scalar_t *&grad_value, + scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * 
attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + +template +__global__ void ms_deformable_im2col_gpu_kernel( + const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, const int batch_size, + const int spatial_size, const int num_heads, const int channels, + const int num_levels, const int num_query, const int num_point, + scalar_t *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = + data_value + + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, + spatial_w, num_heads, channels, + h_im, w_im, m_col, c_col) * + weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void 
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + scalar_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockSize; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( + const int n, const 
scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( + const int n, const 
scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + scalar_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + 
const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + 
*grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] 
+= + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} +#endif // DEFORM_ATTN_CUDA_KERNEL \ No newline at end of file diff --git a/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py b/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..e58ca98ebd0cef0498607270b7650a4ad1f6ec27 --- /dev/null +++ 
b/main/transformer_utils/mmpose/ops/multi_scale_deform_attn.py @@ -0,0 +1,1055 @@ +import math +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd.function import Function, once_differentiable + +from mmcv import deprecated_api_warning +from mmcv.cnn import constant_init, xavier_init +from mmcv.cnn.bricks.registry import ATTENTION +from mmcv.runner import BaseModule +from mmcv.utils import ext_loader +from mmcv.ops.multi_scale_deform_attn import ext_module + +class MultiScaleDeformableAttnFunction(Function): + + @staticmethod + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + # print(im2col_step) + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + """GPU version of backward function. + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index,\ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, + sampling_locations, attention_weights): + """CPU version of multi-scale deformable attention. + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). 
+ attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ =\ + sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], + dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( + bs * num_heads, embed_dims, H_, W_) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, + level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + bs * num_heads, 1, num_queries, num_levels * num_points) + output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * + attention_weights).sum(-1).view(bs, num_heads * embed_dims, + num_queries) + return output.transpose(1, 2).contiguous() + + +@ATTENTION.register_module() +class MultiScaleDeformableAttention_share_value(BaseModule): + """An attention module used in Deformable-Detr. `Deformable DETR: + Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + # print(im2col_step) + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + # self.im2col_step = im2col_step + self.im2col_step = im2col_step*4 + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + # self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + # xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. 
+ key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + # value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available(): + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity + +class bottle_neck(nn.Module): + def __init__(self, embed_dims, hiddem_dims): + super().__init__() + self.fc1 = nn.Linear(embed_dims, hiddem_dims) + self.fc2 = nn.Linear(hiddem_dims, hiddem_dims) + self.fc3 = nn.Linear(hiddem_dims, embed_dims) + xavier_init(self.fc1, distribution='uniform', bias=0.) + xavier_init(self.fc2, distribution='uniform', bias=0.) + xavier_init(self.fc3, distribution='uniform', bias=0.) + def forward(self, x): + res = x + x = self.fc1(x) + x = self.fc2(x) + x = self.fc3(x) + x = x + res + return x + + + +@ATTENTION.register_module() +class MultiScaleDeformableAttention_bottle_neck_v(BaseModule): + """An attention module used in Deformable-Detr. `Deformable DETR: + Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. 
+ num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = bottle_neck(embed_dims, hiddem_dims = embed_dims//16) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + # xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. 
+ identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available(): + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity + +@ATTENTION.register_module() +class MultiScaleDeformableAttention_post_value(BaseModule): + """An attention module used in Deformable-Detr. `Deformable DETR: + Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. 
+ num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + # self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + # self.value_proj = nn.Linear(embed_dims * 8, embed_dims//8,) + self.value_proj_weight = nn.Embedding(8*embed_dims, embed_dims//8) + self.value_proj_bias = nn.Embedding(8, embed_dims//8) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + # xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. 
+ Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + # value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + # value = value.view(bs, num_value, self.num_heads, -1) + value = value[:,:,None,:].repeat(1, 1, self.num_heads, 1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available(): + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + + bs, num_q, ch = output.shape + value_proj_weight = self.value_proj_weight.weight.clone() + value_proj_bias = self.value_proj_bias.weight.clone() + value_proj_weight = value_proj_weight.reshape(8,ch//8,-1) + 
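+ # Shape note (assuming the default num_heads=8 and writing embed_dims as C): at this
+ # point the sampled `output` is (bs, num_q, 8*C), because `value` was replicated per
+ # head with all C channels, and `value_proj_weight` has just been reshaped to (8, C, C//8).
+ # The einsum below therefore applies a separate C -> C//8 projection per head, the
+ # (8, C//8) bias is broadcast over batch and query, and flatten(-2) concatenates the
+ # heads back to C before `output_proj`.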
output = output.reshape(bs,num_q,8,ch//8) + output = torch.einsum('bqhc,hco->bqho', output, value_proj_weight) + output = output + value_proj_bias[None,None,:,:] + output = output.flatten(-2) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity + +def multi_scale_deformable_post_v(value, value_spatial_shapes, + sampling_locations, attention_weights, + value_proj_weight, value_proj_bias): + """CPU version of multi-scale deformable attention. + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ =\ + sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], + dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( + bs * num_heads, embed_dims, H_, W_) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, + level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + bs * num_heads, 1, num_queries, num_levels * num_points) + + value_proj_weight = value_proj_weight.weight.clone() + value_proj_weight = value_proj_weight.reshape(8,value_proj_weight.shape[0]//8,value_proj_weight.shape[1]) + value_proj_bias = value_proj_bias.weight.clone() + sampling_value_list = torch.stack(sampling_value_list, dim=-2).flatten(-2) + sampling_value_list = sampling_value_list.reshape(bs,num_heads,value_proj_weight.shape[1],num_queries,num_levels*num_points) + sampling_value_list = torch.einsum('bhcqp,hco->bhoqp', sampling_value_list, value_proj_weight) + sampling_value_list = sampling_value_list + value_proj_bias[None,:,:,None,None] + sampling_value_list = sampling_value_list.reshape(bs*num_heads,value_proj_weight.shape[1]//num_heads,num_queries,num_levels*num_points) + + output = (sampling_value_list * + attention_weights).sum(-1).view(bs, embed_dims, + num_queries) + return output.transpose(1, 2).contiguous() + +@ATTENTION.register_module() +class 
MultiScaleDeformableAttention_post_v_stirct(BaseModule): + """An attention module used in Deformable-Detr. `Deformable DETR: + Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + # self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.value_proj_weight = nn.Embedding(8*embed_dims, embed_dims//8) + self.value_proj_bias = nn.Embedding(8, embed_dims//8) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + # xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) 
+ self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + # value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value[:,:,None,:].expand(-1, -1, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + + output = multi_scale_deformable_post_v( + value, spatial_shapes, sampling_locations, + attention_weights, self.value_proj_weight, self.value_proj_bias) + + output = self.output_proj(output) + + if not self.batch_first: + # 
(num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/main/transformer_utils/mmpose/utils/__init__.py b/main/transformer_utils/mmpose/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1293ca05aab2632e0d6df29734438bc38ed79c6c --- /dev/null +++ b/main/transformer_utils/mmpose/utils/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .collect_env import collect_env +from .logger import get_root_logger +from .setup_env import setup_multi_processes +from .timer import StopWatch + +__all__ = [ + 'get_root_logger', 'collect_env', 'StopWatch', 'setup_multi_processes' +] diff --git a/main/transformer_utils/mmpose/utils/collect_env.py b/main/transformer_utils/mmpose/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..f75c5ea73383ccef367632cf497227498ac50078 --- /dev/null +++ b/main/transformer_utils/mmpose/utils/collect_env.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import collect_env as collect_basic_env +from mmcv.utils import get_git_hash + +import mmpose + + +def collect_env(): + env_info = collect_basic_env() + env_info['MMPose'] = (mmpose.__version__ + '+' + get_git_hash(digits=7)) + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/main/transformer_utils/mmpose/utils/hooks.py b/main/transformer_utils/mmpose/utils/hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..b68940f2b7a8a618916ea5aab331e3ce45ba98e7 --- /dev/null +++ b/main/transformer_utils/mmpose/utils/hooks.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools + + +class OutputHook: + + def __init__(self, module, outputs=None, as_tensor=False): + self.outputs = outputs + self.as_tensor = as_tensor + self.layer_outputs = {} + self.register(module) + + def register(self, module): + + def hook_wrapper(name): + + def hook(model, input, output): + if self.as_tensor: + self.layer_outputs[name] = output + else: + if isinstance(output, list): + self.layer_outputs[name] = [ + out.detach().cpu().numpy() for out in output + ] + else: + self.layer_outputs[name] = output.detach().cpu().numpy( + ) + + return hook + + self.handles = [] + if isinstance(self.outputs, (list, tuple)): + for name in self.outputs: + try: + layer = rgetattr(module, name) + h = layer.register_forward_hook(hook_wrapper(name)) + except ModuleNotFoundError as module_not_found: + raise ModuleNotFoundError( + f'Module {name} not found') from module_not_found + self.handles.append(h) + + def remove(self): + for h in self.handles: + h.remove() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.remove() + + +# using wonder's beautiful simplification: +# https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects +def rgetattr(obj, attr, *args): + + def _getattr(obj, attr): + return getattr(obj, attr, *args) + + return functools.reduce(_getattr, [obj] + attr.split('.')) diff --git a/main/transformer_utils/mmpose/utils/logger.py b/main/transformer_utils/mmpose/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..294837fa6aec1e1896de8c8accf470f366f81296 --- /dev/null +++ b/main/transformer_utils/mmpose/utils/logger.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import logging + +from mmcv.utils import get_logger + + +def get_root_logger(log_file=None, log_level=logging.INFO): + """Use `get_logger` method in mmcv to get the root logger. + + The logger will be initialized if it has not been initialized. By default a + StreamHandler will be added. If `log_file` is specified, a FileHandler will + also be added. The name of the root logger is the top-level package name, + e.g., "mmpose". + + Args: + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the root logger. + log_level (int): The root logger level. Note that only the process of + rank 0 is affected, while other processes will set the level to + "Error" and be silent most of the time. + + Returns: + logging.Logger: The root logger. + """ + return get_logger(__name__.split('.')[0], log_file, log_level) diff --git a/main/transformer_utils/mmpose/utils/setup_env.py b/main/transformer_utils/mmpose/utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..21def2f0809153a5f755af2431f7e702db625e5c --- /dev/null +++ b/main/transformer_utils/mmpose/utils/setup_env.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import platform +import warnings + +import cv2 +import torch.multiprocessing as mp + + +def setup_multi_processes(cfg): + """Setup multi-processing environment variables.""" + # set multi-process start method as `fork` to speed up the training + if platform.system() != 'Windows': + mp_start_method = cfg.get('mp_start_method', 'fork') + current_method = mp.get_start_method(allow_none=True) + if current_method is not None and current_method != mp_start_method: + warnings.warn( + f'Multi-processing start method `{mp_start_method}` is ' + f'different from the previous setting `{current_method}`.' + f'It will be force set to `{mp_start_method}`. You can change ' + f'this behavior by changing `mp_start_method` in your config.') + mp.set_start_method(mp_start_method, force=True) + + # disable opencv multithreading to avoid system being overloaded + opencv_num_threads = cfg.get('opencv_num_threads', 0) + cv2.setNumThreads(opencv_num_threads) + + # setup OMP threads + # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa + if 'OMP_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + omp_num_threads = 1 + warnings.warn( + f'Setting OMP_NUM_THREADS environment variable for each process ' + f'to be {omp_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + + # setup MKL threads + if 'MKL_NUM_THREADS' not in os.environ and cfg.data.workers_per_gpu > 1: + mkl_num_threads = 1 + warnings.warn( + f'Setting MKL_NUM_THREADS environment variable for each process ' + f'to be {mkl_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) diff --git a/main/transformer_utils/mmpose/utils/timer.py b/main/transformer_utils/mmpose/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..5a3185c5e89ce73bd33591c22ce74fc73ef8e770 --- /dev/null +++ b/main/transformer_utils/mmpose/utils/timer.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from collections import defaultdict +from contextlib import contextmanager +from functools import partial + +import numpy as np +from mmcv import Timer + + +class RunningAverage(): + r"""A helper class to calculate running average in a sliding window. + + Args: + window (int): The size of the sliding window. + """ + + def __init__(self, window: int = 1): + self.window = window + self._data = [] + + def update(self, value): + """Update a new data sample.""" + self._data.append(value) + self._data = self._data[-self.window:] + + def average(self): + """Get the average value of current window.""" + return np.mean(self._data) + + +class StopWatch: + r"""A helper class to measure FPS and detailed time consuming of each phase + in a video processing loop or similar scenarios. + + Args: + window (int): The sliding window size to calculate the running average + of the time consuming. + + Example: + >>> from mmpose.utils import StopWatch + >>> import time + >>> stop_watch = StopWatch(window=10) + >>> with stop_watch.timeit('total'): + >>> time.sleep(0.1) + >>> # 'timeit' support nested use + >>> with stop_watch.timeit('phase1'): + >>> time.sleep(0.1) + >>> with stop_watch.timeit('phase2'): + >>> time.sleep(0.2) + >>> time.sleep(0.2) + >>> report = stop_watch.report() + """ + + def __init__(self, window=1): + self.window = window + self._record = defaultdict(partial(RunningAverage, window=self.window)) + self._timer_stack = [] + + @contextmanager + def timeit(self, timer_name='_FPS_'): + """Timing a code snippet with an assigned name. + + Args: + timer_name (str): The unique name of the interested code snippet to + handle multiple timers and generate reports. Note that '_FPS_' + is a special key that the measurement will be in `fps` instead + of `millisecond`. Also see `report` and `report_strings`. + Default: '_FPS_'. + Note: + This function should always be used in a `with` statement, as shown + in the example. + """ + self._timer_stack.append((timer_name, Timer())) + try: + yield + finally: + timer_name, timer = self._timer_stack.pop() + self._record[timer_name].update(timer.since_start()) + + def report(self, key=None): + """Report timing information. + + Returns: + dict: The key is the timer name and the value is the \ + corresponding average time consuming. + """ + result = { + name: r.average() * 1000. + for name, r in self._record.items() + } + + if '_FPS_' in result: + result['_FPS_'] = 1000. / result.pop('_FPS_') + + if key is None: + return result + return result[key] + + def report_strings(self): + """Report timing information in texture strings. + + Returns: + list(str): Each element is the information string of a timed \ + event, in format of '{timer_name}: {time_in_ms}'. \ + Specially, if timer_name is '_FPS_', the result will \ + be converted to fps. + """ + result = self.report() + strings = [] + if '_FPS_' in result: + strings.append(f'FPS: {result["_FPS_"]:>5.1f}') + strings += [f'{name}: {val:>3.0f}' for name, val in result.items()] + return strings + + def reset(self): + self._record = defaultdict(list) + self._active_timer_stack = [] diff --git a/main/transformer_utils/mmpose/version.py b/main/transformer_utils/mmpose/version.py new file mode 100644 index 0000000000000000000000000000000000000000..8c8908940628018788a5adc23afafbe7e352b3a2 --- /dev/null +++ b/main/transformer_utils/mmpose/version.py @@ -0,0 +1,19 @@ +# Copyright (c) Open-MMLab. All rights reserved. 
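+# For reference, parse_version_info (defined below) maps '0.28.0' to (0, 28, 0)
+# and a release-candidate string such as '1.0rc1' to (1, 0, 'rc1').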
+ +__version__ = '0.28.0' +short_version = __version__ + + +def parse_version_info(version_str): + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/main/transformer_utils/mmpose_README.md b/main/transformer_utils/mmpose_README.md new file mode 100644 index 0000000000000000000000000000000000000000..f2c4a4f91bed45023806a0b03d0ba4f60ccb8354 --- /dev/null +++ b/main/transformer_utils/mmpose_README.md @@ -0,0 +1,289 @@ +
+OpenMMLab website (HOT) | OpenMMLab platform (TRY IT OUT)
+ +[![Documentation](https://readthedocs.org/projects/mmpose/badge/?version=latest)](https://mmpose.readthedocs.io/en/latest/?badge=latest) +[![actions](https://github.com/open-mmlab/mmpose/workflows/build/badge.svg)](https://github.com/open-mmlab/mmpose/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmpose/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmpose) +[![PyPI](https://img.shields.io/pypi/v/mmpose)](https://pypi.org/project/mmpose/) +[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmpose.svg)](https://github.com/open-mmlab/mmpose/blob/master/LICENSE) +[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmpose.svg)](https://github.com/open-mmlab/mmpose/issues) +[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmpose.svg)](https://github.com/open-mmlab/mmpose/issues) + +[📘Documentation](https://mmpose.readthedocs.io/en/v0.28.0/) | +[🛠️Installation](https://mmpose.readthedocs.io/en/v0.28.0/install.html) | +[👀Model Zoo](https://mmpose.readthedocs.io/en/v0.28.0/modelzoo.html) | +[📜Papers](https://mmpose.readthedocs.io/en/v0.28.0/papers/algorithms.html) | +[🆕Update News](https://mmpose.readthedocs.io/en/v0.28.0/changelog.html) | +[🤔Reporting Issues](https://github.com/open-mmlab/mmpose/issues/new/choose) + +
+ +
+ +English | [简体中文](README_CN.md) + +
+ +## Introduction + +MMPose is an open-source toolbox for pose estimation based on PyTorch. +It is a part of the [OpenMMLab project](https://github.com/open-mmlab). + +The master branch works with **PyTorch 1.5+**. + +https://user-images.githubusercontent.com/15977946/124654387-0fd3c500-ded1-11eb-84f6-24eeddbf4d91.mp4 + +
+Major Features
+
+- **Support for diverse tasks**
+
+  We support a wide spectrum of mainstream pose analysis tasks in the current research community, including 2D multi-person human pose estimation, 2D hand pose estimation, 2D face landmark detection, 133-keypoint whole-body human pose estimation, 3D human mesh recovery, fashion landmark detection and animal pose estimation.
+  See [demo.md](demo/README.md) for more information.
+
+- **Higher efficiency and higher accuracy**
+
+  MMPose implements multiple state-of-the-art (SOTA) deep learning models, covering both top-down and bottom-up approaches. We achieve faster training speed and higher accuracy than other popular codebases, such as [HRNet](https://github.com/leoxiaobin/deep-high-resolution-net.pytorch).
+  See [benchmark.md](docs/en/benchmark.md) for more information.
+
+- **Support for various datasets**
+
+  The toolbox directly supports multiple popular and representative datasets, such as COCO, AIC, MPII, MPII-TRB and OCHuman.
+  See [data_preparation.md](docs/en/data_preparation.md) for more information.
+
+- **Well designed, tested and documented**
+
+  We decompose MMPose into different components, so a customized pose estimation framework can easily be constructed by combining different modules (an illustrative config sketch follows this list).
+  We provide detailed documentation and API reference, as well as unit tests.
+
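To make the modular design concrete, here is an illustrative sketch (not taken from this repository) of how a top-down estimator is composed from interchangeable modules in an MMPose 0.x-style config. The registry names (`TopDown`, `ResNet`, `TopdownHeatmapSimpleHead`, `JointsMSELoss`) follow MMPose 0.x, but the exact field values here are assumptions; the authoritative configs live under `configs/`.

```python
from mmpose.models import build_posenet

# Swap the `backbone` or `keypoint_head` entries to compose a different estimator.
model_cfg = dict(
    type='TopDown',
    pretrained=None,  # configs typically set e.g. 'torchvision://resnet50'
    backbone=dict(type='ResNet', depth=50),
    keypoint_head=dict(
        type='TopdownHeatmapSimpleHead',
        in_channels=2048,
        out_channels=17,  # number of COCO keypoints
        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
    train_cfg=dict(),
    test_cfg=dict(flip_test=True, post_process='default', shift_heatmap=True))

model = build_posenet(model_cfg)
print(type(model).__name__)  # TopDown
```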
+ +## What's New + +- 2022-07-06: MMPose [v0.28.0](https://github.com/open-mmlab/mmpose/releases/tag/v0.28.0) is released. Major updates include: + - Support [TCFormer](https://openaccess.thecvf.com/content/CVPR2022/html/Zeng_Not_All_Tokens_Are_Equal_Human-Centric_Visual_Analysis_via_Token_CVPR_2022_paper.html) (CVPR'2022). See the [model page](/configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/tcformer_coco-wholebody.md) + - Add [RLE](https://arxiv.org/abs/2107.11291) pre-trained model on COCO dataset. See the [model page](/configs/body/2d_kpt_sview_rgb_img/deeppose/coco/resnet_rle_coco.md) + - Update [Swin](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/swin_coco.md) models with better performance +- 2022-02-28: MMPose model deployment is supported by [MMDeploy](https://github.com/open-mmlab/mmdeploy) v0.3.0 + MMPose Webcam API is a simple yet powerful tool to develop interactive webcam applications with MMPose features. +- 2021-12-29: OpenMMLab Open Platform is online! Try our [pose estimation demo](https://platform.openmmlab.com/web-demo/demo/poseestimation) + +## Installation + +MMPose depends on [PyTorch](https://pytorch.org/) and [MMCV](https://github.com/open-mmlab/mmcv). +Below are quick steps for installation. +Please refer to [install.md](docs/en/install.md) for detailed installation guide. + +```shell +conda create -n open-mmlab python=3.8 pytorch=1.10 cudatoolkit=11.3 torchvision -c pytorch -y +conda activate open-mmlab +pip3 install openmim +mim install mmcv-full +git clone https://github.com/open-mmlab/mmpose.git +cd mmpose +pip3 install -e . +``` + +## Getting Started + +Please see [get_started.md](docs/en/get_started.md) for the basic usage of MMPose. +There are also tutorials: + +- [learn about configs](docs/en/tutorials/0_config.md) +- [finetune model](docs/en/tutorials/1_finetune.md) +- [add new dataset](docs/en/tutorials/2_new_dataset.md) +- [customize data pipelines](docs/en/tutorials/3_data_pipeline.md) +- [add new modules](docs/en/tutorials/4_new_modules.md) +- [export a model to ONNX](docs/en/tutorials/5_export_model.md) +- [customize runtime settings](docs/en/tutorials/6_customize_runtime.md) + +## Model Zoo + +Results and models are available in the *README.md* of each method's config directory. +A summary can be found in the [Model Zoo](https://mmpose.readthedocs.io/en/latest/modelzoo.html) page. + +
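After installation, a minimal top-down inference sketch using the MMPose 0.x high-level API (`init_pose_model`, `inference_top_down_pose_model`, `vis_pose_result`) might look like the following. This example is not part of the original README; the config/checkpoint/image paths and the hand-written person box are placeholders.

```python
from mmpose.apis import (inference_top_down_pose_model, init_pose_model,
                         vis_pose_result)

# Placeholder paths -- substitute a real config/checkpoint pair and image.
config_file = 'configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/res50_coco_256x192.py'
checkpoint_file = 'res50_coco_256x192.pth'
img = 'demo.jpg'

pose_model = init_pose_model(config_file, checkpoint_file, device='cpu')

# One person box in xywh format; in practice boxes come from a detector
# such as MMDetection (see process_mmdet_results in mmpose.apis).
person_results = [{'bbox': [50, 50, 200, 400]}]

pose_results, _ = inference_top_down_pose_model(
    pose_model, img, person_results, format='xywh')

vis_pose_result(pose_model, img, pose_results, out_file='vis_demo.jpg')
```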
+Supported algorithms: + +- [x] [DeepPose](https://mmpose.readthedocs.io/en/latest/papers/algorithms.html#deeppose-cvpr-2014) (CVPR'2014) +- [x] [CPM](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#cpm-cvpr-2016) (CVPR'2016) +- [x] [Hourglass](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#hourglass-eccv-2016) (ECCV'2016) +- [x] [SimpleBaseline3D](https://mmpose.readthedocs.io/en/latest/papers/algorithms.html#simplebaseline3d-iccv-2017) (ICCV'2017) +- [x] [Associative Embedding](https://mmpose.readthedocs.io/en/latest/papers/algorithms.html#associative-embedding-nips-2017) (NeurIPS'2017) +- [x] [HMR](https://mmpose.readthedocs.io/en/latest/papers/algorithms.html#hmr-cvpr-2018) (CVPR'2018) +- [x] [SimpleBaseline2D](https://mmpose.readthedocs.io/en/latest/papers/algorithms.html#simplebaseline2d-eccv-2018) (ECCV'2018) +- [x] [HRNet](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#hrnet-cvpr-2019) (CVPR'2019) +- [x] [VideoPose3D](https://mmpose.readthedocs.io/en/latest/papers/algorithms.html#videopose3d-cvpr-2019) (CVPR'2019) +- [x] [HRNetv2](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#hrnetv2-tpami-2019) (TPAMI'2019) +- [x] [MSPN](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#mspn-arxiv-2019) (ArXiv'2019) +- [x] [SCNet](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#scnet-cvpr-2020) (CVPR'2020) +- [x] [HigherHRNet](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#higherhrnet-cvpr-2020) (CVPR'2020) +- [x] [RSN](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#rsn-eccv-2020) (ECCV'2020) +- [x] [InterNet](https://mmpose.readthedocs.io/en/latest/papers/algorithms.html#internet-eccv-2020) (ECCV'2020) +- [x] [VoxelPose](https://mmpose.readthedocs.io/en/latest/papers/algorithms.html#voxelpose-eccv-2020) (ECCV'2020) +- [x] [LiteHRNet](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#litehrnet-cvpr-2021) (CVPR'2021) +- [x] [ViPNAS](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#vipnas-cvpr-2021) (CVPR'2021) + +
+ +
+Supported techniques: + +- [x] [FPN](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#fpn-cvpr-2017) (CVPR'2017) +- [x] [FP16](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#fp16-arxiv-2017) (ArXiv'2017) +- [x] [Wingloss](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#wingloss-cvpr-2018) (CVPR'2018) +- [x] [AdaptiveWingloss](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#adaptivewingloss-iccv-2019) (ICCV'2019) +- [x] [DarkPose](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#darkpose-cvpr-2020) (CVPR'2020) +- [x] [UDP](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#udp-cvpr-2020) (CVPR'2020) +- [x] [Albumentations](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#albumentations-information-2020) (Information'2020) +- [x] [SoftWingloss](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#softwingloss-tip-2021) (TIP'2021) +- [x] [SmoothNet](/configs/_base_/filters/smoothnet_h36m.md) (arXiv'2021) +- [x] [RLE](https://mmpose.readthedocs.io/en/latest/papers/techniques.html#rle-iccv-2021) (ICCV'2021) + +
+ +
+Supported datasets: + +- [x] [AFLW](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#aflw-iccvw-2011) \[[homepage](https://www.tugraz.at/institute/icg/research/team-bischof/lrs/downloads/aflw/)\] (ICCVW'2011) +- [x] [sub-JHMDB](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#jhmdb-iccv-2013) \[[homepage](http://jhmdb.is.tue.mpg.de/dataset)\] (ICCV'2013) +- [x] [COFW](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#cofw-iccv-2013) \[[homepage](http://www.vision.caltech.edu/xpburgos/ICCV13/)\] (ICCV'2013) +- [x] [MPII](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#mpii-cvpr-2014) \[[homepage](http://human-pose.mpi-inf.mpg.de/)\] (CVPR'2014) +- [x] [Human3.6M](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#human3-6m-tpami-2014) \[[homepage](http://vision.imar.ro/human3.6m/description.php)\] (TPAMI'2014) +- [x] [COCO](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#coco-eccv-2014) \[[homepage](http://cocodataset.org/)\] (ECCV'2014) +- [x] [CMU Panoptic](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#cmu-panoptic-iccv-2015) \[[homepage](http://domedb.perception.cs.cmu.edu/)\] (ICCV'2015) +- [x] [DeepFashion](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#deepfashion-cvpr-2016) \[[homepage](http://mmlab.ie.cuhk.edu.hk/projects/DeepFashion/LandmarkDetection.html)\] (CVPR'2016) +- [x] [300W](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#300w-imavis-2016) \[[homepage](https://ibug.doc.ic.ac.uk/resources/300-W/)\] (IMAVIS'2016) +- [x] [RHD](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#rhd-iccv-2017) \[[homepage](https://lmb.informatik.uni-freiburg.de/resources/datasets/RenderedHandposeDataset.en.html)\] (ICCV'2017) +- [x] [CMU Panoptic HandDB](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#cmu-panoptic-handdb-cvpr-2017) \[[homepage](http://domedb.perception.cs.cmu.edu/handdb.html)\] (CVPR'2017) +- [x] [AI Challenger](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#ai-challenger-arxiv-2017) \[[homepage](https://github.com/AIChallenger/AI_Challenger_2017)\] (ArXiv'2017) +- [x] [MHP](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#mhp-acm-mm-2018) \[[homepage](https://lv-mhp.github.io/dataset)\] (ACM MM'2018) +- [x] [WFLW](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#wflw-cvpr-2018) \[[homepage](https://wywu.github.io/projects/LAB/WFLW.html)\] (CVPR'2018) +- [x] [PoseTrack18](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#posetrack18-cvpr-2018) \[[homepage](https://posetrack.net/users/download.php)\] (CVPR'2018) +- [x] [OCHuman](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#ochuman-cvpr-2019) \[[homepage](https://github.com/liruilong940607/OCHumanApi)\] (CVPR'2019) +- [x] [CrowdPose](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#crowdpose-cvpr-2019) \[[homepage](https://github.com/Jeff-sjtu/CrowdPose)\] (CVPR'2019) +- [x] [MPII-TRB](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#mpii-trb-iccv-2019) \[[homepage](https://github.com/kennymckormick/Triplet-Representation-of-human-Body)\] (ICCV'2019) +- [x] [FreiHand](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#freihand-iccv-2019) \[[homepage](https://lmb.informatik.uni-freiburg.de/projects/freihand/)\] (ICCV'2019) +- [x] [Animal-Pose](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#animal-pose-iccv-2019) \[[homepage](https://sites.google.com/view/animal-pose/)\] (ICCV'2019) +- 
[x] [OneHand10K](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#onehand10k-tcsvt-2019) \[[homepage](https://www.yangangwang.com/papers/WANG-MCC-2018-10.html)\] (TCSVT'2019) +- [x] [Vinegar Fly](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#vinegar-fly-nature-methods-2019) \[[homepage](https://github.com/jgraving/DeepPoseKit-Data)\] (Nature Methods'2019) +- [x] [Desert Locust](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#desert-locust-elife-2019) \[[homepage](https://github.com/jgraving/DeepPoseKit-Data)\] (Elife'2019) +- [x] [Grévy’s Zebra](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#grevys-zebra-elife-2019) \[[homepage](https://github.com/jgraving/DeepPoseKit-Data)\] (Elife'2019) +- [x] [ATRW](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#atrw-acm-mm-2020) \[[homepage](https://cvwc2019.github.io/challenge.html)\] (ACM MM'2020) +- [x] [Halpe](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#halpe-cvpr-2020) \[[homepage](https://github.com/Fang-Haoshu/Halpe-FullBody/)\] (CVPR'2020) +- [x] [COCO-WholeBody](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#coco-wholebody-eccv-2020) \[[homepage](https://github.com/jin-s13/COCO-WholeBody/)\] (ECCV'2020) +- [x] [MacaquePose](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#macaquepose-biorxiv-2020) \[[homepage](http://www.pri.kyoto-u.ac.jp/datasets/macaquepose/index.html)\] (bioRxiv'2020) +- [x] [InterHand2.6M](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#interhand2-6m-eccv-2020) \[[homepage](https://mks0601.github.io/InterHand2.6M/)\] (ECCV'2020) +- [x] [AP-10K](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#ap-10k-neurips-2021) \[[homepage](https://github.com/AlexTheBad/AP-10K)\] (NeurIPS'2021) +- [x] [Horse-10](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#horse-10-wacv-2021) \[[homepage](http://www.mackenziemathislab.org/horse10)\] (WACV'2021) + +
+ +
+Supported backbones: + +- [x] [AlexNet](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#alexnet-neurips-2012) (NeurIPS'2012) +- [x] [VGG](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#vgg-iclr-2015) (ICLR'2015) +- [x] [ResNet](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#resnet-cvpr-2016) (CVPR'2016) +- [x] [ResNext](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#resnext-cvpr-2017) (CVPR'2017) +- [x] [SEResNet](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#seresnet-cvpr-2018) (CVPR'2018) +- [x] [ShufflenetV1](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#shufflenetv1-cvpr-2018) (CVPR'2018) +- [x] [ShufflenetV2](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#shufflenetv2-eccv-2018) (ECCV'2018) +- [x] [MobilenetV2](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#mobilenetv2-cvpr-2018) (CVPR'2018) +- [x] [ResNetV1D](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#resnetv1d-cvpr-2019) (CVPR'2019) +- [x] [ResNeSt](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#resnest-arxiv-2020) (ArXiv'2020) +- [x] [Swin](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#swin-cvpr-2021) (CVPR'2021) +- [x] [HRFormer](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#hrformer-nips-2021) (NIPS'2021) +- [x] [PVT](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#pvt-iccv-2021) (ICCV'2021) +- [x] [PVTV2](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#pvtv2-cvmj-2022) (CVMJ'2022) + +
+
+### Model Request
+
+We will keep up with the latest progress of the community, and support more popular algorithms and frameworks. If you have any feature requests, please feel free to leave a comment in [MMPose Roadmap](https://github.com/open-mmlab/mmpose/issues/9).
+
+### Benchmark
+
+#### Accuracy and Training Speed
+
+MMPose achieves superior training speed and accuracy on standard keypoint detection benchmarks such as COCO. See more details at [benchmark.md](docs/en/benchmark.md).
+
+#### Inference Speed
+
+We summarize the model complexity and inference speed of major models in MMPose, including FLOPs, parameter counts and inference speeds on both CPU and GPU devices with different batch sizes. Please refer to [inference_speed_summary.md](docs/en/inference_speed_summary.md) for more details.
+
+## Data Preparation
+
+Please refer to [data_preparation.md](docs/en/data_preparation.md) for general guidance on data preparation.
+
+## FAQ
+
+Please refer to [FAQ](docs/en/faq.md) for frequently asked questions.
+
+## Contributing
+
+We appreciate all contributions to improve MMPose. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline.
+
+## Acknowledgement
+
+MMPose is an open source project that is contributed by researchers and engineers from various colleges and companies.
+We appreciate all the contributors who implement their methods or add new features, as well as users who give valuable feedback.
+We hope that the toolbox and benchmark can serve the growing research community by providing a flexible toolkit to reimplement existing methods and develop new models.
+
+## Citation
+
+If you find this project useful in your research, please consider citing:
+
+```bibtex
+@misc{mmpose2020,
+    title={OpenMMLab Pose Estimation Toolbox and Benchmark},
+    author={MMPose Contributors},
+    howpublished = {\url{https://github.com/open-mmlab/mmpose}},
+    year={2020}
+}
+```
+
+## License
+
+This project is released under the [Apache 2.0 license](LICENSE).
+
+## Projects in OpenMMLab
+
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
+- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
+- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab few-shot learning toolbox and benchmark.
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab Model Deployment Framework. diff --git a/main/transformer_utils/mmpose_README_CN.md b/main/transformer_utils/mmpose_README_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..05074e3ef9c670dcea4a4bac953f4016ea318b0a --- /dev/null +++ b/main/transformer_utils/mmpose_README_CN.md @@ -0,0 +1,311 @@ +
+ +
 
+
+ OpenMMLab 官网 + + + HOT + + +      + OpenMMLab 开放平台 + + + TRY IT OUT + + +
+
 
+ +[![Documentation](https://readthedocs.org/projects/mmpose/badge/?version=latest)](https://mmpose.readthedocs.io/en/latest/?badge=latest) +[![actions](https://github.com/open-mmlab/mmpose/workflows/build/badge.svg)](https://github.com/open-mmlab/mmpose/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmpose/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmpose) +[![PyPI](https://img.shields.io/pypi/v/mmpose)](https://pypi.org/project/mmpose/) +[![LICENSE](https://img.shields.io/github/license/open-mmlab/mmpose.svg)](https://github.com/open-mmlab/mmpose/blob/master/LICENSE) +[![Average time to resolve an issue](https://isitmaintained.com/badge/resolution/open-mmlab/mmpose.svg)](https://github.com/open-mmlab/mmpose/issues) +[![Percentage of issues still open](https://isitmaintained.com/badge/open/open-mmlab/mmpose.svg)](https://github.com/open-mmlab/mmpose/issues) + +[📘文档](https://mmpose.readthedocs.io/zh_CN/v0.28.0/) | +[🛠️安装](https://mmpose.readthedocs.io/zh_CN/v0.28.0/install.html) | +[👀模型库](https://mmpose.readthedocs.io/zh_CN/v0.28.0/modelzoo.html) | +[📜论文库](https://mmpose.readthedocs.io/zh_CN/v0.28.0/papers/algorithms.html) | +[🆕更新日志](https://mmpose.readthedocs.io/en/v0.28.0/changelog.html) | +[🤔报告问题](https://github.com/open-mmlab/mmpose/issues/new/choose) + +
+ +
+ +[English](README.md) | 简体中文 + +
+ +## 简介 + +MMPose 是一款基于 PyTorch 的姿态分析的开源工具箱,是 [OpenMMLab](http://openmmlab.org/) 项目的成员之一。 + +主分支代码目前支持 **PyTorch 1.5 以上**的版本。 + +https://user-images.githubusercontent.com/15977946/124654387-0fd3c500-ded1-11eb-84f6-24eeddbf4d91.mp4 + +
+主要特性 + +- **支持多种人体姿态分析相关任务** + + MMPose 支持当前学界广泛关注的主流姿态分析任务:主要包括 2D多人姿态估计、2D手部姿态估计、2D人脸关键点检测、133关键点的全身人体姿态估计、3D人体形状恢复、服饰关键点检测、动物关键点检测等。 + 具体请参考 [功能演示](demo/README.md)。 + +- **更高的精度和更快的速度** + + MMPose 复现了多种学界最先进的人体姿态分析模型,包括“自顶向下”和“自底向上”两大类算法。MMPose 相比于其他主流的代码库,具有更高的模型精度和训练速度。 + 具体请参考 [基准测试](docs/en/benchmark.md)(英文)。 + +- **支持多样的数据集** + + MMPose 支持了很多主流数据集的准备和构建,如 COCO、 MPII 等。 具体请参考 [数据集准备](docs/en/data_preparation.md)。 + +- **模块化设计** + + MMPose 将统一的人体姿态分析框架解耦成不同的模块组件,通过组合不同的模块组件,用户可以便捷地构建自定义的人体姿态分析模型。 + +- **详尽的单元测试和文档** + + MMPose 提供了详尽的说明文档,API 接口说明,全面的单元测试,以供社区参考。 + +
+ +## 最新进展 + +- 2022-07-06: MMPose [v0.28.0](https://github.com/open-mmlab/mmpose/releases/tag/v0.28.0) 已经发布. 主要更新包括: + - 支持了新的主干网络 [TCFormer](https://openaccess.thecvf.com/content/CVPR2022/html/Zeng_Not_All_Tokens_Are_Equal_Human-Centric_Visual_Analysis_via_Token_CVPR_2022_paper.html) (CVPR'2022),详见 [模型信息](/configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/tcformer_coco-wholebody.md) + - 增加了 [RLE](https://arxiv.org/abs/2107.11291) 在 COCO 数据集上的模型,详见 [模型信息](/configs/body/2d_kpt_sview_rgb_img/deeppose/coco/resnet_rle_coco.md) + - 优化了 [Swin](/configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/swin_coco.md) 模型精度 +- 2022-04: MMPose 代码可以通过 [Gitee](https://gitee.com/open-mmlab/mmpose) 访问 +- 2022-02-28: [MMDeploy](https://github.com/open-mmlab/mmdeploy) v0.3.0 支持 MMPose 模型部署 +- 2021-12-29: OpenMMLab 开放平台已经正式上线! 欢迎试用基于 MMPose 的[姿态估计 Demo](https://platform.openmmlab.com/web-demo/demo/poseestimation) + +## 安装 + +MMPose 依赖 [PyTorch](https://pytorch.org/) 和 [MMCV](https://github.com/open-mmlab/mmcv),以下是安装的简要步骤。 +更详细的安装指南请参考 [install.md](docs/zh_cn/install.md)。 + +```shell +conda create -n open-mmlab python=3.8 pytorch=1.10 cudatoolkit=11.3 torchvision -c pytorch -y +conda activate open-mmlab +pip3 install openmim +mim install mmcv-full +git clone https://github.com/open-mmlab/mmpose.git +cd mmpose +pip3 install -e . +``` + +## 教程 + +请参考 [get_started.md](docs/zh_cn/get_started.md) 了解 MMPose 的基本使用。 +MMPose 也提供了其他更详细的教程: + +- [如何编写配置文件](docs/zh_cn/tutorials/0_config.md) +- [如何微调模型](docs/zh_cn/tutorials/1_finetune.md) +- [如何增加新数据集](docs/zh_cn/tutorials/2_new_dataset.md) +- [如何设计数据处理流程](docs/zh_cn/tutorials/3_data_pipeline.md) +- [如何增加新模块](docs/zh_cn/tutorials/4_new_modules.md) +- [如何导出模型为 onnx 格式](docs/zh_cn/tutorials/5_export_model.md) +- [如何自定义运行配置](docs/zh_cn/tutorials/6_customize_runtime.md) +- [如何使用摄像头应用接口(Webcam API)](docs/zh_cn/tutorials/7_webcam_api.md) + +## 模型库 + +各个模型的结果和设置都可以在对应的 config(配置)目录下的 *README.md* 中查看。 +整体的概况也可也在 [模型库](https://mmpose.readthedocs.io/zh_CN/latest/recognition_models.html) 页面中查看。 + +
+支持的算法 + +- [x] [DeepPose](https://mmpose.readthedocs.io/zh_CN/latest/papers/algorithms.html#deeppose-cvpr-2014) (CVPR'2014) +- [x] [CPM](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#cpm-cvpr-2016) (CVPR'2016) +- [x] [Hourglass](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#hourglass-eccv-2016) (ECCV'2016) +- [x] [SimpleBaseline3D](https://mmpose.readthedocs.io/zh_CN/latest/papers/algorithms.html#simplebaseline3d-iccv-2017) (ICCV'2017) +- [x] [Associative Embedding](https://mmpose.readthedocs.io/zh_CN/latest/papers/algorithms.html#associative-embedding-nips-2017) (NeurIPS'2017) +- [x] [HMR](https://mmpose.readthedocs.io/zh_CN/latest/papers/algorithms.html#hmr-cvpr-2018) (CVPR'2018) +- [x] [SimpleBaseline2D](https://mmpose.readthedocs.io/zh_CN/latest/papers/algorithms.html#simplebaseline2d-eccv-2018) (ECCV'2018) +- [x] [HRNet](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#hrnet-cvpr-2019) (CVPR'2019) +- [x] [VideoPose3D](https://mmpose.readthedocs.io/zh_CN/latest/papers/algorithms.html#videopose3d-cvpr-2019) (CVPR'2019) +- [x] [HRNetv2](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#hrnetv2-tpami-2019) (TPAMI'2019) +- [x] [MSPN](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#mspn-arxiv-2019) (ArXiv'2019) +- [x] [SCNet](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#scnet-cvpr-2020) (CVPR'2020) +- [x] [HigherHRNet](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#higherhrnet-cvpr-2020) (CVPR'2020) +- [x] [RSN](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#rsn-eccv-2020) (ECCV'2020) +- [x] [InterNet](https://mmpose.readthedocs.io/zh_CN/latest/papers/algorithms.html#internet-eccv-2020) (ECCV'2020) +- [x] [VoxelPose](https://mmpose.readthedocs.io/zh_CN/latest/papers/algorithms.html#voxelpose-eccv-2020) (ECCV'2020 +- [x] [LiteHRNet](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#litehrnet-cvpr-2021) (CVPR'2021) +- [x] [ViPNAS](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#vipnas-cvpr-2021) (CVPR'2021) + +
+ +
+支持的技术 + +- [x] [FPN](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#fpn-cvpr-2017) (CVPR'2017) +- [x] [FP16](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#fp16-arxiv-2017) (ArXiv'2017) +- [x] [Wingloss](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#wingloss-cvpr-2018) (CVPR'2018) +- [x] [AdaptiveWingloss](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#adaptivewingloss-iccv-2019) (ICCV'2019) +- [x] [DarkPose](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#darkpose-cvpr-2020) (CVPR'2020) +- [x] [UDP](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#udp-cvpr-2020) (CVPR'2020) +- [x] [Albumentations](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#albumentations-information-2020) (Information'2020) +- [x] [SoftWingloss](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#softwingloss-tip-2021) (TIP'2021) +- [x] [SmoothNet](/configs/_base_/filters/smoothnet_h36m.md) (arXiv'2021) +- [x] [RLE](https://mmpose.readthedocs.io/zh_CN/latest/papers/techniques.html#rle-iccv-2021) (ICCV'2021) + +
+ +
+支持的数据集 + +- [x] [AFLW](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#aflw-iccvw-2011) \[[homepage](https://www.tugraz.at/institute/icg/research/team-bischof/lrs/downloads/aflw/)\] (ICCVW'2011) +- [x] [sub-JHMDB](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#jhmdb-iccv-2013) \[[homepage](http://jhmdb.is.tue.mpg.de/dataset)\] (ICCV'2013) +- [x] [COFW](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#cofw-iccv-2013) \[[homepage](http://www.vision.caltech.edu/xpburgos/ICCV13/)\] (ICCV'2013) +- [x] [MPII](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#mpii-cvpr-2014) \[[homepage](http://human-pose.mpi-inf.mpg.de/)\] (CVPR'2014) +- [x] [Human3.6M](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#human3-6m-tpami-2014) \[[homepage](http://vision.imar.ro/human3.6m/description.php)\] (TPAMI'2014) +- [x] [COCO](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#coco-eccv-2014) \[[homepage](http://cocodataset.org/)\] (ECCV'2014) +- [x] [CMU Panoptic](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#cmu-panoptic-iccv-2015) (ICCV'2015) +- [x] [DeepFashion](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#deepfashion-cvpr-2016) \[[homepage](http://mmlab.ie.cuhk.edu.hk/projects/DeepFashion/LandmarkDetection.html)\] (CVPR'2016) +- [x] [300W](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#300w-imavis-2016) \[[homepage](https://ibug.doc.ic.ac.uk/resources/300-W/)\] (IMAVIS'2016) +- [x] [RHD](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#rhd-iccv-2017) \[[homepage](https://lmb.informatik.uni-freiburg.de/resources/datasets/RenderedHandposeDataset.en.html)\] (ICCV'2017) +- [x] [CMU Panoptic](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#cmu-panoptic-iccv-2015) \[[homepage](http://domedb.perception.cs.cmu.edu/)\] (ICCV'2015) +- [x] [AI Challenger](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#ai-challenger-arxiv-2017) \[[homepage](https://github.com/AIChallenger/AI_Challenger_2017)\] (ArXiv'2017) +- [x] [MHP](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#mhp-acm-mm-2018) \[[homepage](https://lv-mhp.github.io/dataset)\] (ACM MM'2018) +- [x] [WFLW](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#wflw-cvpr-2018) \[[homepage](https://wywu.github.io/projects/LAB/WFLW.html)\] (CVPR'2018) +- [x] [PoseTrack18](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#posetrack18-cvpr-2018) \[[homepage](https://posetrack.net/users/download.php)\] (CVPR'2018) +- [x] [OCHuman](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#ochuman-cvpr-2019) \[[homepage](https://github.com/liruilong940607/OCHumanApi)\] (CVPR'2019) +- [x] [CrowdPose](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#crowdpose-cvpr-2019) \[[homepage](https://github.com/Jeff-sjtu/CrowdPose)\] (CVPR'2019) +- [x] [MPII-TRB](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#mpii-trb-iccv-2019) \[[homepage](https://github.com/kennymckormick/Triplet-Representation-of-human-Body)\] (ICCV'2019) +- [x] [FreiHand](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#freihand-iccv-2019) \[[homepage](https://lmb.informatik.uni-freiburg.de/projects/freihand/)\] (ICCV'2019) +- [x] [Animal-Pose](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#animal-pose-iccv-2019) \[[homepage](https://sites.google.com/view/animal-pose/)\] (ICCV'2019) +- [x] 
[OneHand10K](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#onehand10k-tcsvt-2019) \[[homepage](https://www.yangangwang.com/papers/WANG-MCC-2018-10.html)\] (TCSVT'2019) +- [x] [Vinegar Fly](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#vinegar-fly-nature-methods-2019) \[[homepage](https://github.com/jgraving/DeepPoseKit-Data)\] (Nature Methods'2019) +- [x] [Desert Locust](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#desert-locust-elife-2019) \[[homepage](https://github.com/jgraving/DeepPoseKit-Data)\] (Elife'2019) +- [x] [Grévy’s Zebra](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#grevys-zebra-elife-2019) \[[homepage](https://github.com/jgraving/DeepPoseKit-Data)\] (Elife'2019) +- [x] [ATRW](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#atrw-acm-mm-2020) \[[homepage](https://cvwc2019.github.io/challenge.html)\] (ACM MM'2020) +- [x] [Halpe](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#halpe-cvpr-2020) \[[homepage](https://github.com/Fang-Haoshu/Halpe-FullBody/)\] (CVPR'2020) +- [x] [COCO-WholeBody](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#coco-wholebody-eccv-2020) \[[homepage](https://github.com/jin-s13/COCO-WholeBody/)\] (ECCV'2020) +- [x] [MacaquePose](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#macaquepose-biorxiv-2020) \[[homepage](http://www.pri.kyoto-u.ac.jp/datasets/macaquepose/index.html)\] (bioRxiv'2020) +- [x] [InterHand2.6M](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#interhand2-6m-eccv-2020) \[[homepage](https://mks0601.github.io/InterHand2.6M/)\] (ECCV'2020) +- [x] [AP-10K](https://mmpose.readthedocs.io/en/latest/papers/datasets.html#ap-10k-neurips-2021) \[[homepage](https://github.com/AlexTheBad/AP-10K)\] (NeurIPS'2021) +- [x] [Horse-10](https://mmpose.readthedocs.io/zh_CN/latest/papers/datasets.html#horse-10-wacv-2021) \[[homepage](http://www.mackenziemathislab.org/horse10)\] (WACV'2021) + +
+ +
+支持的骨干网络 + +- [x] [AlexNet](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#alexnet-neurips-2012) (NeurIPS'2012) +- [x] [VGG](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#vgg-iclr-2015) (ICLR'2015) +- [x] [ResNet](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#resnet-cvpr-2016) (CVPR'2016) +- [x] [ResNext](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#resnext-cvpr-2017) (CVPR'2017) +- [x] [SEResNet](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#seresnet-cvpr-2018) (CVPR'2018) +- [x] [ShufflenetV1](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#shufflenetv1-cvpr-2018) (CVPR'2018) +- [x] [ShufflenetV2](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#shufflenetv2-eccv-2018) (ECCV'2018) +- [x] [MobilenetV2](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#mobilenetv2-cvpr-2018) (CVPR'2018) +- [x] [ResNetV1D](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#resnetv1d-cvpr-2019) (CVPR'2019) +- [x] [ResNeSt](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#resnest-arxiv-2020) (ArXiv'2020) +- [x] [Swin](https://mmpose.readthedocs.io/en/latest/papers/backbones.html#swin-cvpr-2021) (CVPR'2021) +- [x] [HRFormer](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#hrformer-nips-2021) (NIPS'2021) +- [x] [PVT](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#pvt-iccv-2021) (ICCV'2021) +- [x] [PVTV2](https://mmpose.readthedocs.io/zh_CN/latest/papers/backbones.html#pvtv2-cvmj-2022) (CVMJ'2022) + +
+ +### 模型需求 + +我们将跟进学界的最新进展,并支持更多算法和框架。如果您对 MMPose 有任何功能需求,请随时在 [MMPose Roadmap](https://github.com/open-mmlab/mmpose/issues/9) 中留言。 + +### 基准测试 + +#### 训练精度和速度 + +MMPose 在主流关键点检测基准 COCO 上达到了优越的模型精度和训练速度。 + +详细信息可见 [基准测试](docs/en/benchmark.md)(英文)。 + +#### 推理速度 + +我们总结了 MMPose 中主要模型的复杂度信息和推理速度,包括模型的计算复杂度、参数数量,以及以不同的批处理大小在 CPU 和 GPU 上的推理速度。 + +详细信息可见 [模型推理速度](docs/zh_cn/inference_speed_summary.md)。 + +## 数据准备 + +请参考 [data_preparation.md](docs/en/data_preparation.md)(英文) 进行数据集准备。 + +## 常见问题 + +请参考 [FAQ](docs/en/faq.md) 了解其他用户的常见问题。 + +## 参与贡献 + +我们非常欢迎用户对于 MMPose 做出的任何贡献,可以参考 [CONTRIBUTION.md](.github/CONTRIBUTING.md) 文件了解更多细节。 + +## 致谢 + +MMPose 是一款由不同学校和公司共同贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 +我们希望该工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现现有算法并开发自己的新模型,从而不断为开源社区提供贡献。 + +## 引用 + +如果您觉得 MMPose 对您的研究有所帮助,请考虑引用它: + +```bibtex +@misc{mmpose2020, + title={OpenMMLab Pose Estimation Toolbox and Benchmark}, + author={MMPose Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmpose}}, + year={2020} +} +``` + +## 许可证 + +该项目采用 [Apache 2.0 license](LICENSE) 开源协议。 + +## OpenMMLab的其他项目 + +- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 + +## 欢迎加入 OpenMMLab 社区 + +扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),联络 OpenMMLab [官方微信小助手](/docs/en/imgs/wechat_assistant_qrcode.png)或加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=GJP18SjI) + +
+ +
+ +我们会在 OpenMMLab 社区为大家 + +- 📢 分享 AI 框架的前沿核心技术 +- 💻 解读 PyTorch 常用模块源码 +- 📰 发布 OpenMMLab 的相关新闻 +- 🚀 介绍 OpenMMLab 开发的前沿算法 +- 🏃 获取更高效的问题答疑和意见反馈 +- 🔥 提供与各行各业开发者充分交流的平台 + +干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬 diff --git a/main/transformer_utils/model-index.yml b/main/transformer_utils/model-index.yml new file mode 100644 index 0000000000000000000000000000000000000000..aa7691be16b169e2c08c6d9a482a32f23d17d80c --- /dev/null +++ b/main/transformer_utils/model-index.yml @@ -0,0 +1,147 @@ +Import: +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/animalpose/hrnet_animalpose.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/animalpose/resnet_animalpose.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/ap10k/hrnet_ap10k.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/ap10k/resnet_ap10k.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/atrw/hrnet_atrw.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/atrw/resnet_atrw.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/fly/resnet_fly.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/horse10/hrnet_horse10.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/horse10/resnet_horse10.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/locust/resnet_locust.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/macaque/hrnet_macaque.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/macaque/resnet_macaque.yml +- configs/animal/2d_kpt_sview_rgb_img/topdown_heatmap/zebra/resnet_zebra.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/higherhrnet_aic.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/aic/hrnet_aic.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/higherhrnet_udp_coco.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hourglass_ae_coco.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/hrnet_udp_coco.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/mobilenetv2_coco.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/coco/resnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/crowdpose/higherhrnet_crowdpose.yml +- configs/body/2d_kpt_sview_rgb_img/associative_embedding/mhp/hrnet_mhp.yml +- configs/body/2d_kpt_sview_rgb_img/deeppose/coco/resnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/deeppose/coco/resnet_rle_coco.yml +- configs/body/2d_kpt_sview_rgb_img/deeppose/mpii/resnet_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/deeppose/mpii/resnet_rle_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/aic/hrnet_aic.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/aic/resnet_aic.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/alexnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/cpm_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hourglass_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrformer_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_augmentation_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_dark_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_fp16_coco.yml +- 
configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/hrnet_udp_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/litehrnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/mobilenetv2_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/mspn_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/pvt_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/resnest_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/resnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/resnet_dark_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/resnet_fp16_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/resnetv1d_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/resnext_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/rsn_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/scnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/seresnet_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/shufflenetv1_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/shufflenetv2_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/swin_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vgg_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/vipnas_coco.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/crowdpose/hrnet_crowdpose.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/crowdpose/resnet_crowdpose.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/h36m/hrnet_h36m.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/cpm_jhmdb.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/jhmdb/resnet_jhmdb.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mhp/resnet_mhp.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/cpm_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/hourglass_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/hrnet_dark_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/hrnet_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/litehrnet_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/mobilenetv2_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/resnet_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/resnetv1d_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/resnext_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/scnet_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/seresnet_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/shufflenetv1_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii/shufflenetv2_mpii.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/mpii_trb/resnet_mpii_trb.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/ochuman/hrnet_ochuman.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/ochuman/resnet_ochuman.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/posetrack18/hrnet_posetrack18.yml +- configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/posetrack18/resnet_posetrack18.yml +- configs/body/2d_kpt_sview_rgb_vid/posewarper/posetrack18/hrnet_posetrack18_posewarper.yml +- configs/body/3d_kpt_mview_rgb_img/voxelpose/campus/voxelpose_campus.yml +- 
configs/body/3d_kpt_mview_rgb_img/voxelpose/panoptic/voxelpose_prn64x64x64_cpn80x80x20_panoptic_cam5.yml +- configs/body/3d_kpt_mview_rgb_img/voxelpose/shelf/voxelpose_shelf.yml +- configs/body/3d_kpt_sview_rgb_img/pose_lift/h36m/simplebaseline3d_h36m.yml +- configs/body/3d_kpt_sview_rgb_img/pose_lift/mpi_inf_3dhp/simplebaseline3d_mpi-inf-3dhp.yml +- configs/body/3d_kpt_sview_rgb_vid/video_pose_lift/h36m/videopose3d_h36m.yml +- configs/body/3d_kpt_sview_rgb_vid/video_pose_lift/mpi_inf_3dhp/videopose3d_mpi-inf-3dhp.yml +- configs/body/3d_mesh_sview_rgb_img/hmr/mixed/resnet_mixed.yml +- configs/face/2d_kpt_sview_rgb_img/deeppose/wflw/resnet_softwingloss_wflw.yml +- configs/face/2d_kpt_sview_rgb_img/deeppose/wflw/resnet_wflw.yml +- configs/face/2d_kpt_sview_rgb_img/deeppose/wflw/resnet_wingloss_wflw.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/300w/hrnetv2_300w.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/aflw/hrnetv2_aflw.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/aflw/hrnetv2_dark_aflw.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_face/hourglass_coco_wholebody_face.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_face/hrnetv2_coco_wholebody_face.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_face/hrnetv2_dark_coco_wholebody_face.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_face/mobilenetv2_coco_wholebody_face.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_face/resnet_coco_wholebody_face.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_face/scnet_coco_wholebody_face.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/cofw/hrnetv2_cofw.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/wflw/hrnetv2_awing_wflw.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/wflw/hrnetv2_dark_wflw.yml +- configs/face/2d_kpt_sview_rgb_img/topdown_heatmap/wflw/hrnetv2_wflw.yml +- configs/fashion/2d_kpt_sview_rgb_img/deeppose/deepfashion/resnet_deepfashion.yml +- configs/fashion/2d_kpt_sview_rgb_img/topdown_heatmap/deepfashion/resnet_deepfashion.yml +- configs/hand/2d_kpt_sview_rgb_img/deeppose/onehand10k/resnet_onehand10k.yml +- configs/hand/2d_kpt_sview_rgb_img/deeppose/panoptic2d/resnet_panoptic2d.yml +- configs/hand/2d_kpt_sview_rgb_img/deeppose/rhd2d/resnet_rhd2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_hand/hourglass_coco_wholebody_hand.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_hand/hrnetv2_coco_wholebody_hand.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_hand/hrnetv2_dark_coco_wholebody_hand.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_hand/litehrnet_coco_wholebody_hand.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_hand/mobilenetv2_coco_wholebody_hand.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_hand/resnet_coco_wholebody_hand.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/coco_wholebody_hand/scnet_coco_wholebody_hand.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/freihand2d/resnet_freihand2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/interhand2d/resnet_interhand2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/onehand10k/hrnetv2_dark_onehand10k.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/onehand10k/hrnetv2_onehand10k.yml +- 
configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/onehand10k/hrnetv2_udp_onehand10k.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/onehand10k/mobilenetv2_onehand10k.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/onehand10k/resnet_onehand10k.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/panoptic2d/hrnetv2_dark_panoptic2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/panoptic2d/hrnetv2_panoptic2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/panoptic2d/hrnetv2_udp_panoptic2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/panoptic2d/mobilenetv2_panoptic2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/panoptic2d/resnet_panoptic2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/rhd2d/hrnetv2_dark_rhd2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/rhd2d/hrnetv2_rhd2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/rhd2d/hrnetv2_udp_rhd2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/rhd2d/mobilenetv2_rhd2d.yml +- configs/hand/2d_kpt_sview_rgb_img/topdown_heatmap/rhd2d/resnet_rhd2d.yml +- configs/hand/3d_kpt_sview_rgb_img/internet/interhand3d/internet_interhand3d.yml +- configs/hand/gesture_sview_rgbd_vid/mtut/nvgesture/i3d_nvgesture.yml +- configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/higherhrnet_coco-wholebody.yml +- configs/wholebody/2d_kpt_sview_rgb_img/associative_embedding/coco-wholebody/hrnet_coco-wholebody.yml +- configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/hrnet_coco-wholebody.yml +- configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/hrnet_dark_coco-wholebody.yml +- configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/resnet_coco-wholebody.yml +- configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/tcformer_coco-wholebody.yml +- configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/vipnas_coco-wholebody.yml +- configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/coco-wholebody/vipnas_dark_coco-wholebody.yml +- configs/wholebody/2d_kpt_sview_rgb_img/topdown_heatmap/halpe/hrnet_dark_halpe.yml diff --git a/main/transformer_utils/pytest.ini b/main/transformer_utils/pytest.ini new file mode 100644 index 0000000000000000000000000000000000000000..9796e871e70c7c67345b1d6bcf708c0c82377a98 --- /dev/null +++ b/main/transformer_utils/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +addopts = --xdoctest --xdoctest-style=auto +norecursedirs = .git ignore build __pycache__ data docker docs .eggs + +filterwarnings= default + ignore:.*No cfgstr given in Cacher constructor or call.*:Warning + ignore:.*Define the __nice__ method for.*:Warning diff --git a/main/transformer_utils/requirements.txt b/main/transformer_utils/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5b5d97a6ea7837890ff0247bac8c5f24f6eabab --- /dev/null +++ b/main/transformer_utils/requirements.txt @@ -0,0 +1,4 @@ +-r requirements/build.txt +-r requirements/runtime.txt +-r requirements/tests.txt +-r requirements/optional.txt diff --git a/main/transformer_utils/requirements/build.txt b/main/transformer_utils/requirements/build.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9566943cef029e5c8dab0b52ba564a7f9c7ad30 --- /dev/null +++ b/main/transformer_utils/requirements/build.txt @@ -0,0 +1,3 @@ +# These must be installed before building mmpose +numpy +torch>=1.3 diff --git a/main/transformer_utils/requirements/docs.txt 
b/main/transformer_utils/requirements/docs.txt new file mode 100644 index 0000000000000000000000000000000000000000..20170845c44eefcb139ee2baa1a3d375b71c34ec --- /dev/null +++ b/main/transformer_utils/requirements/docs.txt @@ -0,0 +1,6 @@ +docutils==0.16.0 +myst-parser +-e git+https://github.com/gaotongxiao/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +sphinx==4.0.2 +sphinx_copybutton +sphinx_markdown_tables diff --git a/main/transformer_utils/requirements/mminstall.txt b/main/transformer_utils/requirements/mminstall.txt new file mode 100644 index 0000000000000000000000000000000000000000..89199e36061dcd5361d029606fa25cb791af110a --- /dev/null +++ b/main/transformer_utils/requirements/mminstall.txt @@ -0,0 +1,3 @@ +mmcv-full>=1.3.8 +mmdet>=2.14.0 +mmtrack>=0.6.0 diff --git a/main/transformer_utils/requirements/optional.txt b/main/transformer_utils/requirements/optional.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfb1e75f86aba2fd074b0b1723e9b07a2037e9c3 --- /dev/null +++ b/main/transformer_utils/requirements/optional.txt @@ -0,0 +1,8 @@ +albumentations>=0.3.2 --no-binary qudida,albumentations +onnx +onnxruntime +poseval@git+https://github.com/svenkreiss/poseval.git +pyrender +requests +smplx>=0.1.28 +trimesh diff --git a/main/transformer_utils/requirements/readthedocs.txt b/main/transformer_utils/requirements/readthedocs.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8b69d3ca2f051dcb6d6a96a25e7cb9054483c76 --- /dev/null +++ b/main/transformer_utils/requirements/readthedocs.txt @@ -0,0 +1,9 @@ +mmcv-full +munkres +poseval@git+https://github.com/svenkreiss/poseval.git +regex +scipy +titlecase +torch +torchvision +xtcocotools>=1.8 diff --git a/main/transformer_utils/requirements/runtime.txt b/main/transformer_utils/requirements/runtime.txt new file mode 100644 index 0000000000000000000000000000000000000000..30f20b6f8b678dac9d0043454e9b6d6f980de21e --- /dev/null +++ b/main/transformer_utils/requirements/runtime.txt @@ -0,0 +1,13 @@ +chumpy +dataclasses; python_version == '3.6' +json_tricks +matplotlib +munkres +numpy +opencv-python +pillow +scipy +torchvision +xtcocotools>=1.12 +# easydict +# einops diff --git a/main/transformer_utils/requirements/tests.txt b/main/transformer_utils/requirements/tests.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa23e69da611f7dec62cf84541b7b508f4437a26 --- /dev/null +++ b/main/transformer_utils/requirements/tests.txt @@ -0,0 +1,9 @@ +coverage +flake8 +interrogate +isort==4.3.21 +pytest +pytest-runner +smplx>=0.1.28 +xdoctest>=0.10.0 +yapf diff --git a/main/transformer_utils/resources/mmpose-logo.png b/main/transformer_utils/resources/mmpose-logo.png new file mode 100644 index 0000000000000000000000000000000000000000..128e1714f0933d0dfe0ab82d6f8780c48e0edc21 Binary files /dev/null and b/main/transformer_utils/resources/mmpose-logo.png differ diff --git a/main/transformer_utils/setup.cfg b/main/transformer_utils/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..c4d8643bc91a06cc48f0d88b23288e892121249c --- /dev/null +++ b/main/transformer_utils/setup.cfg @@ -0,0 +1,24 @@ +[bdist_wheel] +universal=1 + +[aliases] +test=pytest + +[tool:pytest] +addopts=tests/ + +[yapf] +based_on_style = pep8 +blank_line_before_nested_class_or_def = true +split_before_expression_after_opening_paren = true +split_penalty_import_names=0 +SPLIT_PENALTY_AFTER_OPENING_BRACKET=800 + +[isort] +line_length = 79 +multi_line_output = 0 +extra_standard_library = 
pkg_resources,setuptools +known_first_party = mmpose +known_third_party = PIL,cv2,h5py,json_tricks,matplotlib,mmcv,munkres,numpy,pytest,pytorch_sphinx_theme,requests,scipy,seaborn,spacepy,titlecase,torch,torchvision,webcam_apis,xmltodict,xtcocotools +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY diff --git a/main/transformer_utils/setup.py b/main/transformer_utils/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..13b4b1fba1c3da1e561d6480b11debb4e39b796a --- /dev/null +++ b/main/transformer_utils/setup.py @@ -0,0 +1,194 @@ +import os +import os.path as osp +import platform +import shutil +import sys +import warnings +from setuptools import find_packages, setup + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +version_file = 'mmpose/version.py' + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + import sys + + # return short version for sdist + if 'sdist' in sys.argv or 'bdist_wheel' in sys.argv: + return locals()['short_version'] + else: + return locals()['__version__'] + + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a requirements file but strips + specific versioning information. + + Args: + fname (str): path to requirements file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + import re + import sys + from os.path import exists + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + elif '@git+' in line: + info['package'] = line + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + for info in parse_line(line): + yield info + + def gen_packages_items(): + if exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + return packages + + +def add_mim_extension(): + """Add extra files that are required to support MIM into the package. 
+ + These files will be added by creating a symlink to the originals if the + package is installed in `editable` mode (e.g. pip install -e .), or by + copying from the originals otherwise. + """ + + # parse installment mode + if 'develop' in sys.argv: + # installed by `pip install -e .` + if platform.system() == 'Windows': + mode = 'copy' + else: + mode = 'symlink' + elif 'sdist' in sys.argv or 'bdist_wheel' in sys.argv: + # installed by `pip install .` + # or create source distribution by `python setup.py sdist` + mode = 'copy' + else: + return + + filenames = ['tools', 'configs', 'demo', 'model-index.yml'] + repo_path = osp.dirname(__file__) + mim_path = osp.join(repo_path, 'mmpose', '.mim') + os.makedirs(mim_path, exist_ok=True) + + for filename in filenames: + if osp.exists(filename): + src_path = osp.join(repo_path, filename) + tar_path = osp.join(mim_path, filename) + + if osp.isfile(tar_path) or osp.islink(tar_path): + os.remove(tar_path) + elif osp.isdir(tar_path): + shutil.rmtree(tar_path) + + if mode == 'symlink': + src_relpath = osp.relpath(src_path, osp.dirname(tar_path)) + os.symlink(src_relpath, tar_path) + elif mode == 'copy': + if osp.isfile(src_path): + shutil.copyfile(src_path, tar_path) + elif osp.isdir(src_path): + shutil.copytree(src_path, tar_path) + else: + warnings.warn(f'Cannot copy file {src_path}.') + else: + raise ValueError(f'Invalid mode {mode}') + + +if __name__ == '__main__': + add_mim_extension() + setup( + name='mmpose', + version=get_version(), + description='OpenMMLab Pose Estimation Toolbox and Benchmark.', + author='MMPose Contributors', + author_email='openmmlab@gmail.com', + keywords='computer vision, pose estimation', + long_description=readme(), + long_description_content_type='text/markdown', + packages=find_packages(exclude=('configs', 'tools', 'demo')), + include_package_data=True, + package_data={'mmpose.ops': ['*/*.so']}, + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + ], + url='https://github.com/open-mmlab/mmpose', + license='Apache License 2.0', + install_requires=parse_requirements('requirements/runtime.txt'), + extras_require={ + 'tests': parse_requirements('requirements/tests.txt'), + 'build': parse_requirements('requirements/build.txt'), + 'runtime': parse_requirements('requirements/runtime.txt'), + 'mim': parse_requirements('requirements/mminstall.txt'), + }, + zip_safe=False) diff --git a/packages.txt b/packages.txt new file mode 100644 index 0000000000000000000000000000000000000000..d23342ec0faa2a5916c416869bc62b62dc94baab --- /dev/null +++ b/packages.txt @@ -0,0 +1,14 @@ +libglfw3-dev +libgles2-mesa-dev +libgl1 +freeglut3-dev +zip +unzip +ffmpeg +libsm6 +libxext6 +libgl1-mesa-dri +libegl1-mesa +libgbm1 +build-essential +libturbojpeg \ No newline at end of file diff --git a/pre-requirements.txt b/pre-requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e1c5f6a2c1603941b6cc4be0c03ca373e6327ec --- /dev/null +++ b/pre-requirements.txt @@ -0,0 +1,6 @@ +numpy==1.23 + +--extra-index-url https://download.pytorch.org/whl/cu118 +torch==2.0.0+cu118 +torchvision==0.15.0+cu118 +torchaudio==2.0.0+cu118 \ No newline at end of file diff --git 
a/pretrained_models/mmdet/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth b/pretrained_models/mmdet/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth new file mode 100644 index 0000000000000000000000000000000000000000..fbdd2e26cc16db3c55149dad743e001aba4198d8 --- /dev/null +++ b/pretrained_models/mmdet/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047c8118fc5ca88ba5ae1fab72f2cd6b070501fe3af2f3cba5cfa9a89b44b03e +size 167287506 diff --git a/pretrained_models/mmdet/mmdet_faster_rcnn_r50_fpn_coco.py b/pretrained_models/mmdet/mmdet_faster_rcnn_r50_fpn_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b9a25c5586041ff25447e0bad521858eecc10cf0 --- /dev/null +++ b/pretrained_models/mmdet/mmdet_faster_rcnn_r50_fpn_coco.py @@ -0,0 +1,182 @@ +checkpoint_config = dict(interval=1) +# yapf: disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf: enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None +workflow = [('train', 1)] +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +total_epochs = 12 + +model = dict( + type='FasterRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + 
sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) + +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(interval=1, metric='bbox') \ No newline at end of file diff --git a/pretrained_models/smpler_x_h32.pth.tar b/pretrained_models/smpler_x_h32.pth.tar new file mode 100644 index 0000000000000000000000000000000000000000..eead6378fc7b3cf1e5d00fe53c39f96b1840e2e8 --- /dev/null +++ b/pretrained_models/smpler_x_h32.pth.tar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f9bc3bda088758e8e16cdf381b0dffabd5e6bc07326027b88d6f54aa3a215c7 +size 7942394512 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9b09095f601c5e554450d5828707e0ffbc76831 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,31 @@ +--extra-index-url https://download.openmmlab.com/mmcv/dist/cu118/torch2.0/index.html +https://download.openmmlab.com/mmcv/dist/cu118/torch2.0.0/mmcv_full-1.7.2-cp38-cp38-manylinux1_x86_64.whl + +scikit-image +scipy +scikit-learn +smplx==0.1.28 +tqdm +yacs +numba +opencv-python +tensorboardx +filterpy +cython +chumpy +Pillow +trimesh +pyrender +matplotlib +json_tricks +torchgeometry +einops +joblib +boto3 +requests +easydict +pycocotools +plyfile +timm +pyglet +mmdet==2.26.0 \ No newline at end of file
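
Note (not part of the patch): pre-requirements.txt pins the CUDA 11.8 torch/torchvision wheels that the prebuilt mmcv_full wheel in requirements.txt targets, and Hugging Face Spaces installs pre-requirements.txt before requirements.txt, so the two files are intended to be resolved in that order. To show what the mmdet config and LFS-tracked checkpoint added above provide, here is a minimal usage sketch with the mmdet 2.x API pinned in requirements.txt (mmdet==2.26.0); the frame path and the 0.5 score threshold are placeholder values, and in the Space itself the detector is driven through main.inference.Inferer rather than called directly.

# Usage sketch only, not part of this diff: run the bundled Faster R-CNN
# person detector on a single frame. Paths point at the files added above;
# 'frame.jpg' and the 0.5 threshold are illustrative placeholders.
from mmdet.apis import init_detector, inference_detector

CONFIG = 'pretrained_models/mmdet/mmdet_faster_rcnn_r50_fpn_coco.py'
CHECKPOINT = 'pretrained_models/mmdet/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'

detector = init_detector(CONFIG, CHECKPOINT, device='cuda:0')
result = inference_detector(detector, 'frame.jpg')  # list of per-class (N, 5) arrays
persons = result[0]                                 # COCO class index 0 is 'person'
persons = persons[persons[:, 4] > 0.5]              # rows are [x1, y1, x2, y2, score]
print(f'{len(persons)} person box(es) above threshold')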