# Copyright (c) Facebook, Inc. and its affiliates.

import os
import os.path as osp
import sys
import numpy as np
import cv2
import math

import torch
import torchvision.transforms as transforms
# from PIL import Image

# Code from https://github.com/Daniil-Osokin/lightweight-human-pose-estimation.pytorch/blob/master/demo.py

# 2D body pose estimator (the appended path is environment-specific)
sys.path.append('/apdcephfs/share_1474453/zejunzhang/workspace/HR-VITON/dataset_process_utils/lite_openpose')
from pose2d_models.with_mobilenet import PoseEstimationWithMobileNet
from modules.load_state import load_state
from modules.pose import Pose, track_poses
from modules.keypoints import extract_keypoints, group_keypoints


def normalize(img, img_mean, img_scale):
    """Subtract the per-channel mean and rescale pixel intensities."""
    img = np.array(img, dtype=np.float32)
    img = (img - img_mean) * img_scale
    return img
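
# Example: with img_mean=(128, 128, 128) and img_scale=1/256 (the values
# __infer_fast passes below), pixel intensities in [0, 255] are mapped to
# roughly [-0.5, 0.5] before being fed to the network.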


def pad_width(img, stride, pad_value, min_dims):
    """Pad img with constant borders so both sides reach min_dims,
    rounded up to multiples of stride.

    Returns the padded image and the pad sizes as [top, left, bottom, right].
    """
    h, w, _ = img.shape
    h = min(min_dims[0], h)
    min_dims[0] = math.ceil(min_dims[0] / float(stride)) * stride
    min_dims[1] = max(min_dims[1], w)
    min_dims[1] = math.ceil(min_dims[1] / float(stride)) * stride
    pad = []
    pad.append(int(math.floor((min_dims[0] - h) / 2.0)))
    pad.append(int(math.floor((min_dims[1] - w) / 2.0)))
    pad.append(int(min_dims[0] - h - pad[0]))
    pad.append(int(min_dims[1] - w - pad[1]))
    padded_img = cv2.copyMakeBorder(img, pad[0], pad[2], pad[1], pad[3],
                                    cv2.BORDER_CONSTANT, value=pad_value)
    return padded_img, pad
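
# Illustrative example: for a 256x196 scaled image with stride=8 and
# min_dims=[256, 256] (the values produced in __infer_fast), pad_width adds
# 30 px of constant padding on the left and right, returning a 256x256 image
# whose sides are multiples of the stride, along with pad=[0, 30, 0, 30].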


class BodyPoseEstimator(object):
    """
    2D body pose estimator for third-person-view input.
    It wraps a lightweight OpenPose model
    (https://github.com/jhugestar/lightweight-human-pose-estimation.pytorch.git).
    """
    def __init__(self, device='cpu'):
        self.device = device
        self.__load_body_estimator()

    def __load_body_estimator(self):
        net = PoseEstimationWithMobileNet()
        pose2d_checkpoint = "lite_openpose/checkpoint_iter_370000.pth"  # resolved relative to the current working directory
        checkpoint = torch.load(pose2d_checkpoint, map_location='cpu')
        load_state(net, checkpoint)
        net = net.eval()
        net = net.to(self.device)
        self.model = net
    

    # Code from https://github.com/Daniil-Osokin/lightweight-human-pose-estimation.pytorch/blob/master/demo.py
    def __infer_fast(self, img, input_height_size, stride, upsample_ratio,
                     cpu=False, pad_value=(0, 0, 0), img_mean=(128, 128, 128), img_scale=1/256):
        height, width, _ = img.shape
        scale = input_height_size / height

        scaled_img = cv2.resize(img, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
        scaled_img = normalize(scaled_img, img_mean, img_scale)
        min_dims = [input_height_size, max(scaled_img.shape[1], input_height_size)]
        padded_img, pad = pad_width(scaled_img, stride, pad_value, min_dims)

        tensor_img = torch.from_numpy(padded_img).permute(2, 0, 1).unsqueeze(0).float()
        if not cpu:
            tensor_img = tensor_img.to(self.device)
        
        with torch.no_grad():
            stages_output = self.model(tensor_img)

        # stage-2 outputs: keypoint heatmaps and part affinity fields (PAFs)
        stage2_heatmaps = stages_output[-2]
        heatmaps = np.transpose(stage2_heatmaps.squeeze().cpu().numpy(), (1, 2, 0))
        heatmaps = cv2.resize(heatmaps, (0, 0), fx=upsample_ratio, fy=upsample_ratio, interpolation=cv2.INTER_CUBIC)

        stage2_pafs = stages_output[-1]
        pafs = np.transpose(stage2_pafs.squeeze().cpu().numpy(), (1, 2, 0))
        pafs = cv2.resize(pafs, (0, 0), fx=upsample_ratio, fy=upsample_ratio, interpolation=cv2.INTER_CUBIC)

        return heatmaps, pafs, scale, pad
    
    def detect_body_pose(self, img):
        """
        Estimate 2D body keypoints for every person in a BGR image.

        Output:
            current_poses: list of (num_kpts, 2) keypoint arrays (-1 marks missing joints)
            current_bbox: list of enlarged person boxes in LTRB (x0, y0, x1, y1) format
            pose_dict: the first person's keypoints in the OpenPose JSON layout
        """
        stride = 8
        upsample_ratio = 4
        orig_img = img.copy()

        # forward
        heatmaps, pafs, scale, pad = self.__infer_fast(img, 
            input_height_size=256, stride=stride, upsample_ratio=upsample_ratio)

        total_keypoints_num = 0
        all_keypoints_by_type = []
        num_keypoints = Pose.num_kpts
        for kpt_idx in range(num_keypoints):  # skip the 19th heatmap channel (background)
            total_keypoints_num += extract_keypoints(heatmaps[:, :, kpt_idx], all_keypoints_by_type, total_keypoints_num)
        
        pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, pafs, demo=True)
        # map keypoints from heatmap coordinates back to the original image
        for kpt_id in range(all_keypoints.shape[0]):
            all_keypoints[kpt_id, 0] = (all_keypoints[kpt_id, 0] * stride / upsample_ratio - pad[1]) / scale
            all_keypoints[kpt_id, 1] = (all_keypoints[kpt_id, 1] * stride / upsample_ratio - pad[0]) / scale
        
        # NOTE: all detected people are kept in current_poses/current_bbox;
        # only the first pose is exported to the OpenPose-format dict below.

        current_poses, current_bbox = list(), list()
        for n in range(len(pose_entries)):
            if len(pose_entries[n]) == 0:
                continue
            pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1
            for kpt_id in range(num_keypoints):
                if pose_entries[n][kpt_id] != -1.0:  # keypoint was found
                    pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0])
                    pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1])
            pose = Pose(pose_keypoints, pose_entries[n][18])  # entry 18 holds the pose confidence score
            current_poses.append(pose.keypoints)
            current_bbox.append(np.array(pose.bbox))

        # enlarge each bbox by a 20% margin and convert from XYWH to LTRB
        for i, bbox in enumerate(current_bbox):
            x, y, w, h = bbox
            margin = 0.2
            x_margin = int(w * margin)
            y_margin = int(h * margin)
            x0 = max(x-x_margin, 0)
            y0 = max(y-y_margin, 0)
            x1 = min(x+w+x_margin, orig_img.shape[1])
            y1 = min(y+h+y_margin, orig_img.shape[0])
            current_bbox[i] = np.array((x0, y0, x1, y1)).astype(np.int32) # ltrb

        # keep only the first detected person
        body_point_list = []
        if len(current_poses) > 0:
            for item in current_poses[0]:
                if item[0] == item[1] == -1:  # missing keypoint
                    body_point_list += [0.0, 0.0, 0.0]
                else:
                    body_point_list += [float(item[0]), float(item[1]), 1.0]  # (x, y, confidence)
        else:
            for i in range(18):
                body_point_list += [0.0, 0.0, 0.0]

        # assemble the first person's keypoints in the OpenPose JSON layout
        pose_dict = dict()
        pose_dict["people"] = []
        pose_dict["people"].append({
            "person_id": [-1],
            "pose_keypoints_2d": body_point_list,
            "hand_left_keypoints_2d": [],
            "hand_right_keypoints_2d": [],
            "face_keypoints_2d": [],
            "pose_keypoints_3d": [],
            "face_keypoints_3d": [],
            "hand_left_keypoints_3d": [],
            "hand_right_keypoints_3d": [],
        })

        return current_poses, current_bbox, pose_dict
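
if __name__ == "__main__":
    # Minimal usage sketch under stated assumptions: "person.jpg" and
    # "person_keypoints.json" are placeholder paths, and the lite_openpose
    # checkpoint referenced above must be available for loading to succeed.
    import json

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    estimator = BodyPoseEstimator(device=device)

    image = cv2.imread('person.jpg')  # BGR image, as the estimator expects
    if image is None:
        sys.exit("could not read input image")

    poses, bboxes, pose_dict = estimator.detect_body_pose(image)
    print("detected %d person(s)" % len(poses))

    # persist the first person's keypoints in the OpenPose JSON layout
    with open('person_keypoints.json', 'w') as f:
        json.dump(pose_dict, f)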