Dref360 committed
Commit afb0729
1 Parent(s): 1abd45a

First commit

Files changed (2)
  1. app.py +221 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,221 @@
+ import gradio as gr
+ import torch
+ import numpy as np
+ import cv2
+ from PIL import Image
+ import supervision as sv
+ from transformers import (
+     RTDetrForObjectDetection,
+     RTDetrImageProcessor,
+     VitPoseConfig,
+     VitPoseForPoseEstimation,
+     VitPoseImageProcessor,
+ )
+
+
+ KEYPOINT_LABEL_MAP = {
+     0: "Nose",
+     1: "L_Eye",
+     2: "R_Eye",
+     3: "L_Ear",
+     4: "R_Ear",
+     5: "L_Shoulder",
+     6: "R_Shoulder",
+     7: "L_Elbow",
+     8: "R_Elbow",
+     9: "L_Wrist",
+     10: "R_Wrist",
+     11: "L_Hip",
+     12: "R_Hip",
+     13: "L_Knee",
+     14: "R_Knee",
+     15: "L_Ankle",
+     16: "R_Ankle",
+ }
+
+
+ class KeypointDetector:
+     def __init__(self):
+         self.person_detector = None
+         self.person_processor = None
+         self.pose_model = None
+         self.pose_processor = None
+         self.load_models()
+
+     def load_models(self):
+         """Load all required models"""
+         # Object detection model
+         self.person_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
+         self.person_detector = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
+
+         # Pose estimation model
+         self.pose_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple")
+         self.pose_model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple")
+
+     @staticmethod
+     def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray:
+         """Convert Pascal VOC format (x1,y1,x2,y2) to COCO format (x,y,w,h)"""
+         bboxes = bboxes.copy()  # Create a copy to avoid modifying the input
+         bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
+         bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
+         return bboxes
+
+     @staticmethod
+     def coco_to_xyxy(bboxes: np.ndarray) -> np.ndarray:
+         """Convert COCO format (x,y,w,h) to xyxy format (x1,y1,x2,y2)"""
+         bboxes = bboxes.copy()
+         bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
+         bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
+         return bboxes
+
+     def detect_persons(self, image: Image.Image):
+         """Detect persons in the image"""
+         inputs = self.person_processor(images=image, return_tensors="pt")
+         with torch.no_grad():
+             outputs = self.person_detector(**inputs)
+
+         results = self.person_processor.post_process_object_detection(
+             outputs,
+             target_sizes=torch.tensor([(image.height, image.width)]),
+             threshold=0.3
+         )
+
+         # Get boxes and scores for the person class (label 0 in this model's COCO label map)
+         boxes = results[0]["boxes"][results[0]["labels"] == 0]
+         scores = results[0]["scores"][results[0]["labels"] == 0]
+         return boxes.cpu().numpy(), scores.cpu().numpy()
+
+     def detect_keypoints(self, image: Image.Image):
+         """Detect keypoints in the image"""
+         # Detect persons first
+         boxes, scores = self.detect_persons(image)
+         boxes_coco = [self.pascal_voc_to_coco(boxes)]
+
+         # Detect pose keypoints
+         pixel_values = self.pose_processor(image, boxes=boxes_coco, return_tensors="pt").pixel_values
+         with torch.no_grad():
+             outputs = self.pose_model(pixel_values)
+
+         pose_results = self.pose_processor.post_process_pose_estimation(outputs, boxes=boxes_coco)[0]
+         return pose_results, boxes, scores
+
+     def visualize_detections(self, image: Image.Image, pose_results, boxes, scores):
+         """Visualize both bounding boxes and keypoints on the image"""
+         # Convert image to numpy array if needed
+         image_array = np.array(image)
+
+         # Setup detections for bounding boxes
+         detections = sv.Detections(
+             xyxy=boxes,  # detect_persons already returns boxes in xyxy (Pascal VOC) format
+             confidence=scores,
+             class_id=np.array([0]*len(scores))
+         )
+
+         # Create box annotator
+         box_annotator = sv.BoxAnnotator(
+             color=sv.ColorPalette.DEFAULT,
+             thickness=2
+         )
+
+         # Create edge annotator for keypoints
+         edge_annotator = sv.EdgeAnnotator(
+             color=sv.Color.GREEN,
+             thickness=3
+         )
+
+         # Convert keypoints to supervision format
+         key_points = sv.KeyPoints(
+             xy=torch.cat([pose_result['keypoints'].unsqueeze(0) for pose_result in pose_results]).cpu().numpy()
+         )
+
+         # Annotate image with boxes first
+         annotated_frame = box_annotator.annotate(
+             scene=image_array.copy(),
+             detections=detections
+         )
+
+         # Then add keypoints
+         annotated_frame = edge_annotator.annotate(
+             scene=annotated_frame,
+             key_points=key_points
+         )
+
+         return Image.fromarray(annotated_frame)
+
+     def process_image(self, input_image):
+         """Process image and return visualization"""
+         if input_image is None:
+             return None, ""
+
+         # Convert to PIL Image if necessary
+         if isinstance(input_image, np.ndarray):
+             image = Image.fromarray(input_image)
+         else:
+             image = input_image
+
+         # Detect keypoints and boxes
+         pose_results, boxes, scores = self.detect_keypoints(image)
+
+         # Visualize results
+         result_image = self.visualize_detections(image, pose_results, boxes, scores)
+
+         # Create detection information text
+         info_text = []
+
+         # Box information
+         for i, (box, score) in enumerate(zip(boxes, scores)):
+             info_text.append(f"\nPerson {i + 1} (confidence: {score:.2f})")
+             info_text.append(f"Bounding Box: x1={box[0]:.1f}, y1={box[1]:.1f}, x2={box[2]:.1f}, y2={box[3]:.1f}")
+
+             # Add keypoint information for this person
+             pose_result = pose_results[i]
+             for j, keypoint in enumerate(pose_result["keypoints"]):
+                 x, y, confidence = keypoint
+                 info_text.append(f"Keypoint {KEYPOINT_LABEL_MAP[j]}: x={x:.1f}, y={y:.1f}, confidence={confidence:.2f}")
+
+         return result_image, "\n".join(info_text)
+
+
+ def create_gradio_interface():
+     """Create Gradio interface"""
+     detector = KeypointDetector()
+
+     with gr.Blocks() as interface:
+         gr.Markdown("# Human Detection and Keypoint Estimation using VitPose")
+         gr.Markdown("Upload an image to detect people and their keypoints. The model will:")
+         gr.Markdown("1. Detect people in the image (shown as bounding boxes)")
+         gr.Markdown("2. Identify keypoints for each detected person (shown as connected green lines)")
+         gr.Markdown("Huge shoutout to @NielsRogge and @SangbumChoi for this work!")
+
+         with gr.Row():
+             with gr.Column():
+                 input_image = gr.Image(label="Input Image")
+                 process_button = gr.Button("Detect People & Keypoints")
+
+             with gr.Column():
+                 output_image = gr.Image(label="Detection Results")
+                 detection_info = gr.Textbox(
+                     label="Detection Information",
+                     lines=10,
+                     placeholder="Detection details will appear here..."
+                 )
+
+         process_button.click(
+             fn=detector.process_image,
+             inputs=input_image,
+             outputs=[output_image, detection_info]
+         )
+
+         gr.Examples(
+             examples=[
+                 "http://images.cocodataset.org/val2017/000000000139.jpg"
+             ],
+             inputs=input_image
+         )
+
+     return interface
+
+
+ if __name__ == "__main__":
+     interface = create_gradio_interface()
+     interface.launch()
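
As a quick sanity check on the two box-format helpers above, here is a minimal worked example (illustrative only, assuming the file is saved as app.py): a Pascal VOC box [x1, y1, x2, y2] = [10, 20, 110, 220] becomes the COCO box [x, y, w, h] = [10, 20, 100, 200], and coco_to_xyxy maps it back.

import numpy as np
from app import KeypointDetector

voc = np.array([[10.0, 20.0, 110.0, 220.0]])     # [x1, y1, x2, y2]
coco = KeypointDetector.pascal_voc_to_coco(voc)  # -> [[10., 20., 100., 200.]]  (x, y, w, h)
back = KeypointDetector.coco_to_xyxy(coco)       # -> [[10., 20., 110., 220.]]
assert np.allclose(back, voc)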
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ git+https://github.com/NielsRogge/transformers.git@add_vitpose_autobackbone
+ supervision==0.24.0
+ torch==2.5.1
+ gradio==4.44.1
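
The detector can also be driven outside the Gradio UI. A minimal local smoke test, assuming the file above is saved as app.py and that requests is installed (it is not listed in requirements.txt); it mirrors what the "Detect People & Keypoints" button does, using the same COCO image as gr.Examples:

from PIL import Image
import requests

from app import KeypointDetector

# Download the example image used in the Gradio demo
url = "http://images.cocodataset.org/val2017/000000000139.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# Run person detection + pose estimation and save the annotated output
detector = KeypointDetector()
annotated, info = detector.process_image(image)
annotated.save("annotated.jpg")
print(info)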