JulianPhillips committed
Commit 85612d7 · verified · 1 Parent(s): 1827bea

Update app.py

Files changed (1)
  1. app.py +253 -42
app.py CHANGED
@@ -1,56 +1,267 @@
  from flask import Flask, request, jsonify
  import torch
- from PIL import Image
- from io import BytesIO
- import torchvision.transforms as transforms
- from transformers import AutoModelForSequenceClassification, AutoTokenizer

- # Load Meta Sapiens Pose model
- sapiens_model = torch.jit.load('/models/sapiens_pose/model.pt')
- sapiens_model.eval()

- # Load MotionBERT model
- motionbert_model = AutoModelForSequenceClassification.from_pretrained('/models/motionbert')
- motionbert_tokenizer = AutoTokenizer.from_pretrained('/models/motionbert')

- # Flask app
- app = Flask(__name__)

- # Define a transformation for input images
- transform = transforms.Compose([
-     transforms.Resize((256, 256)),  # Resize image to the required size
-     transforms.ToTensor(),          # Convert image to PyTorch tensor
- ])

- @app.route('/pose_estimation', methods=['POST'])
- def pose_estimation():
-     try:
-         # Accept an image file as input for pose estimation
-         image = request.files['image']
-         img = Image.open(BytesIO(image.read()))
-
-         # Preprocess the image
-         img_tensor = transform(img).unsqueeze(0)  # Add batch dimension
-
-         # Perform pose estimation
-         with torch.no_grad():
-             pose_result = sapiens_model(img_tensor)
-
-         return jsonify({"pose_result": pose_result.tolist()})
-     except Exception as e:
-         return jsonify({"error": str(e)}), 500

- @app.route('/sequence_analysis', methods=['POST'])
- def sequence_analysis():
      try:
-         # Accept keypoint data as input for sequence analysis
-         keypoints = request.json['keypoints']
-         inputs = motionbert_tokenizer(keypoints, return_tensors="pt")

-         with torch.no_grad():
-             sequence_output = motionbert_model(**inputs)

-         return jsonify({"sequence_analysis": sequence_output.logits.tolist()})
      except Exception as e:
          return jsonify({"error": str(e)}), 500
 
  from flask import Flask, request, jsonify
+ import cv2
+ import numpy as np
+ import tensorflow as tf
+ from transformers import BlipProcessor, BlipForConditionalGeneration, CLIPProcessor, CLIPModel
  import torch
+ import os
+ import requests
+ from tempfile import NamedTemporaryFile

+ # Load MoveNet model
+ movenet_model_path = '/models/movenet/movenet_lightning'
+ movenet_model = tf.saved_model.load(movenet_model_path)

+ # Load BLIP model
+ blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-large')
+ blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-large')

+ # Load CLIP model
+ clip_model = CLIPModel.from_pretrained('openai/clip-vit-large-patch14')
+ clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')

+ # Keypoint dictionary for reference
+ KEYPOINT_DICT = {
+     'nose': 0,
+     'left_eye': 1,
+     'right_eye': 2,
+     'left_ear': 3,
+     'right_ear': 4,
+     'left_shoulder': 5,
+     'right_shoulder': 6,
+     'left_elbow': 7,
+     'right_elbow': 8,
+     'left_wrist': 9,
+     'right_wrist': 10,
+     'left_hip': 11,
+     'right_hip': 12,
+     'left_knee': 13,
+     'right_knee': 14,
+     'left_ankle': 15,
+     'right_ankle': 16
+ }

+ app = Flask(__name__)

+ @app.route('/process_video', methods=['POST'])
+ def process_video():
      try:
+         # Get the video URL from the request
+         video_url = request.json.get('videoURL')
+         height = request.json.get('height')
+         weight = request.json.get('weight')
+         wingspan = request.json.get('wingspan')
+
+
+         if not video_url:
+             return jsonify({"error": "No video URL provided"}), 400
+
+
+         if not all([height, weight, wingspan]):
+             return jsonify({"error": "Height, weight, and wingspan are required"}), 400
+
+
+         # Download the video from the S3 URL
+         with NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video_file:
+             response = requests.get(video_url)
+             if response.status_code != 200:
+                 return jsonify({"error": "Failed to download video from the provided URL"}), 400
+             temp_video_file.write(response.content)
+             video_path = temp_video_file.name
+
+         # Open the video file
+         cap = cv2.VideoCapture(video_path)
+         frames = []
+
+         # Extract 60 frames from the video
+         success, frame = cap.read()
+         frame_count = 0
+         while success and frame_count < 60:
+             frames.append(frame)
+             success, frame = cap.read()
+             frame_count += 1
+
+         cap.release()
+         os.remove(video_path)
+
+         # Process each frame with MoveNet (to get 3D keypoints and detect stance)
+         movenet_results = []
+         stances = []
+         hip_rotations = []
+         arm_extensions = []
+         stepping_jabs = []
+         guard_up = []
+         hand_returned = []
+         hips_width_apart = []
+         leg_angle_correct = []
+         punch_started = False
+         initial_left_wrist = None
+         initial_right_wrist = None
+
+         for frame_index, frame in enumerate(frames):
+             input_tensor = tf.image.resize_with_pad(tf.convert_to_tensor(frame, dtype=tf.uint8), 256, 256)
+             input_tensor = tf.cast(input_tensor, dtype=tf.float32)
+             input_tensor = tf.expand_dims(input_tensor, axis=0)
+             keypoints = movenet_model.signatures['serving_default'](input_tensor)
+             keypoints_3d = keypoints['output_0'][0].numpy().tolist()  # Assuming the model returns 3D keypoints
+             movenet_results.append(keypoints_3d)
+
+             # Detect stance based on keypoints (using ankles and wrists)
+             left_ankle = keypoints_3d[KEYPOINT_DICT['left_ankle']]
+             right_ankle = keypoints_3d[KEYPOINT_DICT['right_ankle']]
+             left_wrist = keypoints_3d[KEYPOINT_DICT['left_wrist']]
+             right_wrist = keypoints_3d[KEYPOINT_DICT['right_wrist']]
+
+             if right_ankle[0] < left_ankle[0] and right_wrist[0] < left_wrist[0]:
+                 stance = "orthodox"
+             elif left_ankle[0] < right_ankle[0] and left_wrist[0] < right_wrist[0]:
+                 stance = "southpaw"
+             else:
+                 stance = "unknown"
+             stances.append(stance)
+
+             # Detect if guard is up (both hands near eye level at the side of the head)
+             nose = keypoints_3d[KEYPOINT_DICT['nose']]
+             guard_threshold = 0.1  # Threshold distance to consider hands near the head
+             left_hand_near_head = abs(left_wrist[1] - nose[1]) < guard_threshold
+             right_hand_near_head = abs(right_wrist[1] - nose[1]) < guard_threshold
+             guard_up.append(left_hand_near_head and right_hand_near_head)
+
+             # Determine if the punch has started (based on wrist movement)
+             if frame_index > 0:
+                 previous_left_wrist = movenet_results[frame_index - 1][KEYPOINT_DICT['left_wrist']]
+                 previous_right_wrist = movenet_results[frame_index - 1][KEYPOINT_DICT['right_wrist']]
+
+                 if stance == "orthodox" and (left_wrist[0] - previous_left_wrist[0]) > 0.05:
+                     punch_started = True
+                     if initial_left_wrist is None:
+                         initial_left_wrist = left_wrist
+                 elif stance == "southpaw" and (right_wrist[0] - previous_right_wrist[0]) > 0.05:
+                     punch_started = True
+                     if initial_right_wrist is None:
+                         initial_right_wrist = right_wrist
+
+             # Detect hip rotation (based on left and right hips, considering stance and punch start)
+             left_hip = keypoints_3d[KEYPOINT_DICT['left_hip']]
+             right_hip = keypoints_3d[KEYPOINT_DICT['right_hip']]
+             if punch_started:
+                 if stance == "orthodox":
+                     hip_rotation = right_hip[0] - left_hip[0]  # Right hip should move forward
+                 elif stance == "southpaw":
+                     hip_rotation = left_hip[0] - right_hip[0]  # Left hip should move forward
+                 else:
+                     hip_rotation = 0
+             else:
+                 hip_rotation = 0
+             hip_rotations.append(hip_rotation)
+
+             # Detect full arm extension (based on shoulder, elbow, and wrist, considering stance)
+             left_shoulder = keypoints_3d[KEYPOINT_DICT['left_shoulder']]
+             left_elbow = keypoints_3d[KEYPOINT_DICT['left_elbow']]
+             right_shoulder = keypoints_3d[KEYPOINT_DICT['right_shoulder']]
+             right_elbow = keypoints_3d[KEYPOINT_DICT['right_elbow']]
+
+             if stance == "orthodox":
+                 lead_arm_extension = np.linalg.norm(np.array(left_wrist) - np.array(left_shoulder))
+             elif stance == "southpaw":
+                 lead_arm_extension = np.linalg.norm(np.array(right_wrist) - np.array(right_shoulder))
+             else:
+                 lead_arm_extension = 0
+             arm_extensions.append(lead_arm_extension)
+
+             # Detect stepping with the jab and coming back (based on ankles, considering stance and punch start)
+             if punch_started and frame_index > 0:
+                 previous_left_ankle = movenet_results[frame_index - 1][KEYPOINT_DICT['left_ankle']]
+                 previous_right_ankle = movenet_results[frame_index - 1][KEYPOINT_DICT['right_ankle']]
+
+                 if stance == "orthodox":
+                     step_movement = (left_ankle[0] - previous_left_ankle[0]) > 0.05  # Lead foot is left
+                 elif stance == "southpaw":
+                     step_movement = (right_ankle[0] - previous_right_ankle[0]) > 0.05  # Lead foot is right
+                 else:
+                     step_movement = False
+                 stepping_jabs.append(step_movement)
+             else:
+                 stepping_jabs.append(False)
+
+             # Detect if the hand returns to the initial position after the punch
+             if punch_started:
+                 if stance == "orthodox" and initial_left_wrist is not None:
+                     hand_returned.append(np.linalg.norm(np.array(left_wrist) - np.array(initial_left_wrist)) < 0.05)
+                 elif stance == "southpaw" and initial_right_wrist is not None:
+                     hand_returned.append(np.linalg.norm(np.array(right_wrist) - np.array(initial_right_wrist)) < 0.05)
+                 else:
+                     hand_returned.append(False)
+             else:
+                 hand_returned.append(False)
+
+             # Detect if hips are shoulder width apart
+             left_shoulder = keypoints_3d[KEYPOINT_DICT['left_shoulder']]
+             right_shoulder = keypoints_3d[KEYPOINT_DICT['right_shoulder']]
+             shoulder_width = abs(left_shoulder[0] - right_shoulder[0])
+             hips_width = abs(left_hip[0] - right_hip[0])
+             hips_width_apart.append(hips_width > 0.9 * shoulder_width and hips_width < 1.1 * shoulder_width)
+
+             # Detect if the back leg is at a 45 degree angle outward (for orthodox and southpaw)
+             if stance == "orthodox":
+                 right_leg_angle = np.arctan2(right_ankle[1] - right_hip[1], right_ankle[0] - right_hip[0]) * 180 / np.pi
+                 leg_angle_correct.append(40 <= right_leg_angle <= 50)
+             elif stance == "southpaw":
+                 left_leg_angle = np.arctan2(left_ankle[1] - left_hip[1], left_ankle[0] - left_hip[0]) * 180 / np.pi
+                 leg_angle_correct.append(40 <= left_leg_angle <= 50)
+             else:
+                 leg_angle_correct.append(False)
+
+         # Generate captions for all 60 frames using BLIP
+         captions = []
+         for frame in frames:
+             inputs = blip_processor(images=frame, return_tensors="pt")
+             with torch.no_grad():
+                 caption = blip_model.generate(**inputs)
+             captions.append(blip_processor.decode(caption[0], skip_special_tokens=True))
+
+         # Use CLIP to assess the similarity of frames to a Muay Thai jab prompt, including stance
+         clip_results = []
+         for i, frame in enumerate(frames):
+             stance = stances[i]
+             prompt = f"A person performing a Muay Thai jab in {stance} stance at {height} in in height, {weight} lbs in weight, and a wingspan of {wingspan} cm, with hip rotation of {hip_rotations[i]:.2f}, arm extension of {arm_extensions[i]:.2f}, {'stepping forward' if stepping_jabs[i] else 'not stepping'}, {'guard up' if guard_up[i] else 'guard down'}, {'hand returned to initial position' if hand_returned[i] else 'hand not returned'}, {'hips shoulder width apart' if hips_width_apart[i] else 'hips not shoulder width apart'}, and {'correct leg angle' if leg_angle_correct[i] else 'incorrect leg angle'}"
+             text_inputs = clip_processor(text=[prompt], return_tensors="pt")
+             image_inputs = clip_processor(images=frame, return_tensors="pt")
+             with torch.no_grad():
+                 image_features = clip_model.get_image_features(**image_inputs)
+                 text_features = clip_model.get_text_features(**text_inputs)
+             similarity = torch.nn.functional.cosine_similarity(image_features, text_features)
+             clip_results.append(similarity.item())
+
+         # Calculate score based on CLIP results and BLIP captions
+         avg_clip_similarity = sum(clip_results) / len(clip_results) if clip_results else 0
+         guard_score = sum(guard_up) / len(guard_up) if guard_up else 0
+         hand_return_score = sum(hand_returned) / len(hand_returned) if hand_returned else 0
+         hips_width_score = sum(hips_width_apart) / len(hips_width_apart) if hips_width_apart else 0
+         leg_angle_score = sum(leg_angle_correct) / len(leg_angle_correct) if leg_angle_correct else 0
+         overall_score = (avg_clip_similarity + guard_score + hand_return_score + hips_width_score + leg_angle_score) / 5

+         # Scale the overall score to a range of 0 - 10
+         overall_score = max(0, min(overall_score * 10, 10))

+         # Return combined results
+         response = {
+             "movenet_results": movenet_results,
+             "blip_captions": captions,
+             "clip_similarities": clip_results,
+             "stances": stances,
+             "hip_rotations": hip_rotations,
+             "arm_extensions": arm_extensions,
+             "stepping_jabs": stepping_jabs,
+             "hips_width_apart": hips_width_apart,
+             "leg_angle_correct": leg_angle_correct,
+             "overall_score": overall_score,
+             "guard_score": guard_score,
+             "hand_return_score": hand_return_score,
+             "hips_width_score": hips_width_score,
+             "leg_angle_score": leg_angle_score,
+         }
+         return jsonify(response)
      except Exception as e:
          return jsonify({"error": str(e)}), 500
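
For reference, a minimal client-side sketch of how the new /process_video endpoint introduced in this commit could be exercised. This is an assumption-laden example, not part of the commit: it assumes the Flask app is reachable at http://localhost:5000 (app.py does not configure a host or port in the diff, so the actual address depends on how the Space serves it), and the video URL and body measurements are placeholder values.

import requests

# Placeholder payload; /process_video expects a downloadable video URL plus
# height, weight, and wingspan in the JSON body (units follow the CLIP prompt
# template in app.py: inches, pounds, centimeters).
payload = {
    "videoURL": "https://example.com/jab_clip.mp4",  # hypothetical, publicly reachable MP4
    "height": 70,
    "weight": 155,
    "wingspan": 180,
}

resp = requests.post("http://localhost:5000/process_video", json=payload)
print(resp.status_code)
print(resp.json().get("overall_score"))  # scaled to 0-10 by the endpoint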