"""Extract shoulder-normalized MediaPipe Holistic keypoints from video frames."""

import copy

import cv2
import mediapipe as mp
import numpy as np

mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Reference frame size used to turn normalized coordinates into pixels.
width, height = 640, 480

model = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Number of landmarks contributed by each section of the feature vector.
_POSE_LANDMARK_COUNT = 33
_HAND_LANDMARK_COUNT = 21


def mediapipe_detection(image):
    """Run the module-level Holistic model on ``image`` and return its raw result.

    NOTE(review): MediaPipe documents RGB input; no colour conversion happens
    here, so the caller is presumably expected to pass RGB frames -- confirm.
    """
    results = model.process(image)
    return results


def _hand_keypoints(current, previous):
    """Return 21 ``[x, y, z]`` arrays for one hand.

    Uses ``current`` when it holds landmarks, otherwise ``previous`` (the same
    hand from the last frame), otherwise zero placeholders.
    """
    source = current or previous
    if source:
        return [np.array([p.x, p.y, p.z]) for p in source.landmark]
    return [np.zeros(3) for _ in range(_HAND_LANDMARK_COUNT)]


def extract_keypoint(results, last):
    """Collect pose and hand landmarks as a list of per-landmark arrays.

    Args:
        results: Holistic output for the current frame.
        last: Holistic output for the previous frame, or ``None``. Its hand
            landmarks are reused when the current frame has none for that hand.

    Returns:
        A list of 75 arrays: 33 pose entries ``[x, y, z, visibility]``, then
        21 left-hand and 21 right-hand entries ``[x, y, z]``. Sections with no
        data are filled with float zeros so the flattened vector keeps a
        single dtype.
    """
    res = []
    if results.pose_landmarks:
        res.extend(
            np.array([p.x, p.y, p.z, p.visibility])
            for p in results.pose_landmarks.landmark
        )
    else:
        res.extend(np.zeros(4) for _ in range(_POSE_LANDMARK_COUNT))

    # NOTE(review): landmarks taken from ``last`` were already rewritten with
    # normalized coordinates by update_mpresult on the previous frame, so
    # normalize_keypoint transforms them a second time -- confirm intended.
    previous_left = last.left_hand_landmarks if last is not None else None
    previous_right = last.right_hand_landmarks if last is not None else None
    res.extend(_hand_keypoints(results.left_hand_landmarks, previous_left))
    res.extend(_hand_keypoints(results.right_hand_landmarks, previous_right))
    return res


def normalize_keypoint(res, img=None):
    """Centre and scale keypoints in place using pose landmarks 11 and 12.

    Landmarks 11 and 12 are the shoulders in MediaPipe's pose topology. Their
    midpoint is moved to (0.5, 0.5) and all x/y values are scaled about that
    point so the shoulder distance, measured in ``width`` x ``height`` pixels,
    becomes ``200 * width / 640``. Entries whose x and y are both 0 are treated
    as missing and left untouched.

    Args:
        res: Keypoint list as produced by ``extract_keypoint``; mutated in place.
        img: Optional image on which the two reference points are drawn.

    Returns:
        The same ``res`` list. If the two reference points coincide (for
        example when no pose was detected) the keypoints are returned
        unnormalized instead of being scaled by infinity.
    """
    x1, y1 = res[11][0] * width, res[11][1] * height
    x2, y2 = res[12][0] * width, res[12][1] * height

    if img is not None:
        # Drawing is a best-effort debugging aid; never let it break extraction.
        try:
            cv2.circle(img, (int(x1), int(y1)), 4, (0, 255, 255), -1)
            cv2.circle(img, (int(x2), int(y2)), 4, (0, 255, 255), -1)
        except (cv2.error, TypeError, ValueError, OverflowError):
            pass

    dis = np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
    if not np.isfinite(dis) or dis == 0:
        # No usable reference distance: dividing would yield inf/NaN output.
        return res

    x_cen = (res[11][0] + res[12][0]) / 2
    y_cen = (res[11][1] + res[12][1]) / 2
    shift_x, shift_y = 0.5 - x_cen, 0.5 - y_cen
    scale = (200 * width / 640) / dis

    for point in res:
        if point[0] == 0 and point[1] == 0:
            continue  # zero placeholder for a missing landmark
        point[0] = 0.5 + (shift_x + point[0] - 0.5) * scale
        point[1] = 0.5 + (shift_y + point[1] - 0.5) * scale
    return res


def update_mpresult(res, results, last):
    """Write normalized x/y values from ``res`` back into ``results``.

    ``res`` is read in the layout produced by ``extract_keypoint``: 33 pose
    entries, then 21 left-hand and 21 right-hand entries.

    Side effects:
        * Pose and hand landmarks present in ``results`` get their ``x``/``y``
          overwritten with the normalized values.
        * If pose landmark 20 (or 19) ends up with ``y > 1.1`` and ``last`` is
          truthy, ``last.right_hand_landmarks`` (or ``left``) is cleared so the
          stale hand is no longer carried forward. In MediaPipe's pose topology
          19/20 are the left/right index fingers.
        * A hand missing from ``results`` is replaced by a deep copy of the
          same hand from ``last`` when available.

    Returns:
        The mutated ``results`` object.
    """
    c = 0
    if results.pose_landmarks:
        for p in results.pose_landmarks.landmark:
            p.x = res[c][0]
            p.y = res[c][1]
            if c == 20 and p.y > 1.1 and last:
                last.right_hand_landmarks = None
            elif c == 19 and p.y > 1.1 and last:
                last.left_hand_landmarks = None
            c += 1
    else:
        c += _POSE_LANDMARK_COUNT

    if results.left_hand_landmarks:
        for p in results.left_hand_landmarks.landmark:
            p.x = res[c][0]
            p.y = res[c][1]
            c += 1
    else:
        if last is not None and last.left_hand_landmarks:
            results.left_hand_landmarks = copy.deepcopy(last.left_hand_landmarks)
        c += _HAND_LANDMARK_COUNT

    if results.right_hand_landmarks:
        for p in results.right_hand_landmarks.landmark:
            p.x = res[c][0]
            p.y = res[c][1]
            c += 1
    else:
        if last is not None and last.right_hand_landmarks:
            results.right_hand_landmarks = copy.deepcopy(last.right_hand_landmarks)
        c += _HAND_LANDMARK_COUNT
    return results


def extract_keypoints_flatten(result, last, img=None):
    """Main per-frame entry point: extract, normalize and flatten keypoints.

    Also updates ``result`` in place (see ``update_mpresult``).

    Returns:
        A 1-D array of 258 values (33 * 4 pose + 2 * 21 * 3 hand).
    """
    res = extract_keypoint(result, last)
    res = normalize_keypoint(res, img)
    update_mpresult(res, result, last)
    return np.concatenate(res)


def mediapipe_process(frames):
    """Main function to call, process a batch of frames into numpy array for prediction"""
    sequence = []
    last = None
    for frame in frames:
        results = mediapipe_detection(frame)
        keypoints = extract_keypoints_flatten(results, last)
        # Keep this frame's (already updated) result as the next frame's fallback.
        last = copy.deepcopy(results)
        sequence.append(keypoints)
    return np.array(sequence)