"""Live webcam demo: MediaPipe landmarks fed to a TFLite classifier through Gradio."""

import cv2
import gradio as gr
import mediapipe as mp
import numpy as np
import tensorflow.lite as tflite

# Initialize MediaPipe solutions
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_face_mesh = mp.solutions.face_mesh

hands = mp_hands.Hands()
pose = mp_pose.Pose()
face_mesh = mp_face_mesh.FaceMesh()

# Load the TFLite model
interpreter = tflite.Interpreter(model_path="model.tflite")
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# NOTE(review): `LIPS_IDXS0` and `START_IDX` are used by `extract_landmarks`
# but are not defined anywhere in this module as shown. They must be provided
# (presumably an array of face-mesh lip indices and an integer offset) before
# any frame with detections is processed, otherwise a NameError is raised.
# Their values determine the model's input layout, so confirm them against the
# training pipeline rather than guessing.


def _to_xyz(landmarks):
    """Return a list of ``[x, y, z]`` triples for an iterable of landmark objects."""
    return [[lm.x, lm.y, lm.z] for lm in landmarks]


# Preprocess landmarks
def preprocess_landmarks(hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks):
    """Flatten the detected landmarks into one float32 array of x/y/z rows.

    Args:
        hand1_landmarks: Object exposing a ``.landmark`` sequence (first hand).
        hand2_landmarks: Object exposing a ``.landmark`` sequence (second hand).
        pose_landmarks: Object exposing a ``.landmark`` sequence (body pose).
        lip_landmarks: Plain iterable of landmark objects (already selected
            from the face mesh, so it has no ``.landmark`` attribute).

    Returns:
        ``np.ndarray`` of dtype float32 whose rows are ordered
        lips, hand 1, hand 2, pose.
    """
    combined_landmarks = (
        _to_xyz(lip_landmarks)
        + _to_xyz(hand1_landmarks.landmark)
        + _to_xyz(hand2_landmarks.landmark)
        + _to_xyz(pose_landmarks.landmark)
    )
    return np.array(combined_landmarks, dtype=np.float32)


# Function to extract landmarks from the webcam frame
def extract_landmarks(frame, *, is_bgr=True):
    """Run the hand, pose and face-mesh detectors on a single frame.

    Args:
        frame: Image array to analyse.
        is_bgr: When true (the default, matching the original behaviour) the
            frame is treated as OpenCV-style BGR and converted to RGB before
            detection. Pass ``False`` for frames that are already RGB.

    Returns:
        ``(hand1, hand2, pose, lips)`` when at least one hand, a pose and a
        face were all detected, otherwise ``None``. If only one hand is
        found it is reused as the second hand.
    """
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if is_bgr else frame

    # All three detectors are always run on every frame, as in the original,
    # so each one sees the full frame sequence.
    hand_results = hands.process(frame_rgb)
    pose_results = pose.process(frame_rgb)
    face_results = face_mesh.process(frame_rgb)

    if (
        not hand_results.multi_hand_landmarks
        or not pose_results.pose_landmarks
        or not face_results.multi_face_landmarks
    ):
        return None

    detected_hands = hand_results.multi_hand_landmarks
    hand1_landmarks = detected_hands[0]
    # Duplicate the single detected hand so the feature layout stays fixed.
    hand2_landmarks = detected_hands[1] if len(detected_hands) > 1 else hand1_landmarks

    pose_landmarks = pose_results.pose_landmarks
    face_landmarks = face_results.multi_face_landmarks[0]
    # See the NOTE(review) above: LIPS_IDXS0 / START_IDX must be defined.
    lip_landmarks = [face_landmarks.landmark[i] for i in LIPS_IDXS0 - START_IDX]

    return hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks


# Make prediction
def make_prediction(processed_landmarks):
    """Run the TFLite model on one landmark array and return the top class index.

    Args:
        processed_landmarks: Array produced by ``preprocess_landmarks``. A
            leading batch axis of size 1 is added before inference.

    Returns:
        The argmax index of the first row of the model's first output tensor.
    """
    inputs = np.expand_dims(np.asarray(processed_landmarks), axis=0)
    interpreter.set_tensor(input_details[0]['index'], inputs)
    interpreter.invoke()
    outputs = interpreter.get_tensor(output_details[0]['index'])
    return outputs[0].argmax()


# Gradio Interface Function
def predict_with_webcam(frame):
    """Gradio callback: predict a class label from one webcam frame.

    Args:
        frame: Image array supplied by the Gradio image input, or ``None``
            when no frame is available.

    Returns:
        The predicted class index as a string, or ``None`` when there is no
        frame or the required landmarks were not all detected.
    """
    if frame is None:
        return None

    # Gradio hands image inputs over as RGB arrays, so skip the BGR->RGB
    # swap that would otherwise invert the channels fed to MediaPipe.
    landmarks = extract_landmarks(frame, is_bgr=False)
    if landmarks is None:
        return None

    processed_landmarks = preprocess_landmarks(*landmarks)
    prediction = make_prediction(processed_landmarks)
    return str(prediction)


# Define the Gradio interface with the Webcam input and Text output
# NOTE(review): `gr.inputs.Image` and the `interpretation` argument look like
# legacy Gradio APIs; confirm they exist in the installed Gradio version.
webcam_interface = gr.Interface(
    fn=predict_with_webcam,
    inputs=gr.inputs.Image(shape=(480, 640), source="webcam"),
    outputs="text",
    live=True,
    interpretation="default",
    title="Webcam Landmark Prediction",
    description="Make predictions using landmarks extracted from your webcam stream.",
)

# Launch the Gradio app with the webcam interface
if __name__ == "__main__":
    webcam_interface.launch(share=True)