import cv2
import gradio as gr
import mediapipe as mp
import numpy as np
import tensorflow.lite as tflite

# Initialize MediaPipe solutions
# Short aliases for the three MediaPipe solution modules used in this file.
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_face_mesh = mp.solutions.face_mesh

# One detector of each kind, built with library defaults (no arguments are
# passed). They are created once at import time and reused by
# extract_landmarks() for every frame.
hands = mp_hands.Hands()
pose = mp_pose.Pose()
face_mesh = mp_face_mesh.FaceMesh()

# Load the TFLite model
# "model.tflite" is a relative path, so it is resolved against the current
# working directory of the process.
interpreter = tflite.Interpreter(model_path="model.tflite")
# Tensors are allocated before the input/output details are queried and
# before make_prediction() calls set_tensor()/invoke().
interpreter.allocate_tensors()
# Tensor metadata; make_prediction() reads the 'index' entry of the first
# input and the first output.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Preprocess landmarks
def preprocess_landmarks(hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks):
    """Flatten four landmark groups into a single float32 coordinate array.

    Args:
        hand1_landmarks: Object whose ``landmark`` attribute iterates over
            points exposing ``x``, ``y`` and ``z``.
        hand2_landmarks: Same structure as ``hand1_landmarks``.
        pose_landmarks: Same structure as ``hand1_landmarks``.
        lip_landmarks: Iterable of points exposing ``x``, ``y`` and ``z``;
            unlike the other arguments it is iterated directly.

    Returns:
        A ``float32`` ``np.ndarray`` with one ``[x, y, z]`` row per point,
        in the order lips, hand 1, hand 2, pose.
    """
    def _xyz_rows(points):
        # One [x, y, z] list per landmark point.
        return [[pt.x, pt.y, pt.z] for pt in points]

    first_hand_rows = _xyz_rows(hand1_landmarks.landmark)
    second_hand_rows = _xyz_rows(hand2_landmarks.landmark)
    pose_rows = _xyz_rows(pose_landmarks.landmark)
    lip_rows = _xyz_rows(lip_landmarks)

    # Lips go first in the output, followed by both hands and the pose.
    return np.array(
        [*lip_rows, *first_hand_rows, *second_hand_rows, *pose_rows],
        dtype=np.float32,
    )

# Function to extract landmarks from the webcam frame
def extract_landmarks(frame):
    """Run the hand, pose and face-mesh detectors on one frame.

    The frame is passed through ``cv2.COLOR_BGR2RGB`` before detection, so
    it is expected in OpenCV's BGR channel order.

    Args:
        frame: Image array accepted by ``cv2.cvtColor``.

    Returns:
        ``None`` unless hands, pose and face were all detected. Otherwise a
        tuple ``(hand1, hand2, pose, lips)`` where ``hand2`` is the same
        object as ``hand1`` when only one hand was found, and ``lips`` is a
        list of landmarks picked from the first detected face.
    """
    rgb_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # All three detectors run on every frame, in this order.
    hand_output = hands.process(rgb_image)
    pose_output = pose.process(rgb_image)
    face_output = face_mesh.process(rgb_image)

    everything_found = (
        hand_output.multi_hand_landmarks
        and pose_output.pose_landmarks
        and face_output.multi_face_landmarks
    )
    if not everything_found:
        return None

    detected_hands = hand_output.multi_hand_landmarks
    primary_hand = detected_hands[0]
    # With a single detected hand, the same landmarks are reused for hand 2.
    secondary_hand = detected_hands[1] if len(detected_hands) > 1 else primary_hand

    body_pose = pose_output.pose_landmarks
    first_face = face_output.multi_face_landmarks[0]
    # NOTE(review): LIPS_IDXS0 and START_IDX are not defined in the visible
    # part of this file; they must be provided at module level (presumably an
    # array of face-mesh indices and an integer offset) -- confirm.
    lip_points = [first_face.landmark[idx] for idx in LIPS_IDXS0 - START_IDX]

    return primary_hand, secondary_hand, body_pose, lip_points

# Make prediction
def make_prediction(processed_landmarks):
    """Run the TFLite interpreter on one sample and return the top index.

    Args:
        processed_landmarks: Array-like for a single sample. It is wrapped
            in a leading batch axis of size 1 before being fed to the
            model's first input tensor.

    Returns:
        The ``argmax`` over the first row of the model's first output
        tensor.
    """
    # The model is fed a batch, so wrap the single sample in one.
    batch = np.array([processed_landmarks])

    input_slot = input_details[0]['index']
    interpreter.set_tensor(input_slot, batch)
    interpreter.invoke()

    output_slot = output_details[0]['index']
    scores = interpreter.get_tensor(output_slot)[0]
    return scores.argmax()

# Gradio Interface Function
def predict_with_webcam(frame):
    """Gradio callback: predict a class index from one webcam frame.

    Args:
        frame: Image array delivered by the Gradio image input (RGB channel
            order), or ``None`` when no frame is available.

    Returns:
        The predicted class index as a string, or ``None`` when there is no
        frame or when hands, pose and face were not all detected.
    """
    # In live mode Gradio may invoke the callback without an image (for
    # example before the stream starts); cv2 would raise on None.
    if frame is None:
        return None

    # Gradio supplies RGB arrays, whereas extract_landmarks() applies
    # COLOR_BGR2RGB. Convert to BGR first so the detectors end up seeing
    # RGB rather than channel-swapped data.
    frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    landmarks = extract_landmarks(frame_bgr)
    if landmarks is None:
        return None

    processed_landmarks = preprocess_landmarks(*landmarks)
    prediction = make_prediction(processed_landmarks)
    return str(prediction)

# Define the Gradio interface with the Webcam input and Text output
# The interface is built at import time; it is only launched when this file
# is run as a script (see the guard below).
webcam_interface = gr.Interface(
    # Called with each captured frame; returns the prediction text or None.
    fn=predict_with_webcam,
    # NOTE(review): `gr.inputs.Image` is the legacy input namespace and is
    # not available in newer Gradio releases -- confirm the pinned version.
    # NOTE(review): Gradio documents `shape` as (width, height), so
    # (480, 640) would request a portrait image -- confirm this is intended.
    inputs=gr.inputs.Image(shape=(480, 640), source="webcam"),
    outputs="text",
    # live=True re-runs the function whenever the input changes, without a
    # submit click.
    live=True,
    interpretation="default",
    title="Webcam Landmark Prediction",
    description="Make predictions using landmarks extracted from your webcam stream.",
)

# Launch the Gradio app with the webcam interface
# NOTE(review): share=True asks Gradio to create a publicly reachable link in
# addition to the local server -- confirm this exposure is wanted.
if __name__ == "__main__":
    webcam_interface.launch(share=True)