Spaces:
Runtime error
Runtime error
By adding these print statements, we can check whether the landmarks are being extracted correctly and whether the number of landmarks is as expected for each category.
0dded4a
import os | |
import cv2 | |
import gradio as gr | |
import mediapipe as mp | |
import numpy as np | |
import tensorflow as tf | |
import tensorflow.lite as tflite | |
# Sign vocabulary, listed so that a label's position in the tuple is its
# integer class id (0..249).
_SIGN_NAMES = (
    "TV", "after", "airplane", "all", "alligator", "animal", "another", "any", "apple",
    "arm", "aunt", "awake", "backyard", "bad", "balloon", "bath", "because", "bed",
    "bedroom", "bee", "before", "beside", "better", "bird", "black", "blow", "blue",
    "boat", "book", "boy", "brother", "brown", "bug", "bye", "callonphone", "can",
    "car", "carrot", "cat", "cereal", "chair", "cheek", "child", "chin",
    "chocolate", "clean", "close", "closet", "cloud", "clown", "cow", "cowboy",
    "cry", "cut", "cute", "dad", "dance", "dirty", "dog", "doll", "donkey",
    "down", "drawer", "drink", "drop", "dry", "dryer", "duck", "ear", "elephant",
    "empty", "every", "eye", "face", "fall", "farm", "fast", "feet", "find",
    "fine", "finger", "finish", "fireman", "first", "fish", "flag", "flower",
    "food", "for", "frenchfries", "frog", "garbage", "gift", "giraffe", "girl",
    "give", "glasswindow", "go", "goose", "grandma", "grandpa", "grass", "green",
    "gum", "hair", "happy", "hat", "hate", "have", "haveto", "head",
    "hear", "helicopter", "hello", "hen", "hesheit", "hide", "high", "home",
    "horse", "hot", "hungry", "icecream", "if", "into", "jacket", "jeans",
    "jump", "kiss", "kitty", "lamp", "later", "like", "lion", "lips",
    "listen", "look", "loud", "mad", "make", "man", "many", "milk",
    "minemy", "mitten", "mom", "moon", "morning", "mouse", "mouth", "nap",
    "napkin", "night", "no", "noisy", "nose", "not", "now", "nuts", "old",
    "on", "open", "orange", "outside", "owie", "owl", "pajamas", "pen",
    "pencil", "penny", "person", "pig", "pizza", "please", "police", "pool",
    "potty", "pretend", "pretty", "puppy", "puzzle", "quiet", "radio", "rain",
    "read", "red", "refrigerator", "ride", "room", "sad", "same", "say",
    "scissors", "see", "shhh", "shirt", "shoe", "shower", "sick", "sleep",
    "sleepy", "smile", "snack", "snow", "stairs", "stay", "sticky", "store",
    "story", "stuck", "sun", "table", "talk", "taste", "thankyou", "that",
    "there", "think", "thirsty", "tiger", "time", "tomorrow", "tongue", "tooth",
    "toothbrush", "touch", "toy", "tree", "uncle", "underwear", "up", "vacuum",
    "wait", "wake", "water", "wet", "weus", "where", "white", "who", "why",
    "will", "wolf", "yellow", "yes", "yesterday", "yourself", "yucky", "zebra",
    "zipper",
)

# Despite its name, index_to_class maps sign label -> integer class id.
index_to_class = {name: class_id for class_id, name in enumerate(_SIGN_NAMES)}

# Reverse lookup (integer class id -> sign label); make_prediction uses it to
# turn the arg-max of the model output into a label.
inv_index_to_class = dict(enumerate(_SIGN_NAMES))
# MediaPipe setup: for each solution keep both the module alias and one
# detector instance built with the library defaults. The detectors are created
# once at import time and reused for every frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh()
# The TFLite model is expected to sit next to this script, so its path is
# resolved from the script's own location rather than the working directory.
model_filename = "model.tflite"
current_dir = os.path.split(os.path.abspath(__file__))[0]
model_path = os.path.join(current_dir, model_filename)

# Create the interpreter for that model file and allocate its tensors so it is
# ready for set_tensor / invoke calls.
interpreter = tf.lite.Interpreter(model_path)
interpreter.allocate_tensors()
# Preprocess landmarks
def preprocess_landmarks(hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks):
    """Flatten the detected landmarks into a single float32 coordinate array.

    Args:
        hand1_landmarks: Object whose ``.landmark`` iterable holds the first
            hand's points.
        hand2_landmarks: Object whose ``.landmark`` iterable holds the second
            hand's points.
        pose_landmarks: Object whose ``.landmark`` iterable holds the pose
            points.
        lip_landmarks: Plain iterable of lip points (no ``.landmark`` wrapper).

    Returns:
        ``np.ndarray`` of dtype float32 with one ``[x, y, z]`` row per
        landmark, ordered lips, hand 1, hand 2, pose.
    """
    # Order matters: the rows are concatenated lips -> hand1 -> hand2 -> pose.
    ordered_groups = (
        lip_landmarks,
        hand1_landmarks.landmark,
        hand2_landmarks.landmark,
        pose_landmarks.landmark,
    )
    rows = [[point.x, point.y, point.z] for group in ordered_groups for point in group]
    return np.array(rows, dtype=np.float32)
# Face-mesh vertex ids that make up the lip contours, taken from MediaPipe's
# own FACEMESH_LIPS connection set. The previous code indexed the face mesh
# with `LIPS_IDXS0 - START_IDX`, but neither name is defined in this file, so
# every frame with detections raised NameError.
# NOTE(review): the ids are used in ascending order; confirm this matches the
# lip-landmark order the model was trained with.
_LIP_LANDMARK_IDXS = sorted({idx for edge in mp_face_mesh.FACEMESH_LIPS for idx in edge})


# Function to extract landmarks from the webcam frame
def extract_landmarks(frame):
    """Run the hand, pose and face-mesh detectors on one frame.

    Args:
        frame: Image array; it is passed through a BGR -> RGB conversion
            before being handed to MediaPipe.

    Returns:
        ``None`` if no hand, no pose or no face was detected. Otherwise a
        tuple ``(hand1_landmarks, hand2_landmarks, pose_landmarks,
        lip_landmarks)`` where the first three are MediaPipe landmark lists
        and ``lip_landmarks`` is a plain list of face-mesh landmarks. When
        only one hand is detected it is returned for both hand slots.
    """
    # NOTE(review): Gradio image inputs are presumably RGB already; if so this
    # conversion swaps the channels the wrong way round - confirm.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    hand_results = hands.process(frame_rgb)
    pose_results = pose.process(frame_rgb)
    face_results = face_mesh.process(frame_rgb)

    # All three detectors must report something, otherwise there is nothing
    # usable to feed the classifier.
    if not (
        hand_results.multi_hand_landmarks
        and pose_results.pose_landmarks
        and face_results.multi_face_landmarks
    ):
        return None

    detected_hands = hand_results.multi_hand_landmarks
    hand1_landmarks = detected_hands[0]
    # Fall back to duplicating the single detected hand so the output always
    # carries two hand entries.
    hand2_landmarks = detected_hands[1] if len(detected_hands) > 1 else hand1_landmarks

    pose_landmarks = pose_results.pose_landmarks
    face_landmarks = face_results.multi_face_landmarks[0]
    lip_landmarks = [face_landmarks.landmark[i] for i in _LIP_LANDMARK_IDXS]

    # Debug output: landmark counts per category.
    print(f"Number of landmarks for hand1: {len(hand1_landmarks.landmark)}")
    print(f"Number of landmarks for hand2: {len(hand2_landmarks.landmark)}")
    print(f"Number of landmarks for pose: {len(pose_landmarks.landmark)}")
    print(f"Number of landmarks for lip: {len(lip_landmarks)}")

    return hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks
# Make prediction
def make_prediction(processed_landmarks):
    """Classify one set of preprocessed landmarks with the TFLite model.

    Args:
        processed_landmarks: Array-like of landmark coordinates; it is
            converted to a float32 NumPy array before being fed to the model.

    Returns:
        The class label (a key of ``index_to_class``) whose id equals the
        arg-max of the model's first output tensor.

    Raises:
        KeyError: If the arg-max index has no entry in ``inv_index_to_class``.
    """
    inputs = np.asarray(processed_landmarks, dtype=np.float32)

    # The tensor indices are read from the interpreter here; the previous code
    # used module-level `input_details` / `output_details`, which are never
    # defined in this file and therefore raised NameError.
    input_index = interpreter.get_input_details()[0]["index"]
    output_index = interpreter.get_output_details()[0]["index"]

    # Feed the input, run inference, then read back the first output tensor.
    interpreter.set_tensor(input_index, inputs)
    interpreter.invoke()
    output_data = interpreter.get_tensor(output_index)

    # argmax works on the flattened output; cast to a plain int for the lookup.
    predicted_id = int(np.argmax(output_data))
    return inv_index_to_class[predicted_id]
# Gradio Interface Function
def predict_with_webcam(frame):
    """Gradio callback: predict the sign shown in a webcam frame.

    Args:
        frame: Image delivered by the Gradio image input, or ``None`` when
            the callback fires without an image.

    Returns:
        The predicted label as a string, or ``None`` when there is no frame
        or when ``extract_landmarks`` reports that a hand, the pose or the
        face could not be detected.
    """
    # Guard against an empty input: without it the colour conversion inside
    # extract_landmarks would fail on a missing image.
    if frame is None:
        return None

    landmarks = extract_landmarks(frame)
    if landmarks is None:
        return None

    processed_landmarks = preprocess_landmarks(*landmarks)
    return str(make_prediction(processed_landmarks))
# Gradio UI: a webcam image goes in, the predicted sign comes out as a label.
# NOTE(review): Gradio documents `shape` as (width, height); (480, 640) would
# then be a portrait crop - confirm this is intended.
_webcam_input = gr.inputs.Image(shape=(480, 640), source="webcam")

# A "Label" output is used here rather than a "Textbox".
_label_output = gr.outputs.Label()

webcam_interface = gr.Interface(
    fn=predict_with_webcam,
    inputs=_webcam_input,
    outputs=_label_output,
    live=True,
    interpretation="default",
    title="Webcam Landmark Prediction",
    description="Make predictions using landmarks extracted from your webcam stream.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    webcam_interface.launch()