Spaces:
Runtime error
Runtime error
modified make prediction and added inv_index_to_class = {v: k for k, v in index_to_class.items()}
27ae852
import os | |
import cv2 | |
import gradio as gr | |
import mediapipe as mp | |
import numpy as np | |
import tensorflow as tf | |
import tensorflow.lite as tflite | |
# Sign labels listed in model-output order: a label's position in this tuple
# is its class index (0 for "TV" through 249 for "zipper").
_CLASS_NAMES = (
    "TV", "after", "airplane", "all", "alligator", "animal", "another", "any", "apple",
    "arm", "aunt", "awake", "backyard", "bad", "balloon", "bath", "because", "bed",
    "bedroom", "bee", "before", "beside", "better", "bird", "black", "blow", "blue",
    "boat", "book", "boy", "brother", "brown", "bug", "bye", "callonphone", "can",
    "car", "carrot", "cat", "cereal", "chair", "cheek", "child", "chin",
    "chocolate", "clean", "close", "closet", "cloud", "clown", "cow", "cowboy",
    "cry", "cut", "cute", "dad", "dance", "dirty", "dog", "doll", "donkey",
    "down", "drawer", "drink", "drop", "dry", "dryer", "duck", "ear", "elephant",
    "empty", "every", "eye", "face", "fall", "farm", "fast", "feet", "find",
    "fine", "finger", "finish", "fireman", "first", "fish", "flag", "flower",
    "food", "for", "frenchfries", "frog", "garbage", "gift", "giraffe", "girl",
    "give", "glasswindow", "go", "goose", "grandma", "grandpa", "grass", "green",
    "gum", "hair", "happy", "hat", "hate", "have", "haveto", "head",
    "hear", "helicopter", "hello", "hen", "hesheit", "hide", "high", "home",
    "horse", "hot", "hungry", "icecream", "if", "into", "jacket", "jeans",
    "jump", "kiss", "kitty", "lamp", "later", "like", "lion", "lips",
    "listen", "look", "loud", "mad", "make", "man", "many", "milk",
    "minemy", "mitten", "mom", "moon", "morning", "mouse", "mouth", "nap",
    "napkin", "night", "no", "noisy", "nose", "not", "now", "nuts", "old",
    "on", "open", "orange", "outside", "owie", "owl", "pajamas", "pen",
    "pencil", "penny", "person", "pig", "pizza", "please", "police", "pool",
    "potty", "pretend", "pretty", "puppy", "puzzle", "quiet", "radio", "rain",
    "read", "red", "refrigerator", "ride", "room", "sad", "same", "say",
    "scissors", "see", "shhh", "shirt", "shoe", "shower", "sick", "sleep",
    "sleepy", "smile", "snack", "snow", "stairs", "stay", "sticky", "store",
    "story", "stuck", "sun", "table", "talk", "taste", "thankyou", "that",
    "there", "think", "thirsty", "tiger", "time", "tomorrow", "tongue", "tooth",
    "toothbrush", "touch", "toy", "tree", "uncle", "underwear", "up", "vacuum",
    "wait", "wake", "water", "wet", "weus", "where", "white", "who", "why",
    "will", "wolf", "yellow", "yes", "yesterday", "yourself", "yucky", "zebra",
    "zipper",
)

# Despite its name, index_to_class maps label -> index.
index_to_class = {label: position for position, label in enumerate(_CLASS_NAMES)}

# The reverse table (index -> label) is the one used to decode predictions.
inv_index_to_class = dict(enumerate(_CLASS_NAMES))
# MediaPipe solution modules used for landmark extraction.
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_face_mesh = mp.solutions.face_mesh

# One processing object per solution, each constructed with no arguments.
hands, pose, face_mesh = mp_hands.Hands(), mp_pose.Pose(), mp_face_mesh.FaceMesh()

# The TFLite model file is looked up in the directory that contains this script.
model_filename = "model.tflite"
current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, model_filename)

# Create the TFLite interpreter for the model and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()
# Preprocess landmarks
def preprocess_landmarks(hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks):
    """Stack lip, hand and pose landmark coordinates into one float32 array.

    Args:
        hand1_landmarks: Object whose ``.landmark`` iterable yields points
            with ``x``, ``y`` and ``z`` attributes.
        hand2_landmarks: Same structure as ``hand1_landmarks``.
        pose_landmarks: Same structure as ``hand1_landmarks``.
        lip_landmarks: Plain iterable of points with ``x``, ``y`` and ``z``
            attributes (iterated directly, not through ``.landmark``).

    Returns:
        ``np.ndarray`` of dtype float32 with one ``[x, y, z]`` row per point,
        ordered lips, hand 1, hand 2, pose.
    """

    def _xyz_rows(points):
        # One [x, y, z] row per landmark point.
        return [[point.x, point.y, point.z] for point in points]

    hand1_rows = _xyz_rows(hand1_landmarks.landmark)
    hand2_rows = _xyz_rows(hand2_landmarks.landmark)
    pose_rows = _xyz_rows(pose_landmarks.landmark)
    lip_rows = _xyz_rows(lip_landmarks)

    # Lips come first in the combined array, followed by both hands and the pose.
    stacked_rows = lip_rows + hand1_rows + hand2_rows + pose_rows
    return np.array(stacked_rows, dtype=np.float32)
# Function to extract landmarks from the webcam frame
def extract_landmarks(frame):
    """Run the hand, pose and face-mesh processors on one frame.

    Args:
        frame: Image array passed to ``cv2.cvtColor``; may be ``None``.

    Returns:
        ``None`` when ``frame`` is ``None`` or when hands, pose and face are
        not all detected. Otherwise the tuple
        ``(hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks)``,
        where ``lip_landmarks`` is a list of selected face landmarks.
    """
    # The callback can be invoked without an image; treat that like
    # "nothing detected" instead of crashing inside cv2.cvtColor.
    if frame is None:
        return None

    # NOTE(review): BGR2RGB assumes the frame arrives in OpenCV channel order;
    # Gradio image inputs are presumably already RGB -- confirm.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    hand_results = hands.process(frame_rgb)
    pose_results = pose.process(frame_rgb)
    face_results = face_mesh.process(frame_rgb)

    detected_hands = hand_results.multi_hand_landmarks
    if not detected_hands or not pose_results.pose_landmarks or not face_results.multi_face_landmarks:
        return None

    hand1_landmarks = detected_hands[0]
    # With a single detected hand, the same landmarks fill both hand slots.
    hand2_landmarks = detected_hands[1] if len(detected_hands) > 1 else hand1_landmarks

    pose_landmarks = pose_results.pose_landmarks
    face_landmarks = face_results.multi_face_landmarks[0]

    # NOTE(review): LIPS_IDXS0 and START_IDX are not defined in the visible
    # part of this file; unless they are provided at module level this line
    # raises NameError -- confirm their intended values.
    lip_landmarks = [face_landmarks.landmark[i] for i in LIPS_IDXS0 - START_IDX]
    return hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks
# Make prediction
def make_prediction(processed_landmarks):
    """Run the TFLite model on preprocessed landmarks and return the label.

    Args:
        processed_landmarks: Array-like of landmark coordinates; converted to
            a float32 NumPy array before being handed to the interpreter.
            NOTE(review): no reshaping happens here, so the array presumably
            has to match the model's input tensor shape already -- confirm.

    Returns:
        The class label stored in ``inv_index_to_class`` for the index of the
        largest value in the model's first output tensor.
    """
    # The tensor descriptors are queried from the interpreter here because no
    # module-level ``input_details`` / ``output_details`` exist in this file.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    inputs = np.array(processed_landmarks, dtype=np.float32)

    # Feed the first input tensor, run inference, and read the first output tensor.
    interpreter.set_tensor(input_details[0]['index'], inputs)
    interpreter.invoke()
    output_data = interpreter.get_tensor(output_details[0]['index'])

    # argmax works on the flattened output; convert to a plain int for the lookup.
    index = int(np.argmax(output_data))

    # inv_index_to_class maps class index -> label.
    return inv_index_to_class[index]
# Gradio Interface Function
def predict_with_webcam(frame):
    """Predict a label for one webcam frame.

    Args:
        frame: Image passed straight to ``extract_landmarks``.

    Returns:
        The prediction converted with ``str``, or ``None`` when
        ``extract_landmarks`` returns ``None``.
    """
    extracted = extract_landmarks(frame)
    if extracted is None:
        # No landmarks to work with: produce no prediction for this frame.
        return None

    model_input = preprocess_landmarks(*extracted)
    label = make_prediction(model_input)
    return str(label)
# Gradio UI components: a webcam image as input and a "label" output
# (rather than a "textbox") for the predicted label.
_webcam_input = gr.inputs.Image(shape=(480, 640), source="webcam")
_label_output = gr.outputs.Label()

# Live interface that passes each webcam image to predict_with_webcam.
webcam_interface = gr.Interface(
    fn=predict_with_webcam,
    inputs=_webcam_input,
    outputs=_label_output,
    live=True,
    interpretation="default",
    title="Webcam Landmark Prediction",
    description="Make predictions using landmarks extracted from your webcam stream.",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    webcam_interface.launch()