Spaces:

JefferyJapheth
/

topStep

Runtime error

topStep / app.py

By adding these print statements, we can check whether the landmarks are being extracted correctly and whether the number of landmarks in each category is as expected.

0dded4a 11 months ago

raw

history blame

No virus

7.78 kB

	import os

	import cv2
	import gradio as gr
	import mediapipe as mp
	import numpy as np
	import tensorflow as tf
	import tensorflow.lite as tflite

	# Sign vocabulary of the classifier. Despite its name, `index_to_class` maps
	# each sign label to its integer class index (0..249, in the order listed
	# below); `inv_index_to_class` is the reverse lookup from index to label.
	index_to_class = {
	    sign: class_id
	    for class_id, sign in enumerate((
	        "TV", "after", "airplane", "all", "alligator", "animal", "another", "any", "apple",
	        "arm", "aunt", "awake", "backyard", "bad", "balloon", "bath", "because", "bed",
	        "bedroom", "bee", "before", "beside", "better", "bird", "black", "blow", "blue",
	        "boat", "book", "boy", "brother", "brown", "bug", "bye", "callonphone", "can",
	        "car", "carrot", "cat", "cereal", "chair", "cheek", "child", "chin",
	        "chocolate", "clean", "close", "closet", "cloud", "clown", "cow", "cowboy",
	        "cry", "cut", "cute", "dad", "dance", "dirty", "dog", "doll", "donkey",
	        "down", "drawer", "drink", "drop", "dry", "dryer", "duck", "ear", "elephant",
	        "empty", "every", "eye", "face", "fall", "farm", "fast", "feet", "find",
	        "fine", "finger", "finish", "fireman", "first", "fish", "flag", "flower",
	        "food", "for", "frenchfries", "frog", "garbage", "gift", "giraffe", "girl",
	        "give", "glasswindow", "go", "goose", "grandma", "grandpa", "grass", "green",
	        "gum", "hair", "happy", "hat", "hate", "have", "haveto", "head",
	        "hear", "helicopter", "hello", "hen", "hesheit", "hide", "high", "home",
	        "horse", "hot", "hungry", "icecream", "if", "into", "jacket", "jeans",
	        "jump", "kiss", "kitty", "lamp", "later", "like", "lion", "lips",
	        "listen", "look", "loud", "mad", "make", "man", "many", "milk",
	        "minemy", "mitten", "mom", "moon", "morning", "mouse", "mouth", "nap",
	        "napkin", "night", "no", "noisy", "nose", "not", "now", "nuts", "old",
	        "on", "open", "orange", "outside", "owie", "owl", "pajamas", "pen",
	        "pencil", "penny", "person", "pig", "pizza", "please", "police", "pool",
	        "potty", "pretend", "pretty", "puppy", "puzzle", "quiet", "radio", "rain",
	        "read", "red", "refrigerator", "ride", "room", "sad", "same", "say",
	        "scissors", "see", "shhh", "shirt", "shoe", "shower", "sick", "sleep",
	        "sleepy", "smile", "snack", "snow", "stairs", "stay", "sticky", "store",
	        "story", "stuck", "sun", "table", "talk", "taste", "thankyou", "that",
	        "there", "think", "thirsty", "tiger", "time", "tomorrow", "tongue", "tooth",
	        "toothbrush", "touch", "toy", "tree", "uncle", "underwear", "up", "vacuum",
	        "wait", "wake", "water", "wet", "weus", "where", "white", "who", "why",
	        "will", "wolf", "yellow", "yes", "yesterday", "yourself", "yucky", "zebra",
	        "zipper",
	    ))
	}

	# Reverse lookup: integer class index -> sign label.
	inv_index_to_class = dict(zip(index_to_class.values(), index_to_class))

	# MediaPipe solution modules used for hand, body-pose and face landmarks.
	mp_hands, mp_pose, mp_face_mesh = mp.solutions.hands, mp.solutions.pose, mp.solutions.face_mesh

	# One detector per solution, each created with the library's default options.
	hands = mp_hands.Hands()
	pose = mp_pose.Pose()
	face_mesh = mp_face_mesh.FaceMesh()

	# The TFLite model file is expected to sit in the same directory as this
	# script, so its path is resolved relative to the script's own location
	# rather than the current working directory.
	model_filename = "model.tflite"
	current_dir = os.path.split(os.path.abspath(__file__))[0]
	model_path = os.path.join(current_dir, model_filename)

	# Load the model once at import time and allocate its tensors so the
	# interpreter is ready for inference.
	interpreter = tf.lite.Interpreter(model_path=model_path)
	interpreter.allocate_tensors()


	# Flatten the detected landmark groups into one coordinate array
	def preprocess_landmarks(hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks):
	    """Stack all landmark coordinates into a single float32 array.

	    Args:
	        hand1_landmarks: Object whose ``.landmark`` is iterable and yields
	            items with ``x``, ``y`` and ``z`` attributes.
	        hand2_landmarks: Same structure as ``hand1_landmarks``.
	        pose_landmarks: Same structure as ``hand1_landmarks``.
	        lip_landmarks: Plain iterable of items with ``x``, ``y`` and ``z``
	            attributes (no ``.landmark`` wrapper).

	    Returns:
	        ``np.ndarray`` of dtype float32 with one ``[x, y, z]`` row per
	        landmark, ordered lips, hand 1, hand 2, pose.
	    """
	    # Row order matters: lips first, then both hands, then pose.
	    ordered_groups = (
	        lip_landmarks,
	        hand1_landmarks.landmark,
	        hand2_landmarks.landmark,
	        pose_landmarks.landmark,
	    )
	    rows = [[point.x, point.y, point.z] for group in ordered_groups for point in group]
	    return np.array(rows, dtype=np.float32)


	# Face-mesh landmark indices that outline the lips (outer and inner contour).
	# They index directly into a FaceMesh result's `landmark` list, so no start
	# offset is applied.
	# NOTE(review): the ordering follows the usual upper-outer, lower-outer,
	# upper-inner, lower-inner grouping; confirm it matches the order the
	# TFLite model was trained with.
	_LIP_LANDMARK_INDICES = (
	    61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
	    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
	    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
	    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
	)


	# Function to extract landmarks from the webcam frame
	def extract_landmarks(frame):
	    """Run the hand, pose and face-mesh detectors on one frame.

	    Args:
	        frame: Image array accepted by ``cv2.cvtColor``. It is converted with
	            ``cv2.COLOR_BGR2RGB`` before detection.

	    Returns:
	        ``None`` if any of the three detectors found nothing. Otherwise a
	        tuple ``(hand1_landmarks, hand2_landmarks, pose_landmarks,
	        lip_landmarks)`` where the first three are the detectors' landmark
	        containers and ``lip_landmarks`` is a list of the face landmarks
	        selected by ``_LIP_LANDMARK_INDICES``. When only one hand is
	        detected, it is returned for both hand slots.
	    """
	    # NOTE(review): this assumes `frame` is BGR (OpenCV order). If the caller
	    # passes an RGB image (Gradio image inputs presumably do), this swap
	    # feeds the detectors channel-reversed data - confirm and drop if so.
	    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
	    results = hands.process(frame_rgb)
	    pose_results = pose.process(frame_rgb)
	    face_results = face_mesh.process(frame_rgb)

	    # All three landmark groups are required to build the model input.
	    if not results.multi_hand_landmarks or not pose_results.pose_landmarks or not face_results.multi_face_landmarks:
	        return None

	    hand1_landmarks = results.multi_hand_landmarks[0]
	    if len(results.multi_hand_landmarks) > 1:
	        hand2_landmarks = results.multi_hand_landmarks[1]
	    else:
	        # Only one hand visible: reuse it so the output layout stays fixed.
	        hand2_landmarks = hand1_landmarks

	    pose_landmarks = pose_results.pose_landmarks
	    face_landmarks = face_results.multi_face_landmarks[0]
	    lip_landmarks = [face_landmarks.landmark[i] for i in _LIP_LANDMARK_INDICES]

	    # Debug output: landmark counts per category.
	    print(f"Number of landmarks for hand1: {len(hand1_landmarks.landmark)}")
	    print(f"Number of landmarks for hand2: {len(hand2_landmarks.landmark)}")
	    print(f"Number of landmarks for pose: {len(pose_landmarks.landmark)}")
	    print(f"Number of landmarks for lip: {len(lip_landmarks)}")

	    return hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks


	# Make prediction
	def make_prediction(processed_landmarks):
	    """Run the TFLite model on a landmark array and return the predicted label.

	    Args:
	        processed_landmarks: Array-like of landmark coordinates; it is
	            converted to a float32 ``np.ndarray`` before being fed to the
	            model.

	    Returns:
	        The sign label (a key of ``index_to_class``) whose class index has
	        the highest score in the model's first output tensor.

	    Raises:
	        KeyError: If the arg-max index has no entry in ``inv_index_to_class``.
	    """
	    inputs = np.array(processed_landmarks, dtype=np.float32)

	    # Look the tensor descriptors up on the interpreter itself so this
	    # function does not depend on separately maintained module globals.
	    input_details = interpreter.get_input_details()
	    output_details = interpreter.get_output_details()

	    # NOTE(review): `set_tensor` rejects arrays whose shape differs from the
	    # model's input tensor - confirm the model accepts the array produced by
	    # `preprocess_landmarks` as-is (no batch/frame dimension is added here).
	    interpreter.set_tensor(input_details[0]['index'], inputs)

	    # Run inference and read back the first output tensor.
	    interpreter.invoke()
	    output_data = interpreter.get_tensor(output_details[0]['index'])

	    # np.argmax works on the flattened output; cast to a plain int so the
	    # dictionary lookup uses a native key.
	    index = int(np.argmax(output_data))

	    # Translate the class index back into its sign label.
	    return inv_index_to_class[index]


	# Gradio Interface Function
	def predict_with_webcam(frame):
	    """Predict the sign shown in a single webcam frame.

	    Args:
	        frame: Image array for one webcam frame, or ``None`` when no image
	            is available.

	    Returns:
	        The predicted sign label as a string, or ``None`` when there is no
	        frame or the required landmarks could not be detected in it.
	    """
	    # A live interface can be invoked without an image; bail out before the
	    # frame reaches OpenCV, which cannot process None.
	    if frame is None:
	        return None

	    landmarks = extract_landmarks(frame)
	    if landmarks is None:
	        # Hands, pose or face were not all found in this frame.
	        return None

	    processed_landmarks = preprocess_landmarks(*landmarks)
	    return str(make_prediction(processed_landmarks))


	# Live Gradio demo: each webcam image is passed to `predict_with_webcam`
	# and the returned sign is shown in a Label component.
	webcam_interface = gr.Interface(
	    title="Webcam Landmark Prediction",
	    description="Make predictions using landmarks extracted from your webcam stream.",
	    fn=predict_with_webcam,
	    # Webcam-sourced image input, requested with shape (480, 640).
	    inputs=gr.inputs.Image(shape=(480, 640), source="webcam"),
	    # Label output for the predicted sign.
	    outputs=gr.outputs.Label(),
	    # Re-run the prediction automatically as new frames arrive.
	    live=True,
	    interpretation="default",
	)


	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    webcam_interface.launch()