import os

import cv2
import numpy as np
import gradio as gr
import mediapipe as mp
import tensorflow as tf
N_ROWS = 543
N_DIMS = 3
DIM_NAMES = ['x', 'y', 'z']
SEED = 42
NUM_CLASSES = 250
INPUT_SIZE = 32
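# NOTE: the preprocessing below relies on landmark index constants that are not
# defined in this file. The sketch here assumes the standard MediaPipe Holistic
# row layout (face 0-467, left hand 468-488, pose 489-521, right hand 522-542).
# LANDMARK_IDXS0 and N_COLS are placeholders and must match the landmark subset
# the TFLite model was trained with.
LEFT_HAND_IDXS0 = np.arange(468, 489)
RIGHT_HAND_IDXS0 = np.arange(522, 543)
HAND_IDXS0 = np.concatenate((LEFT_HAND_IDXS0, RIGHT_HAND_IDXS0), axis=0)
LANDMARK_IDXS0 = np.arange(0, N_ROWS)  # placeholder: keep all landmarks
N_COLS = LANDMARK_IDXS0.size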
# TensorFlow layer to preprocess data for the TFLite model.
# The preprocessing is written in TF ops so it can be exported as part of the
# TFLite graph; plain Python/NumPy is not available inside the model.
class PreprocessLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(PreprocessLayer, self).__init__()

    def pad_edge(self, t, repeats, side):
        if side == 'LEFT':
            return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
        elif side == 'RIGHT':
            return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None, N_ROWS, N_DIMS], dtype=tf.float32),),
    )
    def call(self, data0):
        # Number of frames in the video
        N_FRAMES0 = tf.shape(data0)[0]

        # Filter out frames with empty hand data
        frames_hands_nansum = tf.experimental.numpy.nanmean(tf.gather(data0, HAND_IDXS0, axis=1), axis=[1, 2])
        non_empty_frames_idxs = tf.where(frames_hands_nansum > 0)
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
        data = tf.gather(data0, non_empty_frames_idxs, axis=0)

        # Cast indices to float32 to be compatible with TensorFlow Lite
        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)

        # Number of frames in the filtered video
        N_FRAMES = tf.shape(data)[0]

        # Gather relevant landmark columns
        data = tf.gather(data, LANDMARK_IDXS0, axis=1)

        # Video fits in INPUT_SIZE
        if N_FRAMES < INPUT_SIZE:
            # Pad indices with -1 to indicate padding
            non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE - N_FRAMES]], constant_values=-1)
            # Pad data with zeros
            data = tf.pad(data, [[0, INPUT_SIZE - N_FRAMES], [0, 0], [0, 0]], constant_values=0)
            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs
        # Video needs to be downsampled to INPUT_SIZE
        else:
            # Repeat
            if N_FRAMES < INPUT_SIZE ** 2:
                repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
                data = tf.repeat(data, repeats=repeats, axis=0)
                non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)

            # Pad to a multiple of INPUT_SIZE
            pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
            if tf.math.mod(len(data), INPUT_SIZE) > 0:
                pool_size += 1

            if pool_size == 1:
                pad_size = (pool_size * INPUT_SIZE) - len(data)
            else:
                pad_size = (pool_size * INPUT_SIZE) % len(data)

            # Pad start/end with the start/end value
            pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            if tf.math.mod(pad_size, 2) > 0:
                pad_right += 1

            # Pad by concatenating left/right edge values
            data = self.pad_edge(data, pad_left, 'LEFT')
            data = self.pad_edge(data, pad_right, 'RIGHT')

            # Pad non-empty frame indices
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')

            # Reshape to mean pool
            data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
            non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])

            # Mean pool
            data = tf.experimental.numpy.nanmean(data, axis=1)
            non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)

            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs
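# A minimal usage sketch of the layer, assuming the constants above: a clip of,
# say, 100 frames of 543 landmarks is reduced to a fixed-length clip of
# INPUT_SIZE frames restricted to the N_COLS selected landmarks:
#   dummy = tf.random.uniform((100, N_ROWS, N_DIMS))
#   frames, frame_idxs = PreprocessLayer()(dummy)
#   # frames.shape -> (INPUT_SIZE, N_COLS, N_DIMS), frame_idxs.shape -> (INPUT_SIZE,)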
# Get the absolute path to the directory containing app.py
current_dir = os.path.dirname(os.path.abspath(__file__))
# Define the filename of the TFLite model
model_filename = "model.tflite"
# Construct the full path to the TFLite model file
model_path = os.path.join(current_dir, model_filename)
# Load the TFLite model using the interpreter
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()
# Get input and output details of the TFLite model
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
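# Optional sanity check: the tensor shape the model expects for set_tensor()
# below can be read straight from the interpreter.
print("Model expects input shape:", input_details[0]['shape'])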
index_to_class = {
"TV": 0, "after": 1, "airplane": 2, "all": 3, "alligator": 4, "animal": 5, "another": 6, "any": 7, "apple": 8,
"arm": 9, "aunt": 10, "awake": 11, "backyard": 12, "bad": 13, "balloon": 14, "bath": 15, "because": 16, "bed": 17,
"bedroom": 18, "bee": 19, "before": 20, "beside": 21, "better": 22, "bird": 23, "black": 24, "blow": 25, "blue": 26,
"boat": 27, "book": 28, "boy": 29, "brother": 30, "brown": 31, "bug": 32, "bye": 33, "callonphone": 34, "can": 35,
"car": 36, "carrot": 37, "cat": 38, "cereal": 39, "chair": 40, "cheek": 41, "child": 42, "chin": 43,
"chocolate": 44, "clean": 45, "close": 46, "closet": 47, "cloud": 48, "clown": 49, "cow": 50, "cowboy": 51,
"cry": 52, "cut": 53, "cute": 54, "dad": 55, "dance": 56, "dirty": 57, "dog": 58, "doll": 59, "donkey": 60,
"down": 61, "drawer": 62, "drink": 63, "drop": 64, "dry": 65, "dryer": 66, "duck": 67, "ear": 68, "elephant": 69,
"empty": 70, "every": 71, "eye": 72, "face": 73, "fall": 74, "farm": 75, "fast": 76, "feet": 77, "find": 78,
"fine": 79, "finger": 80, "finish": 81, "fireman": 82, "first": 83, "fish": 84, "flag": 85, "flower": 86,
"food": 87, "for": 88, "frenchfries": 89, "frog": 90, "garbage": 91, "gift": 92, "giraffe": 93, "girl": 94,
"give": 95, "glasswindow": 96, "go": 97, "goose": 98, "grandma": 99, "grandpa": 100, "grass": 101, "green": 102,
"gum": 103, "hair": 104, "happy": 105, "hat": 106, "hate": 107, "have": 108, "haveto": 109, "head": 110,
"hear": 111, "helicopter": 112, "hello": 113, "hen": 114, "hesheit": 115, "hide": 116, "high": 117, "home": 118,
"horse": 119, "hot": 120, "hungry": 121, "icecream": 122, "if": 123, "into": 124, "jacket": 125, "jeans": 126,
"jump": 127, "kiss": 128, "kitty": 129, "lamp": 130, "later": 131, "like": 132, "lion": 133, "lips": 134,
"listen": 135, "look": 136, "loud": 137, "mad": 138, "make": 139, "man": 140, "many": 141, "milk": 142,
"minemy": 143, "mitten": 144, "mom": 145, "moon": 146, "morning": 147, "mouse": 148, "mouth": 149, "nap": 150,
"napkin": 151, "night": 152, "no": 153, "noisy": 154, "nose": 155, "not": 156, "now": 157, "nuts": 158, "old": 159,
"on": 160, "open": 161, "orange": 162, "outside": 163, "owie": 164, "owl": 165, "pajamas": 166, "pen": 167,
"pencil": 168, "penny": 169, "person": 170, "pig": 171, "pizza": 172, "please": 173, "police": 174, "pool": 175,
"potty": 176, "pretend": 177, "pretty": 178, "puppy": 179, "puzzle": 180, "quiet": 181, "radio": 182, "rain": 183,
"read": 184, "red": 185, "refrigerator": 186, "ride": 187, "room": 188, "sad": 189, "same": 190, "say": 191,
"scissors": 192, "see": 193, "shhh": 194, "shirt": 195, "shoe": 196, "shower": 197, "sick": 198, "sleep": 199,
"sleepy": 200, "smile": 201, "snack": 202, "snow": 203, "stairs": 204, "stay": 205, "sticky": 206, "store": 207,
"story": 208, "stuck": 209, "sun": 210, "table": 211, "talk": 212, "taste": 213, "thankyou": 214, "that": 215,
"there": 216, "think": 217, "thirsty": 218, "tiger": 219, "time": 220, "tomorrow": 221, "tongue": 222, "tooth": 223,
"toothbrush": 224, "touch": 225, "toy": 226, "tree": 227, "uncle": 228, "underwear": 229, "up": 230, "vacuum": 231,
"wait": 232, "wake": 233, "water": 234, "wet": 235, "weus": 236, "where": 237, "white": 238, "who": 239, "why": 240,
"will": 241, "wolf": 242, "yellow": 243, "yes": 244, "yesterday": 245, "yourself": 246, "yucky": 247, "zebra": 248,
"zipper": 249
}
inv_index_to_class = {v: k for k, v in index_to_class.items()}
mp_holistic = mp.solutions.holistic
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                   # Image is no longer writeable
    results = model.process(image)                  # Make prediction
    image.flags.writeable = True                    # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # COLOR CONVERSION RGB 2 BGR
    return image, results
def extract_keypoints(results):
    # Each landmark group falls back to a NaN-filled placeholder of the expected
    # size when MediaPipe returns no landmarks for it
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]) \
        if results.face_landmarks else np.full((468, 3), np.nan)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]) \
        if results.left_hand_landmarks else np.full((21, 3), np.nan)
    pose = np.array([[res.x, res.y, res.z] for res in results.pose_landmarks.landmark]) \
        if results.pose_landmarks else np.full((33, 3), np.nan)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]) \
        if results.right_hand_landmarks else np.full((21, 3), np.nan)
    # Concatenate in the MediaPipe Holistic row order (face, left hand, pose, right hand)
    # to obtain one frame of shape (N_ROWS, N_DIMS) = (543, 3)
    return np.concatenate([face, lh, pose, rh])
# Make prediction
def make_prediction(processed_landmarks):
    inputs = np.array(processed_landmarks, dtype=np.float32)
    # Set the input tensor for the TFLite model
    interpreter.set_tensor(input_details[0]['index'], inputs)
    # Invoke the TFLite interpreter to perform inference
    interpreter.invoke()
    # Get the output tensor of the TFLite model
    output_data = interpreter.get_tensor(output_details[0]['index'])
    # Find the index of the predicted class
    index = np.argmax(output_data)
    # Map the index to the corresponding class label using the inv_index_to_class dictionary
    prediction = inv_index_to_class[index]
    return prediction
# ...
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    # Take a webcam frame as input and return the prediction string
    def predict_with_webcam(frame):
        if frame is None:
            raise ValueError("Frame is None. Make sure your webcam is working properly.")
        # Make detections using MediaPipe
        image, results = mediapipe_detection(frame, holistic)
        print(results)
        if results is not None and results.face_landmarks is not None:
            landmarks = extract_keypoints(results)
            if landmarks is not None:
                # Reshape the single frame to (1, N_ROWS, N_DIMS) and cast to float32
                landmarks = landmarks.reshape(-1, N_ROWS, N_DIMS).astype(np.float32)
                # Initialize PreprocessLayer
                preprocess_layer = PreprocessLayer()
                # Call the PreprocessLayer to preprocess the landmarks
                processed_landmarks, _ = preprocess_layer.call(landmarks)
                # Pass the preprocessed landmarks to make_prediction
                prediction = make_prediction(processed_landmarks)
                print("Prediction:", prediction)
                return prediction
            else:
                return "Could not detect landmarks or extract keypoints. Make sure your webcam is working properly."
        else:
            return "Could not detect face landmarks. Make sure your webcam is working properly."

    # Define the Gradio interface
    iface = gr.Interface(
        fn=predict_with_webcam,       # The function to use for prediction
        inputs="webcam",              # Capture frames from the webcam
        outputs=gr.outputs.Textbox()  # Display the prediction as text
    )

    # Launch the interface
    iface.launch()