Spaces: Runtime error
Commit 2aae5c8 • 1 Parent(s): bd4424b
"using old code"

app.py CHANGED
@@ -1,104 +1,14 @@
+# Import the required libraries
+import cv2
+import numpy as np
 import os
-
-import mediapipe as mp
+import gradio as gr
 import tensorflow as tf
+import tensorflow.lite as tflite
+import mediapipe as mp
 
-
-
-DIM_NAMES = ['x', 'y', 'z']
-SEED = 42
-NUM_CLASSES = 250
-INPUT_SIZE = 32
-
-
-# Tensorflow layer to process data in TFLite
-# Data needs to be processed in the model itself, so we cannot use Python
-class PreprocessLayer(tf.keras.layers.Layer):
-    def __init__(self):
-        super(PreprocessLayer, self).__init__()
-
-    def pad_edge(self, t, repeats, side):
-        if side == 'LEFT':
-            return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
-        elif side == 'RIGHT':
-            return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)
-
-    @tf.function(
-        input_signature=(tf.TensorSpec(shape=[None, N_ROWS, N_DIMS], dtype=tf.float32),),
-    )
-    def call(self, data0):
-        # Number of Frames in Video
-        N_FRAMES0 = tf.shape(data0)[0]
-
-        # Filter Out Frames With Empty Hand Data
-        frames_hands_nansum = tf.experimental.numpy.nanmean(tf.gather(data0, HAND_IDXS0, axis=1), axis=[1, 2])
-        non_empty_frames_idxs = tf.where(frames_hands_nansum > 0)
-        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
-        data = tf.gather(data0, non_empty_frames_idxs, axis=0)
-
-        # Cast Indices to float32 to be compatible with Tensorflow Lite
-        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)
-
-        # Number of Frames in Filtered Video
-        N_FRAMES = tf.shape(data)[0]
-
-        # Gather Relevant Landmark Columns
-        data = tf.gather(data, LANDMARK_IDXS0, axis=1)
-
-        # Video fits in INPUT_SIZE
-        if N_FRAMES < INPUT_SIZE:
-            # Pad With -1 to indicate padding
-            non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE - N_FRAMES]], constant_values=-1)
-            # Pad Data With Zeros
-            data = tf.pad(data, [[0, INPUT_SIZE - N_FRAMES], [0, 0], [0, 0]], constant_values=0)
-            # Fill NaN Values With 0
-            data = tf.where(tf.math.is_nan(data), 0.0, data)
-            return data, non_empty_frames_idxs
-        # Video needs to be downsampled to INPUT_SIZE
-        else:
-            # Repeat
-            if N_FRAMES < INPUT_SIZE ** 2:
-                repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
-                data = tf.repeat(data, repeats=repeats, axis=0)
-                non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)
-
-            # Pad To Multiple Of Input Size
-            pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
-            if tf.math.mod(len(data), INPUT_SIZE) > 0:
-                pool_size += 1
-
-            if pool_size == 1:
-                pad_size = (pool_size * INPUT_SIZE) - len(data)
-            else:
-                pad_size = (pool_size * INPUT_SIZE) % len(data)
-
-            # Pad Start/End with Start/End value
-            pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
-            pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
-            if tf.math.mod(pad_size, 2) > 0:
-                pad_right += 1
-
-            # Pad By Concatenating Left/Right Edge Values
-            data = self.pad_edge(data, pad_left, 'LEFT')
-            data = self.pad_edge(data, pad_right, 'RIGHT')
-
-            # Pad Non Empty Frame Indices
-            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
-            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')
-
-            # Reshape to Mean Pool
-            data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
-            non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])
-
-            # Mean Pool
-            data = tf.experimental.numpy.nanmean(data, axis=1)
-            non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)
-
-            # Fill NaN Values With 0
-            data = tf.where(tf.math.is_nan(data), 0.0, data)
-
-            return data, non_empty_frames_idxs
-
+# Initialize MediaPipe solutions
+mp_holistic = mp.solutions.holistic
 
 # Get the absolute path to the directory containing app.py
 current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -106,146 +16,61 @@ current_dir = os.path.dirname(os.path.abspath(__file__))
 model_filename = "model.tflite"
 # Construct the full path to the TFLite model file
 model_path = os.path.join(current_dir, model_filename)
+
 # Load the TFLite model using the interpreter
 interpreter = tf.lite.Interpreter(model_path=model_path)
 interpreter.allocate_tensors()
 
-#
-input_details = interpreter.get_input_details()
-output_details = interpreter.get_output_details()
-
-index_to_class = {
-    "TV": 0, "after": 1, "airplane": 2, "all": 3, "alligator": 4, "animal": 5, "another": 6, "any": 7, "apple": 8,
-    "arm": 9, "aunt": 10, "awake": 11, "backyard": 12, "bad": 13, "balloon": 14, "bath": 15, "because": 16, "bed": 17,
-    "bedroom": 18, "bee": 19, "before": 20, "beside": 21, "better": 22, "bird": 23, "black": 24, "blow": 25, "blue": 26,
-    "boat": 27, "book": 28, "boy": 29, "brother": 30, "brown": 31, "bug": 32, "bye": 33, "callonphone": 34, "can": 35,
-    "car": 36, "carrot": 37, "cat": 38, "cereal": 39, "chair": 40, "cheek": 41, "child": 42, "chin": 43,
-    "chocolate": 44, "clean": 45, "close": 46, "closet": 47, "cloud": 48, "clown": 49, "cow": 50, "cowboy": 51,
-    "cry": 52, "cut": 53, "cute": 54, "dad": 55, "dance": 56, "dirty": 57, "dog": 58, "doll": 59, "donkey": 60,
-    "down": 61, "drawer": 62, "drink": 63, "drop": 64, "dry": 65, "dryer": 66, "duck": 67, "ear": 68, "elephant": 69,
-    "empty": 70, "every": 71, "eye": 72, "face": 73, "fall": 74, "farm": 75, "fast": 76, "feet": 77, "find": 78,
-    "fine": 79, "finger": 80, "finish": 81, "fireman": 82, "first": 83, "fish": 84, "flag": 85, "flower": 86,
-    "food": 87, "for": 88, "frenchfries": 89, "frog": 90, "garbage": 91, "gift": 92, "giraffe": 93, "girl": 94,
-    "give": 95, "glasswindow": 96, "go": 97, "goose": 98, "grandma": 99, "grandpa": 100, "grass": 101, "green": 102,
-    "gum": 103, "hair": 104, "happy": 105, "hat": 106, "hate": 107, "have": 108, "haveto": 109, "head": 110,
-    "hear": 111, "helicopter": 112, "hello": 113, "hen": 114, "hesheit": 115, "hide": 116, "high": 117, "home": 118,
-    "horse": 119, "hot": 120, "hungry": 121, "icecream": 122, "if": 123, "into": 124, "jacket": 125, "jeans": 126,
-    "jump": 127, "kiss": 128, "kitty": 129, "lamp": 130, "later": 131, "like": 132, "lion": 133, "lips": 134,
-    "listen": 135, "look": 136, "loud": 137, "mad": 138, "make": 139, "man": 140, "many": 141, "milk": 142,
-    "minemy": 143, "mitten": 144, "mom": 145, "moon": 146, "morning": 147, "mouse": 148, "mouth": 149, "nap": 150,
-    "napkin": 151, "night": 152, "no": 153, "noisy": 154, "nose": 155, "not": 156, "now": 157, "nuts": 158, "old": 159,
-    "on": 160, "open": 161, "orange": 162, "outside": 163, "owie": 164, "owl": 165, "pajamas": 166, "pen": 167,
-    "pencil": 168, "penny": 169, "person": 170, "pig": 171, "pizza": 172, "please": 173, "police": 174, "pool": 175,
-    "potty": 176, "pretend": 177, "pretty": 178, "puppy": 179, "puzzle": 180, "quiet": 181, "radio": 182, "rain": 183,
-    "read": 184, "red": 185, "refrigerator": 186, "ride": 187, "room": 188, "sad": 189, "same": 190, "say": 191,
-    "scissors": 192, "see": 193, "shhh": 194, "shirt": 195, "shoe": 196, "shower": 197, "sick": 198, "sleep": 199,
-    "sleepy": 200, "smile": 201, "snack": 202, "snow": 203, "stairs": 204, "stay": 205, "sticky": 206, "store": 207,
-    "story": 208, "stuck": 209, "sun": 210, "table": 211, "talk": 212, "taste": 213, "thankyou": 214, "that": 215,
-    "there": 216, "think": 217, "thirsty": 218, "tiger": 219, "time": 220, "tomorrow": 221, "tongue": 222, "tooth": 223,
-    "toothbrush": 224, "touch": 225, "toy": 226, "tree": 227, "uncle": 228, "underwear": 229, "up": 230, "vacuum": 231,
-    "wait": 232, "wake": 233, "water": 234, "wet": 235, "weus": 236, "where": 237, "white": 238, "who": 239, "why": 240,
-    "will": 241, "wolf": 242, "yellow": 243, "yes": 244, "yesterday": 245, "yourself": 246, "yucky": 247, "zebra": 248,
-    "zipper": 249
-}
-
-inv_index_to_class = {v: k for k, v in index_to_class.items()}
-
-mp_holistic = mp.solutions.holistic
-
+# ... (other functions from previous code)
 
+# Function to perform holistic detection using Mediapipe
 def mediapipe_detection(image, model):
     # COLOR CONVERSION BGR 2 RGB
     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-    image.flags.writeable = False
-    results = model.process(image)
-    image.flags.writeable = True
+    image.flags.writeable = False  # Image is no longer writeable
+    results = model.process(image)  # Make prediction
+    image.flags.writeable = True  # Image is now writeable
     image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # COLOR CONVERSION RGB 2 BGR
     return image, results
 
-
+# Function to extract keypoints from Mediapipe results
 def extract_keypoints(results):
-
-    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten(
-
-    face = np.
-
-    # Find the index of the predicted class
-    index = np.argmax(output_data)
-
-    # Map the index to the corresponding class label using the index_to_class dictionary
-    prediction = inv_index_to_class[index]
-
-    return prediction
-
-
-# ...
-
-with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
-    import cv2
-    import numpy as np
-    import gradio as gr
-    import tensorflow as tf
-
-    # Modify the predict_with_webcam function to take an image as input and return the prediction string
-    def predict_with_webcam(frame):
-        if frame is None:
-            raise ValueError("Frame is None. Make sure your webcam is working properly.")
-
-        # Make detections using mediapipe
-        image, results = mediapipe_detection(frame, holistic)
-        print(results)
-
-        if results is not None and results.face_landmarks is not None:
-            landmarks = extract_keypoints(results)
-            if landmarks is not None:
-                # Calculate the number of landmarks per frame
-                landmarks_per_frame = len(landmarks) // (N_ROWS * N_DIMS)
-                # Reshape the landmarks to have shape (None, N_ROWS, N_DIMS)
-                landmarks = landmarks.reshape(-1, landmarks_per_frame, N_DIMS)
-                # Initialize PreprocessLayer
-                preprocess_layer = PreprocessLayer()
-                # Call the PreprocessLayer to preprocess the landmarks
-                processed_landmarks, _ = preprocess_layer.call(landmarks)
-                prediction = make_prediction(processed_landmarks)  # Pass the preprocessed landmarks to make_prediction
-                print("Prediction:", prediction)
-                return prediction
-            else:
-                return "Could not detect landmarks or extract keypoints. Make sure your webcam is working properly."
-        else:
-            return "Could not detect face landmarks. Make sure your webcam is working properly."
-
-
-# Define the Gradio interface
-iface = gr.Interface(
+    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten(
+    ) if results.left_hand_landmarks else np.zeros(21*3)
+    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten(
+    ) if results.right_hand_landmarks else np.zeros(21*3)
+    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten(
+    ) if results.pose_landmarks else np.zeros(33*4)
+    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten(
+    ) if results.face_landmarks else np.zeros(468*3)
+    return np.concatenate([lh, rh, pose, face])
+
+# Main prediction function that combines everything
+def predict_with_webcam(frame):
+    # Perform holistic detection
+    image, results = mediapipe_detection(frame, holistic)
+    # Extract keypoints
+    keypoints = extract_keypoints(results)
+    if np.count_nonzero(keypoints) > 0:
+        # Preprocess keypoints and make prediction
+        processed_landmarks = np.array([keypoints], dtype=np.float32)
+        interpreter.set_tensor(input_details[0]['index'], processed_landmarks)
+        interpreter.invoke()
+        outputs = interpreter.get_tensor(output_details[0]['index'])
+        prediction = outputs[0].argmax()
+        return str(prediction)
+
+# Define the Gradio interface with the Webcam input and Text output
+webcam_interface = gr.Interface(
     fn=predict_with_webcam,
-    inputs=gr.inputs.Image(shape=(
-    outputs=
+    inputs=gr.inputs.Image(shape=(480, 640), source="webcam"),
+    outputs="text",
+    live=True,
+    interpretation="default",
+    title="Webcam Landmark Prediction",
+    description="Make predictions using landmarks extracted from your webcam stream.",
 )
 
-
+# Launch the Gradio app with the webcam interface
+if __name__ == "__main__":
+    webcam_interface.launch()
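The new extract_keypoints returns a fixed-width vector of 21*3 + 21*3 + 33*4 + 468*3 = 1662 values per frame, and predict_with_webcam wraps it as a single batch of shape (1, 1662). That only works if model.tflite actually declares that input signature; the deleted pipeline instead fed (frames, N_ROWS, N_DIMS) tensors through PreprocessLayer. A self-contained sketch for checking the model's expectation up front, assuming model.tflite sits next to the script:

    import tensorflow as tf

    interpreter = tf.lite.Interpreter(model_path="model.tflite")
    interpreter.allocate_tensors()
    expected = tuple(interpreter.get_input_details()[0]['shape'])
    provided = (1, 21*3 + 21*3 + 33*4 + 468*3)  # (1, 1662) from extract_keypoints
    print("model expects", expected, "while the app provides", provided)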
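As committed, predict_with_webcam still reads input_details, output_details, and holistic, but this change deletes the only lines that created all three (the interpreter detail lookups and the `with mp_holistic.Holistic(...)` block), so the first webcam frame raises a NameError, which is consistent with the "Runtime error" badge on the Space. Restoring them after interpreter.allocate_tensors() would look like this; the Holistic arguments mirror the deleted code, and a long-lived module-level instance replaces the old `with` block so it survives across Gradio calls:

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    # One Holistic instance for the lifetime of the app
    holistic = mp_holistic.Holistic(min_detection_confidence=0.5,
                                    min_tracking_confidence=0.5)

Note also that predict_with_webcam implicitly returns None when no keypoints are detected; the previous revision returned an explanatory message string in that case.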
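Finally, gr.inputs.Image(shape=..., source=...) and interpretation="default" use Gradio's legacy input API: the gr.inputs namespace and the interpretation argument were deprecated in the Gradio 3.x line and removed in Gradio 4, so the Space needs a pinned Gradio version for this file to import at all. A component-style equivalent, assuming a Gradio 3.x runtime:

    import gradio as gr

    webcam_interface = gr.Interface(
        fn=predict_with_webcam,
        inputs=gr.Image(source="webcam"),  # component API; resize inside fn if a fixed shape is needed
        outputs="text",
        live=True,
        title="Webcam Landmark Prediction",
        description="Make predictions using landmarks extracted from your webcam stream.",
    )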
|