Spaces · Runtime error

Commit e32630d · JefferyJapheth committed
1 parent: 455e90a
app_final_year
app.py
ADDED
@@ -0,0 +1,310 @@
import os
import time

import cv2
import gradio as gr
import mediapipe as mp
import numpy as np
from matplotlib import pyplot as plt

mp_holistic = mp.solutions.holistic

# Import TensorFlow
import tensorflow as tf

N_ROWS = 543
N_DIMS = 3
DIM_NAMES = ['x', 'y', 'z']
SEED = 42
NUM_CLASSES = 250
INPUT_SIZE = 64

BATCH_ALL_SIGNS_N = 4
BATCH_SIZE = 256
N_EPOCHS = 100
LR_MAX = 1e-3
N_WARMUP_EPOCHS = 0
WD_RATIO = 0.05
MASK_VAL = 4237

USE_TYPES = ['left_hand', 'pose', 'right_hand']
START_IDX = 468
LIPS_IDXS0 = np.array([
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
])
# Landmark indices in original data
LEFT_HAND_IDXS0 = np.arange(468, 489)
RIGHT_HAND_IDXS0 = np.arange(522, 543)
LEFT_POSE_IDXS0 = np.array([502, 504, 506, 508, 510])
RIGHT_POSE_IDXS0 = np.array([503, 505, 507, 509, 511])
LANDMARK_IDXS_LEFT_DOMINANT0 = np.concatenate((LIPS_IDXS0, LEFT_HAND_IDXS0, LEFT_POSE_IDXS0))
LANDMARK_IDXS_RIGHT_DOMINANT0 = np.concatenate((LIPS_IDXS0, RIGHT_HAND_IDXS0, RIGHT_POSE_IDXS0))
HAND_IDXS0 = np.concatenate((LEFT_HAND_IDXS0, RIGHT_HAND_IDXS0), axis=0)
N_COLS = LANDMARK_IDXS_LEFT_DOMINANT0.size
# Landmark indices in processed data
LIPS_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LIPS_IDXS0)).squeeze()
LEFT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_HAND_IDXS0)).squeeze()
RIGHT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, RIGHT_HAND_IDXS0)).squeeze()
HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, HAND_IDXS0)).squeeze()
POSE_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_POSE_IDXS0)).squeeze()

print(f'# HAND_IDXS: {len(HAND_IDXS)}, N_COLS: {N_COLS}')

LIPS_START = 0
LEFT_HAND_START = LIPS_IDXS.size
RIGHT_HAND_START = LEFT_HAND_START + LEFT_HAND_IDXS.size
POSE_START = RIGHT_HAND_START + RIGHT_HAND_IDXS.size

print(f'LIPS_START: {LIPS_START}, LEFT_HAND_START: {LEFT_HAND_START}, '
      f'RIGHT_HAND_START: {RIGHT_HAND_START}, POSE_START: {POSE_START}')

def mediapipe_detection(image, model):
    # Color conversion BGR -> RGB (MediaPipe expects RGB)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False  # Image is no longer writeable
    results = model.process(image)  # Make prediction
    image.flags.writeable = True  # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # Color conversion RGB -> BGR
    return image, results


def extract_keypoints(results):
    lh = (np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten()
          if results.left_hand_landmarks else np.zeros(21 * 3))
    rh = (np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten()
          if results.right_hand_landmarks else np.zeros(21 * 3))
    pose = (np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten()
            if results.pose_landmarks else np.zeros(33 * 4))
    face = (np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten()
            if results.face_landmarks else np.zeros(468 * 3))
    return np.concatenate([lh, rh, pose, face])


# Local webcam debug loop for visually checking holistic detections.
# Skipped by default so the app can start headless (a hosted Space has no
# webcam, and an unguarded loop here would never terminate); set
# LOCAL_WEBCAM_DEBUG=1 to enable it on a desktop machine.
if os.environ.get('LOCAL_WEBCAM_DEBUG'):
    cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)

    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # Read feed
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)
            print(results)

    cap.release()
    cv2.destroyAllWindows()

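
# The prediction path below relies on `interpreter`, `input_details`,
# `output_details` and `inv_index_to_class`, which are not defined anywhere
# else in this file, so they are loaded here. This is a minimal sketch: the
# model path and label-map filename are assumptions and should point at the
# files actually shipped with this Space.
import json

MODEL_PATH = 'model.tflite'  # assumed filename
LABEL_MAP_PATH = 'sign_to_prediction_index_map.json'  # assumed filename

interpreter = tf.lite.Interpreter(model_path=MODEL_PATH)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# The label map JSON maps sign name -> class index; invert it so a predicted
# index can be decoded back to its sign name.
with open(LABEL_MAP_PATH) as f:
    sign_to_index = json.load(f)
inv_index_to_class = {v: k for k, v in sign_to_index.items()}
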
# Function to make predictions using the TensorFlow Lite model
def make_prediction(processed_landmarks):
    inputs = np.array(processed_landmarks, dtype=np.float32)

    # Set the input tensor for the TFLite model
    interpreter.set_tensor(input_details[0]['index'], inputs)

    # Invoke the TFLite interpreter to perform inference
    interpreter.invoke()

    # Get the output tensor of the TFLite model
    output_data = interpreter.get_tensor(output_details[0]['index'])

    # Find the index of the predicted class
    index = np.argmax(output_data)

    # Map the index to the corresponding class label using the inv_index_to_class dictionary
    prediction = inv_index_to_class[index]

    return prediction

class PreprocessLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(PreprocessLayer, self).__init__()
        normalisation_correction = tf.constant([
            # Add 0.50 to the left hand (original right hand) and subtract 0.50 from the right hand (original left hand)
            [0] * len(LIPS_IDXS) + [0.50] * len(LEFT_HAND_IDXS) + [0.50] * len(POSE_IDXS),
            # Y coordinates stay intact
            [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
            # Z coordinates stay intact
            [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
        ], dtype=tf.float32)
        self.normalisation_correction = tf.transpose(normalisation_correction, [1, 0])

    def pad_edge(self, t, repeats, side):
        if side == 'LEFT':
            return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
        elif side == 'RIGHT':
            return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None, N_ROWS, N_DIMS], dtype=tf.float32),),
    )
    def call(self, data0):
        # Number of frames in the video
        N_FRAMES0 = tf.shape(data0)[0]

        # Find the dominant hand by comparing the counts of non-NaN hand coordinates
        left_hand_sum = tf.math.reduce_sum(
            tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1))
        right_hand_sum = tf.math.reduce_sum(
            tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1))
        left_dominant = left_hand_sum >= right_hand_sum

        # Count non-NaN hand values in each frame for the dominant hand
        if left_dominant:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1),
                axis=[1, 2],
            )
        else:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1),
                axis=[1, 2],
            )

        # Find indices of frames containing coordinates of the dominant hand
        non_empty_frames_idxs = tf.where(frames_hands_non_nan_sum > 0)
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
        # Filter frames
        data = tf.gather(data0, non_empty_frames_idxs, axis=0)

        # Cast indices to float32 to be compatible with TensorFlow Lite
        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)
        # Normalize to start at 0
        non_empty_frames_idxs -= tf.reduce_min(non_empty_frames_idxs)

        # Number of frames in the filtered video
        N_FRAMES = tf.shape(data)[0]

        # Gather relevant landmark columns
        if left_dominant:
            data = tf.gather(data, LANDMARK_IDXS_LEFT_DOMINANT0, axis=1)
        else:
            data = tf.gather(data, LANDMARK_IDXS_RIGHT_DOMINANT0, axis=1)
            # Mirror x coordinates so right-dominant signs match the left-dominant layout
            data = (
                self.normalisation_correction + (
                    (data - self.normalisation_correction)
                    * tf.where(self.normalisation_correction != 0, -1.0, 1.0))
            )

        # Video fits into INPUT_SIZE frames
        if N_FRAMES < INPUT_SIZE:
            # Pad frame indices with -1 to indicate padding
            non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE - N_FRAMES]],
                                           constant_values=-1)
            # Pad data with zeros
            data = tf.pad(data, [[0, INPUT_SIZE - N_FRAMES], [0, 0], [0, 0]], constant_values=0)
            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs
        # Video needs to be downsampled to INPUT_SIZE
        else:
            # Repeat
            if N_FRAMES < INPUT_SIZE ** 2:
                repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
                data = tf.repeat(data, repeats=repeats, axis=0)
                non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)

            # Pad to a multiple of INPUT_SIZE
            pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
            if tf.math.mod(len(data), INPUT_SIZE) > 0:
                pool_size += 1

            if pool_size == 1:
                pad_size = (pool_size * INPUT_SIZE) - len(data)
            else:
                pad_size = (pool_size * INPUT_SIZE) % len(data)

            # Pad start/end with the start/end value
            pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            if tf.math.mod(pad_size, 2) > 0:
                pad_right += 1

            # Pad by concatenating left/right edge values
            data = self.pad_edge(data, pad_left, 'LEFT')
            data = self.pad_edge(data, pad_right, 'RIGHT')

            # Pad non-empty frame indices
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')

            # Reshape for mean pooling
            data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
            non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])

            # Mean pool
            data = tf.experimental.numpy.nanmean(data, axis=1)
            non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)

            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)

            return data, non_empty_frames_idxs


preprocess_layer = PreprocessLayer()
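
# Minimal sanity-check sketch: push a dummy 10-frame clip of 543 NaN
# landmarks, with the left-hand rows filled in, through the layer and
# confirm the padded output shape of (INPUT_SIZE, N_COLS, N_DIMS).
_dummy_clip = np.full((10, N_ROWS, N_DIMS), np.nan, dtype=np.float32)
_dummy_clip[:, LEFT_HAND_IDXS0, :] = 0.5  # pretend the left hand was detected
_dummy_data, _dummy_idxs = preprocess_layer(_dummy_clip)
print('PreprocessLayer output shape:', _dummy_data.shape)  # (64, 66, 3) with the defaults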


def translate_sign_language(image):
    # Convert the frame to RGB (MediaPipe expects RGB images)
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Run MediaPipe Holistic once; its result object carries the face, hand
    # and pose landmarks together, matching the 543-row landmark layout the
    # model was trained on. (The original draft mixed mp.solutions.hands and
    # mp.solutions.pose results, which extract_keypoints() cannot consume.)
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5) as holistic:
        results = holistic.process(rgb_frame)

    # Assemble a single (1, 543, 3) frame in the row order defined by the
    # index constants above: face (0-467), left hand (468-488),
    # pose (489-521), right hand (522-542). Undetected parts become NaN,
    # which PreprocessLayer handles explicitly.
    def landmarks_to_array(landmarks, n_rows):
        if landmarks is None:
            return np.full((n_rows, 3), np.nan, dtype=np.float32)
        return np.array([[lm.x, lm.y, lm.z] for lm in landmarks.landmark], dtype=np.float32)

    frame_landmarks = np.concatenate([
        landmarks_to_array(results.face_landmarks, 468),
        landmarks_to_array(results.left_hand_landmarks, 21),
        landmarks_to_array(results.pose_landmarks, 33),
        landmarks_to_array(results.right_hand_landmarks, 21),
    ], axis=0)[np.newaxis, :, :]

    # Preprocess the landmark frame for the TFLite model
    preprocessed_data, _ = preprocess_layer(frame_landmarks)

    # Make a prediction using the processed landmarks
    translated_text = make_prediction(preprocessed_data)

    # Return the translated text
    return translated_text
+
|
297 |
+
|
298 |
+
gr_interface = gr.Interface(fn=translate_sign_language,
|
299 |
+
inputs="webcam", # Input from webcam
|
300 |
+
outputs="text", # Output as text
|
301 |
+
#capture_session=True, # To properly release the webcam after running the interface
|
302 |
+
live=True, # Show live webcam feed
|
303 |
+
title="Sign Language Translation",
|
304 |
+
description="Translate sign language to text using TensorFlow Lite and Mediapipe.")
|
305 |
+
|
306 |
+
gr_interface.launch(share=True)
|
307 |
+
|
308 |
+
cap.release()
|
309 |
+
cv2.destroyAllWindows()
|
310 |
+
|