JefferyJapheth committed on
Commit
e32630d
1 Parent(s): 455e90a

app_final_year

Files changed (1)
  1. app.py +310 -0
app.py ADDED
@@ -0,0 +1,310 @@
+ import os
+ import time
+ 
+ import cv2
+ import gradio as gr
+ import mediapipe as mp
+ import numpy as np
+ from matplotlib import pyplot as plt
+ 
+ # MediaPipe solution modules: Holistic for the combined face/hand/pose model,
+ # Hands and Pose for the per-part trackers used in translate_sign_language()
+ mp_holistic = mp.solutions.holistic
+ mp_hands = mp.solutions.hands
+ mp_pose = mp.solutions.pose
+ 
+ # Import TensorFlow
+ import tensorflow as tf
+ 
+ # 543 landmarks per frame: 468 face + 21 left hand + 33 pose + 21 right hand
+ N_ROWS = 543
+ N_DIMS = 3
+ DIM_NAMES = ['x', 'y', 'z']
+ SEED = 42
+ NUM_CLASSES = 250
+ INPUT_SIZE = 64
+ 
+ BATCH_ALL_SIGNS_N = 4
+ BATCH_SIZE = 256
+ N_EPOCHS = 100
+ LR_MAX = 1e-3
+ N_WARMUP_EPOCHS = 0
+ WD_RATIO = 0.05
+ MASK_VAL = 4237
+ 
+ USE_TYPES = ['left_hand', 'pose', 'right_hand']
+ START_IDX = 468
+ LIPS_IDXS0 = np.array([
+     61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
+     291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
+     78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
+     95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
+ ])
+ # Landmark indices in original data
+ LEFT_HAND_IDXS0 = np.arange(468, 489)
+ RIGHT_HAND_IDXS0 = np.arange(522, 543)
+ LEFT_POSE_IDXS0 = np.array([502, 504, 506, 508, 510])
+ RIGHT_POSE_IDXS0 = np.array([503, 505, 507, 509, 511])
+ LANDMARK_IDXS_LEFT_DOMINANT0 = np.concatenate((LIPS_IDXS0, LEFT_HAND_IDXS0, LEFT_POSE_IDXS0))
+ LANDMARK_IDXS_RIGHT_DOMINANT0 = np.concatenate((LIPS_IDXS0, RIGHT_HAND_IDXS0, RIGHT_POSE_IDXS0))
+ HAND_IDXS0 = np.concatenate((LEFT_HAND_IDXS0, RIGHT_HAND_IDXS0), axis=0)
+ N_COLS = LANDMARK_IDXS_LEFT_DOMINANT0.size
+ # Landmark indices in processed data
+ LIPS_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LIPS_IDXS0)).squeeze()
+ LEFT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_HAND_IDXS0)).squeeze()
+ RIGHT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, RIGHT_HAND_IDXS0)).squeeze()
+ HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, HAND_IDXS0)).squeeze()
+ POSE_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_POSE_IDXS0)).squeeze()
+ 
+ print(f'# HAND_IDXS: {len(HAND_IDXS)}, N_COLS: {N_COLS}')
+ 
+ LIPS_START = 0
+ LEFT_HAND_START = LIPS_IDXS.size
+ RIGHT_HAND_START = LEFT_HAND_START + LEFT_HAND_IDXS.size
+ POSE_START = RIGHT_HAND_START + RIGHT_HAND_IDXS.size
+ 
+ print(
+     f'LIPS_START: {LIPS_START}, LEFT_HAND_START: {LEFT_HAND_START}, RIGHT_HAND_START: {RIGHT_HAND_START}, POSE_START: {POSE_START}')
+ 
+ 
+ def mediapipe_detection(image, model):
+     # COLOR CONVERSION BGR 2 RGB
+     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+     image.flags.writeable = False  # Image is no longer writeable
+     results = model.process(image)  # Make prediction
+     image.flags.writeable = True  # Image is now writeable
+     image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # COLOR CONVERSION RGB 2 BGR
+     return image, results
+ 
+ 
+ def extract_keypoints(results):
+     lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten(
+     ) if results.left_hand_landmarks else np.zeros(21 * 3)
+     rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten(
+     ) if results.right_hand_landmarks else np.zeros(21 * 3)
+     pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten(
+     ) if results.pose_landmarks else np.zeros(33 * 4)
+     face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten(
+     ) if results.face_landmarks else np.zeros(468 * 3)
+     return np.concatenate([lh, rh, pose, face])
+ 
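+ # Layout of the flattened keypoint vector returned by extract_keypoints():
+ #   left hand  : 21 landmarks * 3 =   63 values (indices    0-62)
+ #   right hand : 21 landmarks * 3 =   63 values (indices  63-125)
+ #   pose       : 33 landmarks * 4 =  132 values (indices 126-257)
+ #   face       : 468 landmarks * 3 = 1404 values (indices 258-1661)
+ 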
+ 
+ # Local webcam test of the Holistic model (CAP_DSHOW is the Windows DirectShow
+ # backend; if no camera is available, cap.isOpened() is False and the loop is skipped)
+ cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
+ 
+ # Set mediapipe model
+ with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
+     while cap.isOpened():
+ 
+         # Read feed
+         ret, frame = cap.read()
+         if not ret:
+             break
+ 
+         # Make detections
+         image, results = mediapipe_detection(frame, holistic)
+ 
+         print(results)
+ 
+ 
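+ # Load the TensorFlow Lite model and the label map used by make_prediction()
+ # and translate_sign_language() below. This is a minimal sketch: "model.tflite"
+ # and "sign_to_prediction_index_map.json" are assumed placeholder filenames,
+ # not assets referenced elsewhere in this file, and should point at the
+ # exported sign-language model and its sign-to-index mapping.
+ import json
+ 
+ interpreter = tf.lite.Interpreter(model_path="model.tflite")
+ interpreter.allocate_tensors()
+ input_details = interpreter.get_input_details()
+ output_details = interpreter.get_output_details()
+ 
+ with open("sign_to_prediction_index_map.json") as f:
+     index_to_class = json.load(f)  # maps sign name -> class index
+ # Invert the mapping so a predicted class index can be turned back into a sign name
+ inv_index_to_class = {v: k for k, v in index_to_class.items()}
+ 
+ 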
+ # Function to make predictions using the TensorFlow Lite model
+ def make_prediction(processed_landmarks):
+     inputs = np.array(processed_landmarks, dtype=np.float32)
+ 
+     # Set the input tensor for the TFLite model
+     interpreter.set_tensor(input_details[0]['index'], inputs)
+ 
+     # Invoke the TFLite interpreter to perform inference
+     interpreter.invoke()
+ 
+     # Get the output tensor of the TFLite model
+     output_data = interpreter.get_tensor(output_details[0]['index'])
+ 
+     # Find the index of the predicted class
+     index = np.argmax(output_data)
+ 
+     # Map the index to the corresponding class label using the index_to_class dictionary
+     prediction = inv_index_to_class[index]
+ 
+     return prediction
+ 
+ 
+ class PreprocessLayer(tf.keras.layers.Layer):
+     def __init__(self):
+         super(PreprocessLayer, self).__init__()
+         normalisation_correction = tf.constant([
+             # Add 0.50 to left hand (original right hand) and subtract 0.50 from right hand (original left hand)
+             [0] * len(LIPS_IDXS) + [0.50] * len(LEFT_HAND_IDXS) + [0.50] * len(POSE_IDXS),
+             # Y coordinates stay intact
+             [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
+             # Z coordinates stay intact
+             [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
+         ], dtype=tf.float32)
+         self.normalisation_correction = tf.transpose(normalisation_correction, [1, 0])
+ 
+     def pad_edge(self, t, repeats, side):
+         if side == 'LEFT':
+             return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
+         elif side == 'RIGHT':
+             return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)
+ 
+     @tf.function(
+         input_signature=(tf.TensorSpec(shape=[None, N_ROWS, N_DIMS], dtype=tf.float32),),
+     )
+     def call(self, data0):
+         # Number of Frames in Video
+         N_FRAMES0 = tf.shape(data0)[0]
+ 
+         # Find dominant hand by comparing the number of non-NaN hand coordinates
+         left_hand_sum = tf.math.reduce_sum(
+             tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1))
+         right_hand_sum = tf.math.reduce_sum(
+             tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1))
+         left_dominant = left_hand_sum >= right_hand_sum
+ 
+         # Count non NaN Hand values in each frame for the dominant hand
+         if left_dominant:
+             frames_hands_non_nan_sum = tf.math.reduce_sum(
+                 tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1),
+                 axis=[1, 2],
+             )
+         else:
+             frames_hands_non_nan_sum = tf.math.reduce_sum(
+                 tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1),
+                 axis=[1, 2],
+             )
+ 
+         # Find frame indices with coordinates of the dominant hand
+         non_empty_frames_idxs = tf.where(frames_hands_non_nan_sum > 0)
+         non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
+         # Filter frames
+         data = tf.gather(data0, non_empty_frames_idxs, axis=0)
+ 
+         # Cast indices to float32 to be compatible with TensorFlow Lite
+         non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)
+         # Normalize to start with 0
+         non_empty_frames_idxs -= tf.reduce_min(non_empty_frames_idxs)
+ 
+         # Number of Frames in Filtered Video
+         N_FRAMES = tf.shape(data)[0]
+ 
+         # Gather Relevant Landmark Columns
+         if left_dominant:
+             data = tf.gather(data, LANDMARK_IDXS_LEFT_DOMINANT0, axis=1)
+         else:
+             data = tf.gather(data, LANDMARK_IDXS_RIGHT_DOMINANT0, axis=1)
+             data = (
+                 self.normalisation_correction + (
+                     (data - self.normalisation_correction) * tf.where(self.normalisation_correction != 0, -1.0, 1.0))
+             )
+ 
+         # Video fits in INPUT_SIZE
+         if N_FRAMES < INPUT_SIZE:
+             # Pad With -1 to indicate padding
+             non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE - N_FRAMES]],
+                                            constant_values=-1)
+             # Pad Data With Zeros
+             data = tf.pad(data, [[0, INPUT_SIZE - N_FRAMES], [0, 0], [0, 0]], constant_values=0)
+             # Fill NaN Values With 0
+             data = tf.where(tf.math.is_nan(data), 0.0, data)
+             return data, non_empty_frames_idxs
+         # Video needs to be downsampled to INPUT_SIZE
+         else:
+             # Repeat
+             if N_FRAMES < INPUT_SIZE ** 2:
+                 repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
+                 data = tf.repeat(data, repeats=repeats, axis=0)
+                 non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)
+ 
+             # Pad To Multiple Of Input Size
+             pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
+             if tf.math.mod(len(data), INPUT_SIZE) > 0:
+                 pool_size += 1
+ 
+             if pool_size == 1:
+                 pad_size = (pool_size * INPUT_SIZE) - len(data)
+             else:
+                 pad_size = (pool_size * INPUT_SIZE) % len(data)
+ 
+             # Pad Start/End with Start/End value
+             pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
+             pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
+             if tf.math.mod(pad_size, 2) > 0:
+                 pad_right += 1
+ 
+             # Pad By Concatenating Left/Right Edge Values
+             data = self.pad_edge(data, pad_left, 'LEFT')
+             data = self.pad_edge(data, pad_right, 'RIGHT')
+ 
+             # Pad Non Empty Frame Indices
+             non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
+             non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')
+ 
+             # Reshape to Mean Pool
+             data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
+             non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])
+ 
+             # Mean Pool
+             data = tf.experimental.numpy.nanmean(data, axis=1)
+             non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)
+ 
+             # Fill NaN Values With 0
+             data = tf.where(tf.math.is_nan(data), 0.0, data)
+ 
+             return data, non_empty_frames_idxs
+ 
+ 
+ preprocess_layer = PreprocessLayer()
+ 
+ 
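+ # Example of the shapes PreprocessLayer works with (a sketch using a dummy
+ # input): it expects a float32 tensor of shape (n_frames, N_ROWS, N_DIMS) =
+ # (n_frames, 543, 3) with NaNs for missing landmarks, and returns a
+ # (INPUT_SIZE, N_COLS, N_DIMS) tensor plus the per-slot frame indices.
+ #
+ #     dummy_frames = np.full((10, N_ROWS, N_DIMS), np.nan, dtype=np.float32)
+ #     dummy_frames[:, LEFT_HAND_IDXS0, :] = 0.5
+ #     processed_data, frame_idxs = preprocess_layer(dummy_frames)
+ 
+ 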
+ def translate_sign_language(image):
+     # Convert the frame to RGB (Mediapipe expects RGB images)
+     rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ 
+     with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands_tracker:
+         # Process the frame with Mediapipe Hands
+         hands_results = hands_tracker.process(rgb_frame)
+ 
+     with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose_tracker:
+         # Process the frame with Mediapipe Pose
+         pose_results = pose_tracker.process(rgb_frame)
+ 
+     # Extract keypoints from the results
+     hand_pose_keypoints = extract_keypoints(hands_results)
+     pose_keypoints = extract_keypoints(pose_results)
+ 
+     # Prepare the input data for the TFLite model
+     left_hand_landmarks = hand_pose_keypoints[:63].reshape(1, -1, 3)
+     right_hand_landmarks = hand_pose_keypoints[63:126].reshape(1, -1, 3)
+     pose_landmarks = pose_keypoints[126:].reshape(1, -1, 4)
+ 
+     # Call the PreprocessLayer to preprocess the hand and pose landmark data
+     preprocessed_left_hand, _ = preprocess_layer(left_hand_landmarks)
+     preprocessed_right_hand, _ = preprocess_layer(right_hand_landmarks)
+     preprocessed_pose, _ = preprocess_layer(pose_landmarks)
+ 
+     # Prepare the input data for the TFLite model
+     input_data = [preprocessed_left_hand, preprocessed_right_hand, preprocessed_pose]
+ 
+     # Perform inference using the loaded sign language model
+     interpreter.set_tensor(interpreter.get_input_details()[0]['index'], input_data[0])
+     interpreter.set_tensor(interpreter.get_input_details()[1]['index'], input_data[1])
+     interpreter.set_tensor(interpreter.get_input_details()[2]['index'], input_data[2])
+     interpreter.invoke()
+     output = interpreter.get_tensor(interpreter.get_output_details()[0]['index'])
+ 
+     # Make prediction using the processed landmarks
+     translated_text = make_prediction(output)
+ 
+     # Return the translated text
+     return translated_text
+ 
+ 
+ gr_interface = gr.Interface(fn=translate_sign_language,
+                             inputs="webcam",  # Input from webcam
+                             outputs="text",  # Output as text
+                             # capture_session=True,  # To properly release the webcam after running the interface
+                             live=True,  # Show live webcam feed
+                             title="Sign Language Translation",
+                             description="Translate sign language to text using TensorFlow Lite and Mediapipe.")
+ 
+ gr_interface.launch(share=True)
+ 
+ cap.release()
+ cv2.destroyAllWindows()
+ 