Spaces · Runtime error

Commit e32630d · JefferyJapheth committed
1 parent: 455e90a
app_final_year
app.py
ADDED
@@ -0,0 +1,310 @@
import os
import time

import cv2
import gradio as gr
import mediapipe as mp
import numpy as np
from matplotlib import pyplot as plt

mp_holistic = mp.solutions.holistic

# Import TensorFlow
import tensorflow as tf

N_ROWS = 543
N_DIMS = 3
DIM_NAMES = ['x', 'y', 'z']
SEED = 42
NUM_CLASSES = 250
INPUT_SIZE = 64

BATCH_ALL_SIGNS_N = 4
BATCH_SIZE = 256
N_EPOCHS = 100
LR_MAX = 1e-3
N_WARMUP_EPOCHS = 0
WD_RATIO = 0.05
MASK_VAL = 4237

USE_TYPES = ['left_hand', 'pose', 'right_hand']
START_IDX = 468
LIPS_IDXS0 = np.array([
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
])
# Landmark indices in original data
LEFT_HAND_IDXS0 = np.arange(468, 489)
RIGHT_HAND_IDXS0 = np.arange(522, 543)
LEFT_POSE_IDXS0 = np.array([502, 504, 506, 508, 510])
RIGHT_POSE_IDXS0 = np.array([503, 505, 507, 509, 511])
LANDMARK_IDXS_LEFT_DOMINANT0 = np.concatenate((LIPS_IDXS0, LEFT_HAND_IDXS0, LEFT_POSE_IDXS0))
LANDMARK_IDXS_RIGHT_DOMINANT0 = np.concatenate((LIPS_IDXS0, RIGHT_HAND_IDXS0, RIGHT_POSE_IDXS0))
HAND_IDXS0 = np.concatenate((LEFT_HAND_IDXS0, RIGHT_HAND_IDXS0), axis=0)
N_COLS = LANDMARK_IDXS_LEFT_DOMINANT0.size
# Landmark indices in processed data
LIPS_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LIPS_IDXS0)).squeeze()
LEFT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_HAND_IDXS0)).squeeze()
RIGHT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, RIGHT_HAND_IDXS0)).squeeze()
HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, HAND_IDXS0)).squeeze()
POSE_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_POSE_IDXS0)).squeeze()

print(f'# HAND_IDXS: {len(HAND_IDXS)}, N_COLS: {N_COLS}')

LIPS_START = 0
LEFT_HAND_START = LIPS_IDXS.size
RIGHT_HAND_START = LEFT_HAND_START + LEFT_HAND_IDXS.size
POSE_START = RIGHT_HAND_START + RIGHT_HAND_IDXS.size

print(f'LIPS_START: {LIPS_START}, LEFT_HAND_START: {LEFT_HAND_START}, '
      f'RIGHT_HAND_START: {RIGHT_HAND_START}, POSE_START: {POSE_START}')

def mediapipe_detection(image, model):
    # Color conversion BGR -> RGB (MediaPipe expects RGB)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False  # Image is no longer writeable
    results = model.process(image)  # Make prediction
    image.flags.writeable = True  # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # Color conversion RGB -> BGR
    return image, results


def extract_keypoints(results):
    lh = (np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten()
          if results.left_hand_landmarks else np.zeros(21 * 3))
    rh = (np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten()
          if results.right_hand_landmarks else np.zeros(21 * 3))
    pose = (np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten()
            if results.pose_landmarks else np.zeros(33 * 4))
    face = (np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten()
            if results.face_landmarks else np.zeros(468 * 3))
    return np.concatenate([lh, rh, pose, face])


# Local webcam debug loop for visually checking holistic detections.
# Skipped by default so the app can start headless (a hosted Space has no
# webcam, and an unguarded loop here would never terminate); set
# LOCAL_WEBCAM_DEBUG=1 to enable it on a desktop machine.
if os.environ.get('LOCAL_WEBCAM_DEBUG'):
    cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)

    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # Read feed
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, holistic)
            print(results)

    cap.release()
    cv2.destroyAllWindows()

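
# The prediction path below relies on `interpreter`, `input_details`,
# `output_details` and `inv_index_to_class`, which are not defined anywhere
# else in this file, so they are loaded here. This is a minimal sketch: the
# model path and label-map filename are assumptions and should point at the
# files actually shipped with this Space.
import json

MODEL_PATH = 'model.tflite'  # assumed filename
LABEL_MAP_PATH = 'sign_to_prediction_index_map.json'  # assumed filename

interpreter = tf.lite.Interpreter(model_path=MODEL_PATH)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# The label map JSON maps sign name -> class index; invert it so a predicted
# index can be decoded back to its sign name.
with open(LABEL_MAP_PATH) as f:
    sign_to_index = json.load(f)
inv_index_to_class = {v: k for k, v in sign_to_index.items()}
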
# Function to make predictions using the TensorFlow Lite model
def make_prediction(processed_landmarks):
    inputs = np.array(processed_landmarks, dtype=np.float32)

    # Set the input tensor for the TFLite model
    interpreter.set_tensor(input_details[0]['index'], inputs)

    # Invoke the TFLite interpreter to perform inference
    interpreter.invoke()

    # Get the output tensor of the TFLite model
    output_data = interpreter.get_tensor(output_details[0]['index'])

    # Find the index of the predicted class
    index = np.argmax(output_data)

    # Map the index to the corresponding class label using the inv_index_to_class dictionary
    prediction = inv_index_to_class[index]

    return prediction

class PreprocessLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(PreprocessLayer, self).__init__()
        normalisation_correction = tf.constant([
            # Add 0.50 to the left hand (original right hand) and subtract 0.50 from the right hand (original left hand)
            [0] * len(LIPS_IDXS) + [0.50] * len(LEFT_HAND_IDXS) + [0.50] * len(POSE_IDXS),
            # Y coordinates stay intact
            [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
            # Z coordinates stay intact
            [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
        ], dtype=tf.float32)
        self.normalisation_correction = tf.transpose(normalisation_correction, [1, 0])

    def pad_edge(self, t, repeats, side):
        if side == 'LEFT':
            return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
        elif side == 'RIGHT':
            return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None, N_ROWS, N_DIMS], dtype=tf.float32),),
    )
    def call(self, data0):
        # Number of frames in the video
        N_FRAMES0 = tf.shape(data0)[0]

        # Find the dominant hand by comparing the counts of non-NaN hand coordinates
        left_hand_sum = tf.math.reduce_sum(
            tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1))
        right_hand_sum = tf.math.reduce_sum(
            tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1))
        left_dominant = left_hand_sum >= right_hand_sum

        # Count non-NaN hand values in each frame for the dominant hand
        if left_dominant:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1),
                axis=[1, 2],
            )
        else:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1),
                axis=[1, 2],
            )

        # Find indices of frames containing coordinates of the dominant hand
        non_empty_frames_idxs = tf.where(frames_hands_non_nan_sum > 0)
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
        # Filter frames
        data = tf.gather(data0, non_empty_frames_idxs, axis=0)

        # Cast indices to float32 to be compatible with TensorFlow Lite
        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)
        # Normalize to start at 0
        non_empty_frames_idxs -= tf.reduce_min(non_empty_frames_idxs)

        # Number of frames in the filtered video
        N_FRAMES = tf.shape(data)[0]

        # Gather relevant landmark columns
        if left_dominant:
            data = tf.gather(data, LANDMARK_IDXS_LEFT_DOMINANT0, axis=1)
        else:
            data = tf.gather(data, LANDMARK_IDXS_RIGHT_DOMINANT0, axis=1)
            # Mirror x coordinates so right-dominant signs match the left-dominant layout
            data = (
                self.normalisation_correction + (
                    (data - self.normalisation_correction)
                    * tf.where(self.normalisation_correction != 0, -1.0, 1.0))
            )

        # Video fits into INPUT_SIZE frames
        if N_FRAMES < INPUT_SIZE:
            # Pad frame indices with -1 to indicate padding
            non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE - N_FRAMES]],
                                           constant_values=-1)
            # Pad data with zeros
            data = tf.pad(data, [[0, INPUT_SIZE - N_FRAMES], [0, 0], [0, 0]], constant_values=0)
            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs
        # Video needs to be downsampled to INPUT_SIZE
        else:
            # Repeat
            if N_FRAMES < INPUT_SIZE ** 2:
                repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
                data = tf.repeat(data, repeats=repeats, axis=0)
                non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)

            # Pad to a multiple of INPUT_SIZE
            pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
            if tf.math.mod(len(data), INPUT_SIZE) > 0:
                pool_size += 1

            if pool_size == 1:
                pad_size = (pool_size * INPUT_SIZE) - len(data)
            else:
                pad_size = (pool_size * INPUT_SIZE) % len(data)

            # Pad start/end with the start/end value
            pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            if tf.math.mod(pad_size, 2) > 0:
                pad_right += 1

            # Pad by concatenating left/right edge values
            data = self.pad_edge(data, pad_left, 'LEFT')
            data = self.pad_edge(data, pad_right, 'RIGHT')

            # Pad non-empty frame indices
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')

            # Reshape for mean pooling
            data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
            non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])

            # Mean pool
            data = tf.experimental.numpy.nanmean(data, axis=1)
            non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)

            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)

            return data, non_empty_frames_idxs


preprocess_layer = PreprocessLayer()
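
# Minimal sanity-check sketch: push a dummy 10-frame clip of 543 NaN
# landmarks, with the left-hand rows filled in, through the layer and
# confirm the padded output shape of (INPUT_SIZE, N_COLS, N_DIMS).
_dummy_clip = np.full((10, N_ROWS, N_DIMS), np.nan, dtype=np.float32)
_dummy_clip[:, LEFT_HAND_IDXS0, :] = 0.5  # pretend the left hand was detected
_dummy_data, _dummy_idxs = preprocess_layer(_dummy_clip)
print('PreprocessLayer output shape:', _dummy_data.shape)  # (64, 66, 3) with the defaults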


def translate_sign_language(image):
    # Convert the frame to RGB (MediaPipe expects RGB images)
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Run MediaPipe Holistic once; its result object carries the face, hand
    # and pose landmarks together, matching the 543-row landmark layout the
    # model was trained on. (The original draft mixed mp.solutions.hands and
    # mp.solutions.pose results, which extract_keypoints() cannot consume.)
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5) as holistic:
        results = holistic.process(rgb_frame)

    # Assemble a single (1, 543, 3) frame in the row order defined by the
    # index constants above: face (0-467), left hand (468-488),
    # pose (489-521), right hand (522-542). Undetected parts become NaN,
    # which PreprocessLayer handles explicitly.
    def landmarks_to_array(landmarks, n_rows):
        if landmarks is None:
            return np.full((n_rows, 3), np.nan, dtype=np.float32)
        return np.array([[lm.x, lm.y, lm.z] for lm in landmarks.landmark], dtype=np.float32)

    frame_landmarks = np.concatenate([
        landmarks_to_array(results.face_landmarks, 468),
        landmarks_to_array(results.left_hand_landmarks, 21),
        landmarks_to_array(results.pose_landmarks, 33),
        landmarks_to_array(results.right_hand_landmarks, 21),
    ], axis=0)[np.newaxis, :, :]

    # Preprocess the landmark frame for the TFLite model
    preprocessed_data, _ = preprocess_layer(frame_landmarks)

    # Make a prediction using the processed landmarks
    translated_text = make_prediction(preprocessed_data)

    # Return the translated text
    return translated_text
+
|
297 |
+
|
298 |
+
gr_interface = gr.Interface(fn=translate_sign_language,
|
299 |
+
inputs="webcam", # Input from webcam
|
300 |
+
outputs="text", # Output as text
|
301 |
+
#capture_session=True, # To properly release the webcam after running the interface
|
302 |
+
live=True, # Show live webcam feed
|
303 |
+
title="Sign Language Translation",
|
304 |
+
description="Translate sign language to text using TensorFlow Lite and Mediapipe.")
|
305 |
+
|
306 |
+
gr_interface.launch(share=True)
|
307 |
+
|
308 |
+
cap.release()
|
309 |
+
cv2.destroyAllWindows()
|
310 |
+
|