JefferyJapheth committed on
Commit
e156e53
1 Parent(s): 3f94b52

re-modified code with mediapipe and webcam initialized

Browse files
Files changed (1) hide show
  1. app.py +143 -74
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os
2
  import cv2
3
  import gradio as gr
4
  import mediapipe as mp
@@ -6,14 +6,96 @@ import numpy as np
6
  import tensorflow as tf
7
  import tensorflow.lite as tflite
8
 
9
- # Initialize MediaPipe solutions
10
- mp_hands = mp.solutions.hands
11
- mp_pose = mp.solutions.pose
12
- mp_face_mesh = mp.solutions.face_mesh
13
 
14
- hands = mp_hands.Hands()
15
- pose = mp_pose.Pose()
16
- face_mesh = mp_face_mesh.FaceMesh()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  # Get the absolute path to the directory containing app.py
19
  current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -25,50 +107,37 @@ model_path = os.path.join(current_dir, model_filename)
25
  interpreter = tf.lite.Interpreter(model_path=model_path)
26
  interpreter.allocate_tensors()
27
 
 
28
  index_to_class = {
29
  "TV": 0, "after": 1, "airplane": 2, "all": 3, "alligator": 4, "animal": 5, "another": 6, "any": 7, "apple": 8, "arm": 9, "aunt": 10, "awake": 11, "backyard": 12, "bad": 13, "balloon": 14, "bath": 15, "because": 16, "bed": 17, "bedroom": 18, "bee": 19, "before": 20, "beside": 21, "better": 22, "bird": 23, "black": 24, "blow": 25, "blue": 26, "boat": 27, "book": 28, "boy": 29, "brother": 30, "brown": 31, "bug": 32, "bye": 33, "callonphone": 34, "can": 35, "car": 36, "carrot": 37, "cat": 38, "cereal": 39, "chair": 40, "cheek": 41, "child": 42, "chin": 43, "chocolate": 44, "clean": 45, "close": 46, "closet": 47, "cloud": 48, "clown": 49, "cow": 50, "cowboy": 51, "cry": 52, "cut": 53, "cute": 54, "dad": 55, "dance": 56, "dirty": 57, "dog": 58, "doll": 59, "donkey": 60, "down": 61, "drawer": 62, "drink": 63, "drop": 64, "dry": 65, "dryer": 66, "duck": 67, "ear": 68, "elephant": 69, "empty": 70, "every": 71, "eye": 72, "face": 73, "fall": 74, "farm": 75, "fast": 76, "feet": 77, "find": 78, "fine": 79, "finger": 80, "finish": 81, "fireman": 82, "first": 83, "fish": 84, "flag": 85, "flower": 86, "food": 87, "for": 88, "frenchfries": 89, "frog": 90, "garbage": 91, "gift": 92, "giraffe": 93, "girl": 94, "give": 95, "glasswindow": 96, "go": 97, "goose": 98, "grandma": 99, "grandpa": 100, "grass": 101, "green": 102, "gum": 103, "hair": 104, "happy": 105, "hat": 106, "hate": 107, "have": 108, "haveto": 109, "head": 110, "hear": 111, "helicopter": 112, "hello": 113, "hen": 114, "hesheit": 115, "hide": 116, "high": 117, "home": 118, "horse": 119, "hot": 120, "hungry": 121, "icecream": 122, "if": 123, "into": 124, "jacket": 125, "jeans": 126, "jump": 127, "kiss": 128, "kitty": 129, "lamp": 130, "later": 131, "like": 132, "lion": 133, "lips": 134, "listen": 135, "look": 136, "loud": 137, "mad": 138, "make": 139, "man": 140, "many": 141, "milk": 142, "minemy": 143, "mitten": 144, "mom": 145, "moon": 146, "morning": 147, "mouse": 148, "mouth": 149, "nap": 150, "napkin": 151, 
"night": 152, "no": 153, "noisy": 154, "nose": 155, "not": 156, "now": 157, "nuts": 158, "old": 159, "on": 160, "open": 161, "orange": 162, "outside": 163, "owie": 164, "owl": 165, "pajamas": 166, "pen": 167, "pencil": 168, "penny": 169, "person": 170, "pig": 171, "pizza": 172, "please": 173, "police": 174, "pool": 175, "potty": 176, "pretend": 177, "pretty": 178, "puppy": 179, "puzzle": 180, "quiet": 181, "radio": 182, "rain": 183, "read": 184, "red": 185, "refrigerator": 186, "ride": 187, "room": 188, "sad": 189, "same": 190, "say": 191, "scissors": 192, "see": 193, "shhh": 194, "shirt": 195, "shoe": 196, "shower": 197, "sick": 198, "sleep": 199, "sleepy": 200, "smile": 201, "snack": 202, "snow": 203, "stairs": 204, "stay": 205, "sticky": 206, "store": 207, "story": 208, "stuck": 209, "sun": 210, "table": 211, "talk": 212, "taste": 213, "thankyou": 214, "that": 215, "there": 216, "think": 217, "thirsty": 218, "tiger": 219, "time": 220, "tomorrow": 221, "tongue": 222, "tooth": 223, "toothbrush": 224, "touch": 225, "toy": 226, "tree": 227, "uncle": 228, "underwear": 229, "up": 230, "vacuum": 231, "wait": 232, "wake": 233, "water": 234, "wet": 235, "weus": 236, "where": 237, "white": 238, "who": 239, "why": 240, "will": 241, "wolf": 242, "yellow": 243, "yes": 244, "yesterday": 245, "yourself": 246, "yucky": 247, "zebra": 248, "zipper": 249
30
  }
31
 
32
  inv_index_to_class = {v: k for k, v in index_to_class.items()}
33
 
 
34
 
35
 
36
- # Preprocess landmarks
37
- def preprocess_landmarks(hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks):
38
- hand1_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in hand1_landmarks.landmark]
39
- hand2_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in hand2_landmarks.landmark]
40
- pose_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in pose_landmarks.landmark]
41
- lip_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in lip_landmarks]
42
-
43
- combined_landmarks = lip_landmarks + hand1_landmarks + hand2_landmarks + pose_landmarks
44
 
45
- return np.array(combined_landmarks, dtype=np.float32)
46
 
47
- # Function to extract landmarks from the webcam frame
48
- def extract_landmarks(frame):
49
- if frame is None:
50
- raise ValueError("Frame is None. Make sure your webcam is working properly.")
51
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
52
-
53
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
54
- results = hands.process(frame_rgb)
55
- pose_results = pose.process(frame_rgb)
56
- face_results = face_mesh.process(frame_rgb)
57
 
58
- if not results.multi_hand_landmarks or not pose_results.pose_landmarks or not face_results.multi_face_landmarks:
59
- return None
60
-
61
- hand1_landmarks = results.multi_hand_landmarks[0]
62
- if len(results.multi_hand_landmarks) > 1:
63
- hand2_landmarks = results.multi_hand_landmarks[1]
64
- else:
65
- hand2_landmarks = hand1_landmarks
66
-
67
- pose_landmarks = pose_results.pose_landmarks
68
- face_landmarks = face_results.multi_face_landmarks[0]
69
- lip_landmarks = [face_landmarks.landmark[i] for i in LIPS_IDXS0 - START_IDX]
70
-
71
- return hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks
72
 
73
  # Make prediction
74
  def make_prediction(processed_landmarks):
@@ -78,47 +147,48 @@ def make_prediction(processed_landmarks):
78
  index = outputs[0].argmax()
79
  return index_to_class[index]
80
 
81
- # Gradio Interface Function
82
- def predict_with_webcam(frame):
83
-
84
- # Initialize webcam capture (the default camera, usually index 0)
85
- webcam = cv2.VideoCapture(0)
86
 
87
- while True:
88
- # Capture a frame from the webcam
89
- ret, frame = webcam.read()
90
 
91
- if not ret:
92
- print("Failed to capture frame from the webcam.")
93
- break
94
-
95
- landmarks = extract_landmarks(frame)
 
96
  if landmarks is not None:
97
- processed_landmarks = preprocess_landmarks(*landmarks)
98
- prediction = make_prediction(processed_landmarks)
 
 
 
99
  print("Prediction:", prediction)
 
 
 
 
100
 
101
- # Display the frame in a window (optional)
102
- cv2.imshow("Webcam", frame)
 
 
 
 
103
 
104
- # Break the loop when the user presses the 'q' key
 
 
 
 
 
 
 
105
  if cv2.waitKey(1) & 0xFF == ord('q'):
106
  break
107
 
108
- # Release the webcam and close the window
109
- webcam.release()
110
- cv2.destroyAllWindows()
111
-
112
- landmarks = extract_landmarks(frame)
113
- if landmarks is not None:
114
- processed_landmarks = preprocess_landmarks(*landmarks)
115
- prediction = make_prediction(processed_landmarks)
116
- print("Prediction:", prediction)
117
- return prediction
118
- else:
119
- return "Could not detect landmarks. Make sure your webcam is working properly."
120
 
121
-
122
 
123
 
124
  # Define the Gradio interface with the Webcam input and Text output
@@ -133,5 +203,4 @@ webcam_interface = gr.Interface(
133
 
134
  # Launch the Gradio app with the webcam interface and create a public link
135
  if __name__ == "__main__":
136
- webcam_interface.launch()
137
-
 
1
+ import os
2
  import cv2
3
  import gradio as gr
4
  import mediapipe as mp
 
6
  import tensorflow as tf
7
  import tensorflow.lite as tflite
8
 
 
 
 
 
9
 
10
+
11
+ # Tensorflow layer to process data in TFLite
12
+ # Data needs to be processed in the model itself, so we cannot use Python
13
+ class PreprocessLayer(tf.keras.layers.Layer):
14
+ def __init__(self):
15
+ super(PreprocessLayer, self).__init__()
16
+
17
+ def pad_edge(self, t, repeats, side):
18
+ if side == 'LEFT':
19
+ return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
20
+ elif side == 'RIGHT':
21
+ return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)
22
+
23
+ @tf.function(
24
+ input_signature=(tf.TensorSpec(shape=[None, N_ROWS, N_DIMS], dtype=tf.float32),),
25
+ )
26
+ def call(self, data0):
27
+ # Number of Frames in Video
28
+ N_FRAMES0 = tf.shape(data0)[0]
29
+
30
+ # Filter Out Frames With Empty Hand Data
31
+ frames_hands_nansum = tf.experimental.numpy.nanmean(tf.gather(data0, HAND_IDXS0, axis=1), axis=[1, 2])
32
+ non_empty_frames_idxs = tf.where(frames_hands_nansum > 0)
33
+ non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
34
+ data = tf.gather(data0, non_empty_frames_idxs, axis=0)
35
+
36
+ # Cast indices to float32 to be compatible with TensorFlow Lite
37
+ non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)
38
+
39
+ # Number of Frames in Filtered Video
40
+ N_FRAMES = tf.shape(data)[0]
41
+
42
+ # Gather Relevant Landmark Columns
43
+ data = tf.gather(data, LANDMARK_IDXS0, axis=1)
44
+
45
+ # Video fits in INPUT_SIZE
46
+ if N_FRAMES < INPUT_SIZE:
47
+ # Pad With -1 to indicate padding
48
+ non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE - N_FRAMES]], constant_values=-1)
49
+ # Pad Data With Zeros
50
+ data = tf.pad(data, [[0, INPUT_SIZE - N_FRAMES], [0, 0], [0, 0]], constant_values=0)
51
+ # Fill NaN Values With 0
52
+ data = tf.where(tf.math.is_nan(data), 0.0, data)
53
+ return data, non_empty_frames_idxs
54
+ # Video needs to be downsampled to INPUT_SIZE
55
+ else:
56
+ # Repeat
57
+ if N_FRAMES < INPUT_SIZE ** 2:
58
+ repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
59
+ data = tf.repeat(data, repeats=repeats, axis=0)
60
+ non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)
61
+
62
+ # Pad To Multiple Of Input Size
63
+ pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
64
+ if tf.math.mod(len(data), INPUT_SIZE) > 0:
65
+ pool_size += 1
66
+
67
+ if pool_size == 1:
68
+ pad_size = (pool_size * INPUT_SIZE) - len(data)
69
+ else:
70
+ pad_size = (pool_size * INPUT_SIZE) % len(data)
71
+
72
+ # Pad Start/End with Start/End value
73
+ pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
74
+ pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
75
+ if tf.math.mod(pad_size, 2) > 0:
76
+ pad_right += 1
77
+
78
+ # Pad By Concatenating Left/Right Edge Values
79
+ data = self.pad_edge(data, pad_left, 'LEFT')
80
+ data = self.pad_edge(data, pad_right, 'RIGHT')
81
+
82
+ # Pad Non Empty Frame Indices
83
+ non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
84
+ non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')
85
+
86
+ # Reshape to Mean Pool
87
+ data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
88
+ non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])
89
+
90
+ # Mean Pool
91
+ data = tf.experimental.numpy.nanmean(data, axis=1)
92
+ non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)
93
+
94
+ # Fill NaN Values With 0
95
+ data = tf.where(tf.math.is_nan(data), 0.0, data)
96
+
97
+ return data, non_empty_frames_idxs
98
+
99
 
100
  # Get the absolute path to the directory containing app.py
101
  current_dir = os.path.dirname(os.path.abspath(__file__))
 
107
  interpreter = tf.lite.Interpreter(model_path=model_path)
108
  interpreter.allocate_tensors()
109
 
110
+
111
  index_to_class = {
112
  "TV": 0, "after": 1, "airplane": 2, "all": 3, "alligator": 4, "animal": 5, "another": 6, "any": 7, "apple": 8, "arm": 9, "aunt": 10, "awake": 11, "backyard": 12, "bad": 13, "balloon": 14, "bath": 15, "because": 16, "bed": 17, "bedroom": 18, "bee": 19, "before": 20, "beside": 21, "better": 22, "bird": 23, "black": 24, "blow": 25, "blue": 26, "boat": 27, "book": 28, "boy": 29, "brother": 30, "brown": 31, "bug": 32, "bye": 33, "callonphone": 34, "can": 35, "car": 36, "carrot": 37, "cat": 38, "cereal": 39, "chair": 40, "cheek": 41, "child": 42, "chin": 43, "chocolate": 44, "clean": 45, "close": 46, "closet": 47, "cloud": 48, "clown": 49, "cow": 50, "cowboy": 51, "cry": 52, "cut": 53, "cute": 54, "dad": 55, "dance": 56, "dirty": 57, "dog": 58, "doll": 59, "donkey": 60, "down": 61, "drawer": 62, "drink": 63, "drop": 64, "dry": 65, "dryer": 66, "duck": 67, "ear": 68, "elephant": 69, "empty": 70, "every": 71, "eye": 72, "face": 73, "fall": 74, "farm": 75, "fast": 76, "feet": 77, "find": 78, "fine": 79, "finger": 80, "finish": 81, "fireman": 82, "first": 83, "fish": 84, "flag": 85, "flower": 86, "food": 87, "for": 88, "frenchfries": 89, "frog": 90, "garbage": 91, "gift": 92, "giraffe": 93, "girl": 94, "give": 95, "glasswindow": 96, "go": 97, "goose": 98, "grandma": 99, "grandpa": 100, "grass": 101, "green": 102, "gum": 103, "hair": 104, "happy": 105, "hat": 106, "hate": 107, "have": 108, "haveto": 109, "head": 110, "hear": 111, "helicopter": 112, "hello": 113, "hen": 114, "hesheit": 115, "hide": 116, "high": 117, "home": 118, "horse": 119, "hot": 120, "hungry": 121, "icecream": 122, "if": 123, "into": 124, "jacket": 125, "jeans": 126, "jump": 127, "kiss": 128, "kitty": 129, "lamp": 130, "later": 131, "like": 132, "lion": 133, "lips": 134, "listen": 135, "look": 136, "loud": 137, "mad": 138, "make": 139, "man": 140, "many": 141, "milk": 142, "minemy": 143, "mitten": 144, "mom": 145, "moon": 146, "morning": 147, "mouse": 148, "mouth": 149, "nap": 150, "napkin": 151, 
"night": 152, "no": 153, "noisy": 154, "nose": 155, "not": 156, "now": 157, "nuts": 158, "old": 159, "on": 160, "open": 161, "orange": 162, "outside": 163, "owie": 164, "owl": 165, "pajamas": 166, "pen": 167, "pencil": 168, "penny": 169, "person": 170, "pig": 171, "pizza": 172, "please": 173, "police": 174, "pool": 175, "potty": 176, "pretend": 177, "pretty": 178, "puppy": 179, "puzzle": 180, "quiet": 181, "radio": 182, "rain": 183, "read": 184, "red": 185, "refrigerator": 186, "ride": 187, "room": 188, "sad": 189, "same": 190, "say": 191, "scissors": 192, "see": 193, "shhh": 194, "shirt": 195, "shoe": 196, "shower": 197, "sick": 198, "sleep": 199, "sleepy": 200, "smile": 201, "snack": 202, "snow": 203, "stairs": 204, "stay": 205, "sticky": 206, "store": 207, "story": 208, "stuck": 209, "sun": 210, "table": 211, "talk": 212, "taste": 213, "thankyou": 214, "that": 215, "there": 216, "think": 217, "thirsty": 218, "tiger": 219, "time": 220, "tomorrow": 221, "tongue": 222, "tooth": 223, "toothbrush": 224, "touch": 225, "toy": 226, "tree": 227, "uncle": 228, "underwear": 229, "up": 230, "vacuum": 231, "wait": 232, "wake": 233, "water": 234, "wet": 235, "weus": 236, "where": 237, "white": 238, "who": 239, "why": 240, "will": 241, "wolf": 242, "yellow": 243, "yes": 244, "yesterday": 245, "yourself": 246, "yucky": 247, "zebra": 248, "zipper": 249
113
  }
114
 
115
  inv_index_to_class = {v: k for k, v in index_to_class.items()}
116
 
117
+ mp_holistic = mp.solutions.holistic
118
 
119
 
120
+ def mediapipe_detection(image, model):
121
+ # COLOR CONVERSION BGR 2 RGB
122
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
123
+ image.flags.writeable = False # Image is no longer writeable
124
+ results = model.process(image) # Make prediction
125
+ image.flags.writeable = True # Image is now writeable
126
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR CONVERSION RGB 2 BGR
127
+ return image, results
128
 
 
129
 
130
+ def extract_keypoints(results):
131
+ lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten(
132
+ ) if results.left_hand_landmarks else np.zeros(21*3)
133
+ rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten(
134
+ ) if results.right_hand_landmarks else np.zeros(21*3)
135
+ pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten(
136
+ ) if results.pose_landmarks else np.zeros(33*4)
137
+ face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten(
138
+ ) if results.face_landmarks else np.zeros(468*3)
139
+ return np.concatenate([lh, rh, pose, face])
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  # Make prediction
143
  def make_prediction(processed_landmarks):
 
147
  index = outputs[0].argmax()
148
  return index_to_class[index]
149
 
 
 
 
 
 
150
 
151
+ # ... (previous code)
 
 
152
 
153
+ def predict_with_webcam(frame):
154
+ with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
155
+ # Make detections using mediapipe
156
+ image, results = mediapipe_detection(frame, holistic)
157
+ print(results)
158
+ landmarks = extract_keypoints(results)
159
  if landmarks is not None:
160
+ # Initialize PreprocessLayer
161
+ preprocess_layer = PreprocessLayer()
162
+ # Call the PreprocessLayer to preprocess the landmarks
163
+ processed_landmarks, _ = preprocess_layer.call(landmarks)
164
+ prediction = make_prediction(processed_landmarks) # Pass the preprocessed landmarks to make_prediction
165
  print("Prediction:", prediction)
166
+ return prediction
167
+ else:
168
+ return "Could not detect landmarks. Make sure your webcam is working properly."
169
+
170
 
171
+ cap = cv2.VideoCapture(0)
172
+ # Set mediapipe model
173
+ with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
174
+ while cap.isOpened():
175
+ # Read feed
176
+ ret, frame = cap.read()
177
 
178
+ # Make predictions
179
+ prediction = predict_with_webcam(frame)
180
+
181
+ # Display the frame with the prediction
182
+ cv2.putText(frame, prediction, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
183
+ cv2.imshow('Webcam Landmark Prediction', frame)
184
+
185
+ # Exit the loop when 'q' key is pressed
186
  if cv2.waitKey(1) & 0xFF == ord('q'):
187
  break
188
 
189
+ cap.release()
190
+ cv2.destroyAllWindows()
 
 
 
 
 
 
 
 
 
 
191
 
 
192
 
193
 
194
  # Define the Gradio interface with the Webcam input and Text output
 
203
 
204
  # Launch the Gradio app with the webcam interface and create a public link
205
  if __name__ == "__main__":
206
+ webcam_interface.launch()