import os
import time
import cv2
import gradio as gr
import mediapipe as mp
import numpy as np
import pandas as pd  # used below for the parquet export/import
from matplotlib import pyplot as plt

mp_holistic = mp.solutions.holistic

# Import TensorFlow
import tensorflow as tf

# Initialize MediaPipe solutions
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_face_mesh = mp.solutions.face_mesh
hands = mp_hands.Hands()
pose = mp_pose.Pose()
face_mesh = mp_face_mesh.FaceMesh()

# Get the absolute path to the directory containing app.py
current_dir = os.path.dirname(os.path.abspath(__file__))

# Define the filename of the TFLite model
model_filename = "model.tflite"

# Construct the full path to the TFLite model file
model_path = os.path.join(current_dir, model_filename)

# Load the TFLite model using the interpreter
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

# Get input and output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

N_ROWS = 543
N_DIMS = 3
DIM_NAMES = ['x', 'y', 'z']
SEED = 42
NUM_CLASSES = 250
INPUT_SIZE = 64
BATCH_ALL_SIGNS_N = 4
BATCH_SIZE = 256
N_EPOCHS = 100
LR_MAX = 1e-3
N_WARMUP_EPOCHS = 0
WD_RATIO = 0.05
MASK_VAL = 4237

USE_TYPES = ['left_hand', 'pose', 'right_hand']
START_IDX = 468

LIPS_IDXS0 = np.array([
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291,
    146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
])

index_to_class = {
    "TV": 0, "after": 1, "airplane": 2, "all": 3, "alligator": 4, "animal": 5, "another": 6, "any": 7,
    "apple": 8, "arm": 9, "aunt": 10, "awake": 11, "backyard": 12, "bad": 13, "balloon": 14, "bath": 15,
    "because": 16, "bed": 17, "bedroom": 18, "bee": 19, "before": 20, "beside": 21, "better": 22, "bird": 23,
    "black": 24, "blow": 25, "blue": 26, "boat": 27, "book": 28, "boy": 29, "brother": 30, "brown": 31,
    "bug": 32, "bye": 33, "callonphone": 34, "can": 35, "car": 36, "carrot": 37, "cat": 38, "cereal": 39,
    "chair": 40, "cheek": 41, "child": 42, "chin": 43, "chocolate": 44, "clean": 45, "close": 46, "closet": 47,
    "cloud": 48, "clown": 49, "cow": 50, "cowboy": 51, "cry": 52, "cut": 53, "cute": 54, "dad": 55,
    "dance": 56, "dirty": 57, "dog": 58, "doll": 59, "donkey": 60, "down": 61, "drawer": 62, "drink": 63,
    "drop": 64, "dry": 65, "dryer": 66, "duck": 67, "ear": 68, "elephant": 69, "empty": 70, "every": 71,
    "eye": 72, "face": 73, "fall": 74, "farm": 75, "fast": 76, "feet": 77, "find": 78, "fine": 79,
    "finger": 80, "finish": 81, "fireman": 82, "first": 83, "fish": 84, "flag": 85, "flower": 86, "food": 87,
    "for": 88, "frenchfries": 89, "frog": 90, "garbage": 91, "gift": 92, "giraffe": 93, "girl": 94, "give": 95,
    "glasswindow": 96, "go": 97, "goose": 98, "grandma": 99, "grandpa": 100, "grass": 101, "green": 102, "gum": 103,
    "hair": 104, "happy": 105, "hat": 106, "hate": 107, "have": 108, "haveto": 109, "head": 110, "hear": 111,
    "helicopter": 112, "hello": 113, "hen": 114, "hesheit": 115, "hide": 116, "high": 117, "home": 118, "horse": 119,
    "hot": 120, "hungry": 121, "icecream": 122, "if": 123, "into": 124, "jacket": 125, "jeans": 126, "jump": 127,
    "kiss": 128, "kitty": 129, "lamp": 130, "later": 131, "like": 132, "lion": 133, "lips": 134, "listen": 135,
    "look": 136, "loud": 137, "mad": 138, "make": 139, "man": 140, "many": 141, "milk": 142, "minemy": 143,
    "mitten": 144, "mom": 145, "moon": 146, "morning": 147, "mouse": 148, "mouth": 149, "nap": 150, "napkin": 151,
    "night": 152, "no": 153, "noisy": 154, "nose": 155, "not": 156, "now": 157, "nuts": 158, "old": 159,
    "on": 160, "open": 161, "orange": 162, "outside": 163, "owie": 164, "owl": 165, "pajamas": 166, "pen": 167,
    "pencil": 168, "penny": 169, "person": 170, "pig": 171, "pizza": 172, "please": 173, "police": 174, "pool": 175,
    "potty": 176, "pretend": 177, "pretty": 178, "puppy": 179, "puzzle": 180, "quiet": 181, "radio": 182, "rain": 183,
    "read": 184, "red": 185, "refrigerator": 186, "ride": 187, "room": 188, "sad": 189, "same": 190, "say": 191,
    "scissors": 192, "see": 193, "shhh": 194, "shirt": 195, "shoe": 196, "shower": 197, "sick": 198, "sleep": 199,
    "sleepy": 200, "smile": 201, "snack": 202, "snow": 203, "stairs": 204, "stay": 205, "sticky": 206, "store": 207,
    "story": 208, "stuck": 209, "sun": 210, "table": 211, "talk": 212, "taste": 213, "thankyou": 214, "that": 215,
    "there": 216, "think": 217, "thirsty": 218, "tiger": 219, "time": 220, "tomorrow": 221, "tongue": 222, "tooth": 223,
    "toothbrush": 224, "touch": 225, "toy": 226, "tree": 227, "uncle": 228, "underwear": 229, "up": 230, "vacuum": 231,
    "wait": 232, "wake": 233, "water": 234, "wet": 235, "weus": 236, "where": 237, "white": 238, "who": 239,
    "why": 240, "will": 241, "wolf": 242, "yellow": 243, "yes": 244, "yesterday": 245, "yourself": 246, "yucky": 247,
    "zebra": 248, "zipper": 249,
}
inv_index_to_class = {v: k for k, v in index_to_class.items()}

# Landmark indices in original data
LEFT_HAND_IDXS0 = np.arange(468, 489)
RIGHT_HAND_IDXS0 = np.arange(522, 543)
LEFT_POSE_IDXS0 = np.array([502, 504, 506, 508, 510])
RIGHT_POSE_IDXS0 = np.array([503, 505, 507, 509, 511])
LANDMARK_IDXS_LEFT_DOMINANT0 = np.concatenate((LIPS_IDXS0, LEFT_HAND_IDXS0, LEFT_POSE_IDXS0))
LANDMARK_IDXS_RIGHT_DOMINANT0 = np.concatenate((LIPS_IDXS0, RIGHT_HAND_IDXS0, RIGHT_POSE_IDXS0))
HAND_IDXS0 = np.concatenate((LEFT_HAND_IDXS0, RIGHT_HAND_IDXS0), axis=0)
N_COLS = LANDMARK_IDXS_LEFT_DOMINANT0.size

# Landmark indices in processed data
LIPS_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LIPS_IDXS0)).squeeze()
LEFT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_HAND_IDXS0)).squeeze()
RIGHT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, RIGHT_HAND_IDXS0)).squeeze()
HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, HAND_IDXS0)).squeeze()
POSE_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_POSE_IDXS0)).squeeze()
print(f'# HAND_IDXS: {len(HAND_IDXS)}, N_COLS: {N_COLS}')

LIPS_START = 0
LEFT_HAND_START = LIPS_IDXS.size
RIGHT_HAND_START = LEFT_HAND_START + LEFT_HAND_IDXS.size
POSE_START = RIGHT_HAND_START + RIGHT_HAND_IDXS.size
print(f'LIPS_START: {LIPS_START}, LEFT_HAND_START: {LEFT_HAND_START}, '
      f'RIGHT_HAND_START: {RIGHT_HAND_START}, POSE_START: {POSE_START}')


def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                   # Image is no longer writeable
    results = model.process(image)                  # Make prediction
    image.flags.writeable = True                    # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # COLOR CONVERSION RGB 2 BGR
    return image, results


def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() \
        if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() \
        if results.right_hand_landmarks else np.zeros(21 * 3)
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() \
        if results.pose_landmarks else np.zeros(33 * 4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() \
        if results.face_landmarks else np.zeros(468 * 3)
    return np.concatenate([lh, rh, pose, face])
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)

# Set mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        # Read feed
        ret, frame = cap.read()
        if not ret:
            break

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)

        # Press 'q' to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break


# Function to make predictions using the TensorFlow Lite model
def make_prediction(processed_landmarks):
    inputs = np.array(processed_landmarks, dtype=np.float32)
    # Set the input tensor for the TFLite model
    interpreter.set_tensor(input_details[0]['index'], inputs)
    # Invoke the TFLite interpreter to perform inference
    interpreter.invoke()
    # Get the output tensor of the TFLite model
    output_data = interpreter.get_tensor(output_details[0]['index'])
    # Find the index of the predicted class
    index = np.argmax(output_data)
    # Map the index to the corresponding class label using the inv_index_to_class dictionary
    prediction = inv_index_to_class[index]
    return prediction
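# --- Sanity-check sketch (optional) ------------------------------------------------------------
# make_prediction() assumes the exported TFLite graph has a single input whose shape matches the
# preprocessed landmark tensor. Printing the tensor metadata via the standard TFLite API makes a
# mismatch visible before interpreter.set_tensor() raises a ValueError.
print('TFLite input :', input_details[0]['shape'], input_details[0]['dtype'])
print('TFLite output:', output_details[0]['shape'])
# -----------------------------------------------------------------------------------------------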
class PreprocessLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(PreprocessLayer, self).__init__()
        normalisation_correction = tf.constant(
            [
                # Add 0.50 to the left hand (original right hand) and subtract 0.50 from the right hand (original left hand)
                [0] * len(LIPS_IDXS) + [0.50] * len(LEFT_HAND_IDXS) + [0.50] * len(POSE_IDXS),
                # Y coordinates stay intact
                [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
                # Z coordinates stay intact
                [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
            ],
            dtype=tf.float32,
        )
        self.normalisation_correction = tf.transpose(normalisation_correction, [1, 0])

    def pad_edge(self, t, repeats, side):
        if side == 'LEFT':
            return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
        elif side == 'RIGHT':
            return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None, N_ROWS, N_DIMS], dtype=tf.float32),),
    )
    def call(self, data0):
        # Number of frames in the video
        N_FRAMES0 = tf.shape(data0)[0]

        # Find the dominant hand by counting non-NaN hand coordinates
        left_hand_sum = tf.math.reduce_sum(
            tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1))
        right_hand_sum = tf.math.reduce_sum(
            tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1))
        left_dominant = left_hand_sum >= right_hand_sum

        # Count non-NaN hand values in each frame for the dominant hand
        if left_dominant:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1),
                axis=[1, 2],
            )
        else:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1),
                axis=[1, 2],
            )

        # Find frame indices with coordinates of the dominant hand
        non_empty_frames_idxs = tf.where(frames_hands_non_nan_sum > 0)
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
        # Filter frames
        data = tf.gather(data0, non_empty_frames_idxs, axis=0)

        # Cast indices to float32 to be compatible with TensorFlow Lite
        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)
        # Normalize so that indices start at 0
        non_empty_frames_idxs -= tf.reduce_min(non_empty_frames_idxs)

        # Number of frames in the filtered video
        N_FRAMES = tf.shape(data)[0]

        # Gather relevant landmark columns
        if left_dominant:
            data = tf.gather(data, LANDMARK_IDXS_LEFT_DOMINANT0, axis=1)
        else:
            data = tf.gather(data, LANDMARK_IDXS_RIGHT_DOMINANT0, axis=1)
            # Mirror right-dominant samples so that all samples look left-dominant
            data = (
                self.normalisation_correction
                + (
                    (data - self.normalisation_correction)
                    * tf.where(self.normalisation_correction != 0, -1.0, 1.0)
                )
            )

        # Video fits in INPUT_SIZE
        if N_FRAMES < INPUT_SIZE:
            # Pad indices with -1 to indicate padding
            non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE - N_FRAMES]], constant_values=-1)
            # Pad data with zeros
            data = tf.pad(data, [[0, INPUT_SIZE - N_FRAMES], [0, 0], [0, 0]], constant_values=0)
            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs
        # Video needs to be downsampled to INPUT_SIZE
        else:
            # Repeat frames so that there is enough data to pool
            if N_FRAMES < INPUT_SIZE ** 2:
                repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
                data = tf.repeat(data, repeats=repeats, axis=0)
                non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)

            # Pad to a multiple of INPUT_SIZE
            pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
            if tf.math.mod(len(data), INPUT_SIZE) > 0:
                pool_size += 1

            if pool_size == 1:
                pad_size = (pool_size * INPUT_SIZE) - len(data)
            else:
                pad_size = (pool_size * INPUT_SIZE) % len(data)

            # Pad start/end with the start/end value
            pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            if tf.math.mod(pad_size, 2) > 0:
                pad_right += 1

            # Pad by concatenating left/right edge values
            data = self.pad_edge(data, pad_left, 'LEFT')
            data = self.pad_edge(data, pad_right, 'RIGHT')

            # Pad non-empty frame indices
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')

            # Reshape for mean pooling
            data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
            non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])

            # Mean pool
            data = tf.experimental.numpy.nanmean(data, axis=1)
            non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)

            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs


preprocess_layer = PreprocessLayer()
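# --- Smoke-test sketch for PreprocessLayer (illustrative only) ---------------------------------
# Random frames (an assumption, used only to exercise the layer) should always come back with the
# fixed shapes (INPUT_SIZE, N_COLS, N_DIMS) = (64, 66, 3) for the data and (64,) for the frame
# indices, because short videos are padded up to INPUT_SIZE.
_dummy = np.random.rand(10, N_ROWS, N_DIMS).astype(np.float32)
_out_data, _out_idxs = preprocess_layer(_dummy)
print('PreprocessLayer output shapes:', _out_data.shape, _out_idxs.shape)
# -----------------------------------------------------------------------------------------------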
def translate_sign_language(image):
    # Convert the frame to RGB (Mediapipe expects RGB images)
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands_tracker:
        # Process the frame with Mediapipe Hands
        hands_results = hands_tracker.process(rgb_frame)

    with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose_tracker:
        # Process the frame with Mediapipe Pose
        pose_results = pose_tracker.process(rgb_frame)

    # Extract keypoints from the results
    hand_pose_keypoints = extract_keypoints(hands_results)
    pose_keypoints = extract_keypoints(pose_results)

    # Prepare the input data for the TFLite model
    left_hand_landmarks = hand_pose_keypoints[:63].reshape(1, -1, 3)
    right_hand_landmarks = hand_pose_keypoints[63:126].reshape(1, -1, 3)
    pose_landmarks = pose_keypoints[126:].reshape(1, -1, 4)

    # Call the PreprocessLayer to preprocess the hand and pose landmark data
    preprocessed_left_hand, _ = preprocess_layer(left_hand_landmarks)
    preprocessed_right_hand, _ = preprocess_layer(right_hand_landmarks)
    preprocessed_pose, _ = preprocess_layer(pose_landmarks)

    # Prepare the input data for the TFLite model
    input_data = [preprocessed_left_hand, preprocessed_right_hand, preprocessed_pose]

    # Perform inference using the loaded sign language model
    interpreter.set_tensor(interpreter.get_input_details()[0]['index'], input_data[0])
    interpreter.set_tensor(interpreter.get_input_details()[1]['index'], input_data[1])
    interpreter.set_tensor(interpreter.get_input_details()[2]['index'], input_data[2])
    interpreter.invoke()
    output = interpreter.get_tensor(interpreter.get_output_details()[0]['index'])

    # Make prediction using the processed landmarks
    translated_text = make_prediction(output)

    # Return the translated text
    return translated_text


gr_interface = gr.Interface(
    fn=translate_sign_language,
    inputs="webcam",  # Input from webcam
    outputs="text",   # Output as text
    # capture_session=True,  # To properly release the webcam after running the interface
    live=True,        # Show live webcam feed
    title="Sign Language Translation",
    description="Translate sign language to text using TensorFlow Lite and Mediapipe.",
)
gr_interface.launch(share=True)

cap.release()
cv2.destroyAllWindows()

video_path = './Test/HAPPY.mp4'
cap = cv2.VideoCapture(video_path)

mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose

data_list = []
ROWS_PER_FRAME = 543  # Constant number of landmarks per frame

with mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1) as face_mesh, \
        mp_hands.Hands(static_image_mode=False, max_num_hands=2) as hands, \
        mp_pose.Pose(static_image_mode=False) as pose:
    frame_number = 0
    while cap.isOpened():
        ret, image = cap.read()
        if not ret:
            break

        # Convert the BGR image to RGB for Mediapipe
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Process face landmarks
        results_face = face_mesh.process(image_rgb)
        if results_face.multi_face_landmarks:
            face_landmarks = results_face.multi_face_landmarks[0]
            for idx, landmark in enumerate(face_landmarks.landmark):
                data_list.append([frame_number, f"{frame_number}-face-{idx}", "face", idx,
                                  landmark.x, landmark.y, landmark.z])

        # Process hand landmarks
        results_hands = hands.process(image_rgb)
        if results_hands.multi_hand_landmarks:
            for hand_landmarks in results_hands.multi_hand_landmarks:
                for idx, landmark in enumerate(hand_landmarks.landmark):
                    data_list.append([frame_number, f"{frame_number}-right_hand-{idx}", "right-hand", idx,
                                      landmark.x, landmark.y, landmark.z])
                mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Process pose landmarks
        results_pose = pose.process(image_rgb)
        if results_pose.pose_landmarks:
            pose_landmarks = results_pose.pose_landmarks.landmark
            for idx, landmark in enumerate(pose_landmarks):
                data_list.append([frame_number, f"{frame_number}-pose-{idx}", "pose", idx,
                                  landmark.x, landmark.y, landmark.z])

        # Pad the landmarks with NaN values if the number of landmarks is less than ROWS_PER_FRAME
        while len(data_list) < (frame_number + 1) * ROWS_PER_FRAME:
            data_list.append([frame_number, f"{frame_number}-right_hand-{len(data_list) % ROWS_PER_FRAME}",
                              "right-hand", len(data_list) % ROWS_PER_FRAME, np.nan, np.nan, np.nan])

        # Draw the landmarks on the frame (optional)
        if results_face.multi_face_landmarks:
            mp_drawing.draw_landmarks(image, face_landmarks, mp_face_mesh.FACEMESH_CONTOURS)
        if results_pose.pose_landmarks:
            mp_drawing.draw_landmarks(image, results_pose.pose_landmarks, mp_pose.POSE_CONNECTIONS)

        # Display the frame (optional)
        cv2.imshow('MediaPipe', image)

        frame_number += 1

        # Press 'q' to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

df = pd.DataFrame(data_list, columns=["frame", "row_id", "type", "landmark_index", "x", "y", "z"])
df.to_parquet("extracted_features.parquet", index=False)
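# --- Consistency-check sketch (optional) -------------------------------------------------------
# load_relevant_data_subset() below reshapes the parquet into (n_frames, 543, 3), so every frame
# written above must contribute exactly ROWS_PER_FRAME rows; this only re-reads the file that was
# just written.
_check = pd.read_parquet("extracted_features.parquet")
assert len(_check) % ROWS_PER_FRAME == 0, f"unexpected row count: {len(_check)}"
# -----------------------------------------------------------------------------------------------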
# test_data = pd.read_parquet('./1006440534.parquet')
# test_data_kaggle = pd.read_parquet('1001373962.parquet')
# test_data_kaggle2 = pd.read_parquet('./100015657.parquet')
# test_data_kaggle3 = pd.read_parquet('./1003700302.parquet')
# test_data_kaggle4 = pd.read_parquet('./1007127288.parquet')
test_data_my_own = pd.read_parquet('extracted_features.parquet')
test_data_my_own['frame'] = test_data_my_own['frame'].astype('int16')
test_data_my_own['landmark_index'] = test_data_my_own['landmark_index'].astype('int16')


def load_relevant_data_subset(pq_path, ROWS_PER_FRAME=543):
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    n_frames = int(len(data) / ROWS_PER_FRAME)
    print(f"Data: {len(data)} Number of Frames: {n_frames}")
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)


# demo_raw_data = load_relevant_data_subset('./1006440534.parquet')
demo_raw_data = load_relevant_data_subset('./extracted_features.parquet')
# demo_raw_data = load_relevant_data_subset('./1003700302.parquet', test_data_kaggle3['frame'].nunique())
# demo_raw_data = load_relevant_data_subset('./extracted_features.parquet')
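# --- Shape note (sketch) -----------------------------------------------------------------------
# demo_raw_data should be a float32 array of shape (n_frames, 543, 3), which is the layout that
# both the PreprocessLayer TensorSpec and the signature runner below expect.
print('demo_raw_data:', demo_raw_data.shape, demo_raw_data.dtype)
# -----------------------------------------------------------------------------------------------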
ORD2SIGN = {
    206: 'sticky', 20: 'before', 178: 'pretty', 114: 'hen', 221: 'tomorrow', 230: 'up', 25: 'blow', 236: 'weus',
    184: 'read', 191: 'say', 248: 'zebra', 189: 'sad', 62: 'drawer', 5: 'animal', 167: 'pen', 60: 'donkey',
    41: 'cheek', 51: 'cowboy', 192: 'scissors', 181: 'quiet', 63: 'drink', 94: 'girl', 200: 'sleepy', 249: 'zipper',
    171: 'pig', 13: 'bad', 9: 'arm', 61: 'down', 123: 'if', 240: 'why', 166: 'pajamas', 203: 'snow',
    137: 'loud', 195: 'shirt', 31: 'brown', 146: 'moon', 23: 'bird', 210: 'sun', 76: 'fast', 1: 'after',
    54: 'cute', 77: 'feet', 4: 'alligator', 87: 'food', 113: 'hello', 93: 'giraffe', 180: 'puzzle', 211: 'table',
    132: 'like', 153: 'no', 122: 'icecream', 67: 'duck', 69: 'elephant', 141: 'many', 18: 'bedroom', 205: 'stay',
    74: 'fall', 246: 'yourself', 183: 'rain', 135: 'listen', 44: 'chocolate', 124: 'into', 11: 'awake', 40: 'chair',
    7: 'any', 155: 'nose', 118: 'home', 161: 'open', 58: 'dog', 50: 'cow', 241: 'will', 149: 'mouth',
    177: 'pretend', 172: 'pizza', 75: 'farm', 163: 'outside', 234: 'water', 81: 'finish', 159: 'old', 121: 'hungry',
    112: 'helicopter', 130: 'lamp', 222: 'tongue', 194: 'shhh', 6: 'another', 103: 'gum', 214: 'thankyou', 128: 'kiss',
    101: 'grass', 64: 'drop', 157: 'now', 233: 'wake', 116: 'hide', 201: 'smile', 226: 'toy', 216: 'there',
    147: 'morning', 10: 'aunt', 102: 'green', 36: 'car', 213: 'taste', 39: 'cereal', 207: 'store', 66: 'dryer',
    162: 'orange', 218: 'thirsty', 83: 'first', 45: 'clean', 3: 'all', 198: 'sick', 129: 'kitty', 96: 'glasswindow',
    202: 'snack', 150: 'nap', 53: 'cut', 73: 'face', 99: 'grandma', 209: 'stuck', 91: 'garbage', 115: 'hesheit',
    95: 'give', 104: 'hair', 125: 'jacket', 165: 'owl', 82: 'fireman', 227: 'tree', 16: 'because', 17: 'bed',
    30: 'brother', 143: 'minemy', 127: 'jump', 245: 'yesterday', 145: 'mom', 111: 'hear', 174: 'police', 223: 'tooth',
    212: 'talk', 224: 'toothbrush', 164: 'owie', 47: 'closet', 169: 'penny', 24: 'black', 85: 'flag', 238: 'white',
    134: 'lips', 231: 'vacuum', 8: 'apple', 105: 'happy', 151: 'napkin', 92: 'gift', 70: 'empty', 46: 'close',
    52: 'cry', 138: 'mad', 49: 'clown', 204: 'stairs', 42: 'child', 173: 'please', 65: 'dry', 72: 'eye',
    235: 'wet', 32: 'bug', 109: 'haveto', 228: 'uncle', 199: 'sleep', 176: 'potty', 29: 'boy', 136: 'look',
    107: 'hate', 71: 'every', 12: 'backyard', 22: 'better', 84: 'fish', 56: 'dance', 139: 'make', 98: 'goose',
    38: 'cat', 232: 'wait', 14: 'balloon', 247: 'yucky', 2: 'airplane', 88: 'for', 126: 'jeans', 154: 'noisy',
    142: 'milk', 239: 'who', 90: 'frog', 35: 'can', 215: 'that', 117: 'high', 244: 'yes', 196: 'shoe',
    108: 'have', 48: 'cloud', 170: 'person', 187: 'ride', 34: 'callonphone', 37: 'carrot', 100: 'grandpa', 120: 'hot',
    131: 'later', 229: 'underwear', 0: 'TV', 140: 'man', 217: 'think', 220: 'time', 80: 'finger', 86: 'flower',
    15: 'bath', 28: 'book', 193: 'see', 208: 'story', 26: 'blue', 78: 'find', 148: 'mouse', 79: 'fine',
    179: 'puppy', 55: 'dad', 21: 'beside', 225: 'touch', 89: 'frenchfries', 188: 'room', 19: 'bee', 27: 'boat',
    156: 'not', 59: 'doll', 97: 'go', 190: 'same', 144: 'mitten', 160: 'on', 57: 'dirty', 182: 'radio',
    197: 'shower', 186: 'refrigerator', 158: 'nuts', 175: 'pool', 242: 'wolf', 243: 'yellow', 110: 'head', 237: 'where',
    33: 'bye', 133: 'lion', 152: 'night', 106: 'hat', 43: 'chin', 68: 'ear', 168: 'pencil', 119: 'horse',
    219: 'tiger', 185: 'red',
}

import tflite_runtime.interpreter as tflite

interpreter = tflite.Interpreter("./model.tflite")
found_signatures = list(interpreter.get_signature_list().keys())
prediction_fn = interpreter.get_signature_runner("serving_default")

output = prediction_fn(inputs=demo_raw_data)
sign = output['outputs'].argmax()
print("PRED : ", ORD2SIGN.get(sign), f'[{sign}]')
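# --- Optional diagnostic sketch: top-5 candidate signs -----------------------------------------
# Assumption: output['outputs'] holds one score per class (raw logits or probabilities); the
# ranking is the same either way, so no softmax is applied here.
_scores = np.asarray(output['outputs']).reshape(-1)
for _i in _scores.argsort()[-5:][::-1]:
    print(f"{ORD2SIGN.get(int(_i))}: {_scores[_i]:.4f}")
# -----------------------------------------------------------------------------------------------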