import os
import time
import cv2
import gradio as gr
import mediapipe as mp
import numpy as np
import pandas as pd  # used below for the parquet export/import
from matplotlib import pyplot as plt

mp_holistic = mp.solutions.holistic

# Import TensorFlow
import tensorflow as tf

# Initialize MediaPipe solutions
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_face_mesh = mp.solutions.face_mesh
hands = mp_hands.Hands()
pose = mp_pose.Pose()
face_mesh = mp_face_mesh.FaceMesh()

# Get the absolute path to the directory containing app.py
current_dir = os.path.dirname(os.path.abspath(__file__))

# Define the filename of the TFLite model
model_filename = "model.tflite"

# Construct the full path to the TFLite model file
model_path = os.path.join(current_dir, model_filename)

# Load the TFLite model using the interpreter
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()

# Get input and output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

N_ROWS = 543
N_DIMS = 3
DIM_NAMES = ['x', 'y', 'z']
SEED = 42
NUM_CLASSES = 250
INPUT_SIZE = 64
BATCH_ALL_SIGNS_N = 4
BATCH_SIZE = 256
N_EPOCHS = 100
LR_MAX = 1e-3
N_WARMUP_EPOCHS = 0
WD_RATIO = 0.05
MASK_VAL = 4237

USE_TYPES = ['left_hand', 'pose', 'right_hand']
START_IDX = 468

LIPS_IDXS0 = np.array([
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291,
    146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
])

index_to_class = {
    "TV": 0, "after": 1, "airplane": 2, "all": 3, "alligator": 4, "animal": 5, "another": 6, "any": 7,
    "apple": 8, "arm": 9, "aunt": 10, "awake": 11, "backyard": 12, "bad": 13, "balloon": 14, "bath": 15,
    "because": 16, "bed": 17, "bedroom": 18, "bee": 19, "before": 20, "beside": 21, "better": 22, "bird": 23,
    "black": 24, "blow": 25, "blue": 26, "boat": 27, "book": 28, "boy": 29, "brother": 30, "brown": 31,
    "bug": 32, "bye": 33, "callonphone": 34, "can": 35, "car": 36, "carrot": 37, "cat": 38, "cereal": 39,
    "chair": 40, "cheek": 41, "child": 42, "chin": 43, "chocolate": 44, "clean": 45, "close": 46, "closet": 47,
    "cloud": 48, "clown": 49, "cow": 50, "cowboy": 51, "cry": 52, "cut": 53, "cute": 54, "dad": 55,
    "dance": 56, "dirty": 57, "dog": 58, "doll": 59, "donkey": 60, "down": 61, "drawer": 62, "drink": 63,
    "drop": 64, "dry": 65, "dryer": 66, "duck": 67, "ear": 68, "elephant": 69, "empty": 70, "every": 71,
    "eye": 72, "face": 73, "fall": 74, "farm": 75, "fast": 76, "feet": 77, "find": 78, "fine": 79,
    "finger": 80, "finish": 81, "fireman": 82, "first": 83, "fish": 84, "flag": 85, "flower": 86, "food": 87,
    "for": 88, "frenchfries": 89, "frog": 90, "garbage": 91, "gift": 92, "giraffe": 93, "girl": 94, "give": 95,
    "glasswindow": 96, "go": 97, "goose": 98, "grandma": 99, "grandpa": 100, "grass": 101, "green": 102, "gum": 103,
    "hair": 104, "happy": 105, "hat": 106, "hate": 107, "have": 108, "haveto": 109, "head": 110, "hear": 111,
    "helicopter": 112, "hello": 113, "hen": 114, "hesheit": 115, "hide": 116, "high": 117, "home": 118, "horse": 119,
    "hot": 120, "hungry": 121, "icecream": 122, "if": 123, "into": 124, "jacket": 125, "jeans": 126, "jump": 127,
    "kiss": 128, "kitty": 129, "lamp": 130, "later": 131, "like": 132, "lion": 133, "lips": 134, "listen": 135,
    "look": 136, "loud": 137, "mad": 138, "make": 139, "man": 140, "many": 141, "milk": 142, "minemy": 143,
    "mitten": 144, "mom": 145, "moon": 146, "morning": 147, "mouse": 148, "mouth": 149, "nap": 150, "napkin": 151,
    "night": 152, "no": 153, "noisy": 154, "nose": 155, "not": 156, "now": 157, "nuts": 158, "old": 159,
    "on": 160, "open": 161, "orange": 162, "outside": 163, "owie": 164, "owl": 165, "pajamas": 166, "pen": 167,
    "pencil": 168, "penny": 169, "person": 170, "pig": 171, "pizza": 172, "please": 173, "police": 174, "pool": 175,
    "potty": 176, "pretend": 177, "pretty": 178, "puppy": 179, "puzzle": 180, "quiet": 181, "radio": 182, "rain": 183,
    "read": 184, "red": 185, "refrigerator": 186, "ride": 187, "room": 188, "sad": 189, "same": 190, "say": 191,
    "scissors": 192, "see": 193, "shhh": 194, "shirt": 195, "shoe": 196, "shower": 197, "sick": 198, "sleep": 199,
    "sleepy": 200, "smile": 201, "snack": 202, "snow": 203, "stairs": 204, "stay": 205, "sticky": 206, "store": 207,
    "story": 208, "stuck": 209, "sun": 210, "table": 211, "talk": 212, "taste": 213, "thankyou": 214, "that": 215,
    "there": 216, "think": 217, "thirsty": 218, "tiger": 219, "time": 220, "tomorrow": 221, "tongue": 222, "tooth": 223,
    "toothbrush": 224, "touch": 225, "toy": 226, "tree": 227, "uncle": 228, "underwear": 229, "up": 230, "vacuum": 231,
    "wait": 232, "wake": 233, "water": 234, "wet": 235, "weus": 236, "where": 237, "white": 238, "who": 239,
    "why": 240, "will": 241, "wolf": 242, "yellow": 243, "yes": 244, "yesterday": 245, "yourself": 246, "yucky": 247,
    "zebra": 248, "zipper": 249,
}
inv_index_to_class = {v: k for k, v in index_to_class.items()}

# Landmark indices in original data
LEFT_HAND_IDXS0 = np.arange(468, 489)
RIGHT_HAND_IDXS0 = np.arange(522, 543)
LEFT_POSE_IDXS0 = np.array([502, 504, 506, 508, 510])
RIGHT_POSE_IDXS0 = np.array([503, 505, 507, 509, 511])
LANDMARK_IDXS_LEFT_DOMINANT0 = np.concatenate((LIPS_IDXS0, LEFT_HAND_IDXS0, LEFT_POSE_IDXS0))
LANDMARK_IDXS_RIGHT_DOMINANT0 = np.concatenate((LIPS_IDXS0, RIGHT_HAND_IDXS0, RIGHT_POSE_IDXS0))
HAND_IDXS0 = np.concatenate((LEFT_HAND_IDXS0, RIGHT_HAND_IDXS0), axis=0)
N_COLS = LANDMARK_IDXS_LEFT_DOMINANT0.size

# Landmark indices in processed data
LIPS_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LIPS_IDXS0)).squeeze()
LEFT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_HAND_IDXS0)).squeeze()
RIGHT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, RIGHT_HAND_IDXS0)).squeeze()
HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, HAND_IDXS0)).squeeze()
POSE_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_POSE_IDXS0)).squeeze()
print(f'# HAND_IDXS: {len(HAND_IDXS)}, N_COLS: {N_COLS}')

LIPS_START = 0
LEFT_HAND_START = LIPS_IDXS.size
RIGHT_HAND_START = LEFT_HAND_START + LEFT_HAND_IDXS.size
POSE_START = RIGHT_HAND_START + RIGHT_HAND_IDXS.size
print(f'LIPS_START: {LIPS_START}, LEFT_HAND_START: {LEFT_HAND_START}, '
      f'RIGHT_HAND_START: {RIGHT_HAND_START}, POSE_START: {POSE_START}')


def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                   # Image is no longer writeable
    results = model.process(image)                  # Make prediction
    image.flags.writeable = True                    # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # COLOR CONVERSION RGB 2 BGR
    return image, results


def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() \
        if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() \
        if results.right_hand_landmarks else np.zeros(21 * 3)
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() \
        if results.pose_landmarks else np.zeros(33 * 4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() \
        if results.face_landmarks else np.zeros(468 * 3)
    return np.concatenate([lh, rh, pose, face])
cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)

# Set mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        # Read feed
        ret, frame = cap.read()
        if not ret:
            break

        # Make detections
        image, results = mediapipe_detection(frame, holistic)
        print(results)

        # Press 'q' to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break


# Function to make predictions using the TensorFlow Lite model
def make_prediction(processed_landmarks):
    inputs = np.array(processed_landmarks, dtype=np.float32)
    # Set the input tensor for the TFLite model
    interpreter.set_tensor(input_details[0]['index'], inputs)
    # Invoke the TFLite interpreter to perform inference
    interpreter.invoke()
    # Get the output tensor of the TFLite model
    output_data = interpreter.get_tensor(output_details[0]['index'])
    # Find the index of the predicted class
    index = np.argmax(output_data)
    # Map the index to the corresponding class label using the inv_index_to_class dictionary
    prediction = inv_index_to_class[index]
    return prediction
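# --- Sanity-check sketch (optional) ------------------------------------------------------------
# make_prediction() assumes the exported TFLite graph has a single input whose shape matches the
# preprocessed landmark tensor. Printing the tensor metadata via the standard TFLite API makes a
# mismatch visible before interpreter.set_tensor() raises a ValueError.
print('TFLite input :', input_details[0]['shape'], input_details[0]['dtype'])
print('TFLite output:', output_details[0]['shape'])
# -----------------------------------------------------------------------------------------------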
class PreprocessLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(PreprocessLayer, self).__init__()
        normalisation_correction = tf.constant(
            [
                # Add 0.50 to the left hand (original right hand) and subtract 0.50 from the right hand (original left hand)
                [0] * len(LIPS_IDXS) + [0.50] * len(LEFT_HAND_IDXS) + [0.50] * len(POSE_IDXS),
                # Y coordinates stay intact
                [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
                # Z coordinates stay intact
                [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
            ],
            dtype=tf.float32,
        )
        self.normalisation_correction = tf.transpose(normalisation_correction, [1, 0])

    def pad_edge(self, t, repeats, side):
        if side == 'LEFT':
            return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
        elif side == 'RIGHT':
            return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None, N_ROWS, N_DIMS], dtype=tf.float32),),
    )
    def call(self, data0):
        # Number of frames in the video
        N_FRAMES0 = tf.shape(data0)[0]

        # Find the dominant hand by counting non-NaN hand coordinates
        left_hand_sum = tf.math.reduce_sum(
            tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1))
        right_hand_sum = tf.math.reduce_sum(
            tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1))
        left_dominant = left_hand_sum >= right_hand_sum

        # Count non-NaN hand values in each frame for the dominant hand
        if left_dominant:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1),
                axis=[1, 2],
            )
        else:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1),
                axis=[1, 2],
            )

        # Find frame indices with coordinates of the dominant hand
        non_empty_frames_idxs = tf.where(frames_hands_non_nan_sum > 0)
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)
        # Filter frames
        data = tf.gather(data0, non_empty_frames_idxs, axis=0)

        # Cast indices to float32 to be compatible with TensorFlow Lite
        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)
        # Normalize so that indices start at 0
        non_empty_frames_idxs -= tf.reduce_min(non_empty_frames_idxs)

        # Number of frames in the filtered video
        N_FRAMES = tf.shape(data)[0]

        # Gather relevant landmark columns
        if left_dominant:
            data = tf.gather(data, LANDMARK_IDXS_LEFT_DOMINANT0, axis=1)
        else:
            data = tf.gather(data, LANDMARK_IDXS_RIGHT_DOMINANT0, axis=1)
            # Mirror right-dominant samples so that all samples look left-dominant
            data = (
                self.normalisation_correction
                + (
                    (data - self.normalisation_correction)
                    * tf.where(self.normalisation_correction != 0, -1.0, 1.0)
                )
            )

        # Video fits in INPUT_SIZE
        if N_FRAMES < INPUT_SIZE:
            # Pad indices with -1 to indicate padding
            non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE - N_FRAMES]], constant_values=-1)
            # Pad data with zeros
            data = tf.pad(data, [[0, INPUT_SIZE - N_FRAMES], [0, 0], [0, 0]], constant_values=0)
            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs
        # Video needs to be downsampled to INPUT_SIZE
        else:
            # Repeat frames so that there is enough data to pool
            if N_FRAMES < INPUT_SIZE ** 2:
                repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
                data = tf.repeat(data, repeats=repeats, axis=0)
                non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)

            # Pad to a multiple of INPUT_SIZE
            pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
            if tf.math.mod(len(data), INPUT_SIZE) > 0:
                pool_size += 1

            if pool_size == 1:
                pad_size = (pool_size * INPUT_SIZE) - len(data)
            else:
                pad_size = (pool_size * INPUT_SIZE) % len(data)

            # Pad start/end with the start/end value
            pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            if tf.math.mod(pad_size, 2) > 0:
                pad_right += 1

            # Pad by concatenating left/right edge values
            data = self.pad_edge(data, pad_left, 'LEFT')
            data = self.pad_edge(data, pad_right, 'RIGHT')

            # Pad non-empty frame indices
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')

            # Reshape for mean pooling
            data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
            non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])

            # Mean pool
            data = tf.experimental.numpy.nanmean(data, axis=1)
            non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)

            # Fill NaN values with 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs


preprocess_layer = PreprocessLayer()
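# --- Smoke-test sketch for PreprocessLayer (illustrative only) ---------------------------------
# Random frames (an assumption, used only to exercise the layer) should always come back with the
# fixed shapes (INPUT_SIZE, N_COLS, N_DIMS) = (64, 66, 3) for the data and (64,) for the frame
# indices, because short videos are padded up to INPUT_SIZE.
_dummy = np.random.rand(10, N_ROWS, N_DIMS).astype(np.float32)
_out_data, _out_idxs = preprocess_layer(_dummy)
print('PreprocessLayer output shapes:', _out_data.shape, _out_idxs.shape)
# -----------------------------------------------------------------------------------------------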
def translate_sign_language(image):
    # Convert the frame to RGB (Mediapipe expects RGB images)
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands_tracker:
        # Process the frame with Mediapipe Hands
        hands_results = hands_tracker.process(rgb_frame)

    with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose_tracker:
        # Process the frame with Mediapipe Pose
        pose_results = pose_tracker.process(rgb_frame)

    # Extract keypoints from the results
    hand_pose_keypoints = extract_keypoints(hands_results)
    pose_keypoints = extract_keypoints(pose_results)

    # Prepare the input data for the TFLite model
    left_hand_landmarks = hand_pose_keypoints[:63].reshape(1, -1, 3)
    right_hand_landmarks = hand_pose_keypoints[63:126].reshape(1, -1, 3)
    pose_landmarks = pose_keypoints[126:].reshape(1, -1, 4)

    # Call the PreprocessLayer to preprocess the hand and pose landmark data
    preprocessed_left_hand, _ = preprocess_layer(left_hand_landmarks)
    preprocessed_right_hand, _ = preprocess_layer(right_hand_landmarks)
    preprocessed_pose, _ = preprocess_layer(pose_landmarks)

    # Prepare the input data for the TFLite model
    input_data = [preprocessed_left_hand, preprocessed_right_hand, preprocessed_pose]

    # Perform inference using the loaded sign language model
    interpreter.set_tensor(interpreter.get_input_details()[0]['index'], input_data[0])
    interpreter.set_tensor(interpreter.get_input_details()[1]['index'], input_data[1])
    interpreter.set_tensor(interpreter.get_input_details()[2]['index'], input_data[2])
    interpreter.invoke()
    output = interpreter.get_tensor(interpreter.get_output_details()[0]['index'])

    # Make prediction using the processed landmarks
    translated_text = make_prediction(output)

    # Return the translated text
    return translated_text


gr_interface = gr.Interface(
    fn=translate_sign_language,
    inputs="webcam",  # Input from webcam
    outputs="text",   # Output as text
    # capture_session=True,  # To properly release the webcam after running the interface
    live=True,        # Show live webcam feed
    title="Sign Language Translation",
    description="Translate sign language to text using TensorFlow Lite and Mediapipe.",
)
gr_interface.launch(share=True)

cap.release()
cv2.destroyAllWindows()

video_path = './Test/HAPPY.mp4'
cap = cv2.VideoCapture(video_path)

mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose

data_list = []
ROWS_PER_FRAME = 543  # Constant number of landmarks per frame

with mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1) as face_mesh, \
        mp_hands.Hands(static_image_mode=False, max_num_hands=2) as hands, \
        mp_pose.Pose(static_image_mode=False) as pose:
    frame_number = 0
    while cap.isOpened():
        ret, image = cap.read()
        if not ret:
            break

        # Convert the BGR image to RGB for Mediapipe
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Process face landmarks
        results_face = face_mesh.process(image_rgb)
        if results_face.multi_face_landmarks:
            face_landmarks = results_face.multi_face_landmarks[0]
            for idx, landmark in enumerate(face_landmarks.landmark):
                data_list.append([frame_number, f"{frame_number}-face-{idx}", "face", idx,
                                  landmark.x, landmark.y, landmark.z])

        # Process hand landmarks
        results_hands = hands.process(image_rgb)
        if results_hands.multi_hand_landmarks:
            for hand_landmarks in results_hands.multi_hand_landmarks:
                for idx, landmark in enumerate(hand_landmarks.landmark):
                    data_list.append([frame_number, f"{frame_number}-right_hand-{idx}", "right-hand", idx,
                                      landmark.x, landmark.y, landmark.z])
                mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

        # Process pose landmarks
        results_pose = pose.process(image_rgb)
        if results_pose.pose_landmarks:
            pose_landmarks = results_pose.pose_landmarks.landmark
            for idx, landmark in enumerate(pose_landmarks):
                data_list.append([frame_number, f"{frame_number}-pose-{idx}", "pose", idx,
                                  landmark.x, landmark.y, landmark.z])

        # Pad the landmarks with NaN values if the number of landmarks is less than ROWS_PER_FRAME
        while len(data_list) < (frame_number + 1) * ROWS_PER_FRAME:
            data_list.append([frame_number, f"{frame_number}-right_hand-{len(data_list) % ROWS_PER_FRAME}",
                              "right-hand", len(data_list) % ROWS_PER_FRAME, np.nan, np.nan, np.nan])

        # Draw the landmarks on the frame (optional)
        if results_face.multi_face_landmarks:
            mp_drawing.draw_landmarks(image, face_landmarks, mp_face_mesh.FACEMESH_CONTOURS)
        if results_pose.pose_landmarks:
            mp_drawing.draw_landmarks(image, results_pose.pose_landmarks, mp_pose.POSE_CONNECTIONS)

        # Display the frame (optional)
        cv2.imshow('MediaPipe', image)

        frame_number += 1

        # Press 'q' to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

df = pd.DataFrame(data_list, columns=["frame", "row_id", "type", "landmark_index", "x", "y", "z"])
df.to_parquet("extracted_features.parquet", index=False)
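# --- Consistency-check sketch (optional) -------------------------------------------------------
# load_relevant_data_subset() below reshapes the parquet into (n_frames, 543, 3), so every frame
# written above must contribute exactly ROWS_PER_FRAME rows; this only re-reads the file that was
# just written.
_check = pd.read_parquet("extracted_features.parquet")
assert len(_check) % ROWS_PER_FRAME == 0, f"unexpected row count: {len(_check)}"
# -----------------------------------------------------------------------------------------------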
# test_data = pd.read_parquet('./1006440534.parquet')
# test_data_kaggle = pd.read_parquet('1001373962.parquet')
# test_data_kaggle2 = pd.read_parquet('./100015657.parquet')
# test_data_kaggle3 = pd.read_parquet('./1003700302.parquet')
# test_data_kaggle4 = pd.read_parquet('./1007127288.parquet')
test_data_my_own = pd.read_parquet('extracted_features.parquet')
test_data_my_own['frame'] = test_data_my_own['frame'].astype('int16')
test_data_my_own['landmark_index'] = test_data_my_own['landmark_index'].astype('int16')


def load_relevant_data_subset(pq_path, ROWS_PER_FRAME=543):
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    n_frames = int(len(data) / ROWS_PER_FRAME)
    print(f"Data: {len(data)} Number of Frames: {n_frames}")
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)


# demo_raw_data = load_relevant_data_subset('./1006440534.parquet')
demo_raw_data = load_relevant_data_subset('./extracted_features.parquet')
# demo_raw_data = load_relevant_data_subset('./1003700302.parquet', test_data_kaggle3['frame'].nunique())
# demo_raw_data = load_relevant_data_subset('./extracted_features.parquet')
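# --- Shape note (sketch) -----------------------------------------------------------------------
# demo_raw_data should be a float32 array of shape (n_frames, 543, 3), which is the layout that
# both the PreprocessLayer TensorSpec and the signature runner below expect.
print('demo_raw_data:', demo_raw_data.shape, demo_raw_data.dtype)
# -----------------------------------------------------------------------------------------------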
ORD2SIGN = {
    206: 'sticky', 20: 'before', 178: 'pretty', 114: 'hen', 221: 'tomorrow', 230: 'up', 25: 'blow', 236: 'weus',
    184: 'read', 191: 'say', 248: 'zebra', 189: 'sad', 62: 'drawer', 5: 'animal', 167: 'pen', 60: 'donkey',
    41: 'cheek', 51: 'cowboy', 192: 'scissors', 181: 'quiet', 63: 'drink', 94: 'girl', 200: 'sleepy', 249: 'zipper',
    171: 'pig', 13: 'bad', 9: 'arm', 61: 'down', 123: 'if', 240: 'why', 166: 'pajamas', 203: 'snow',
    137: 'loud', 195: 'shirt', 31: 'brown', 146: 'moon', 23: 'bird', 210: 'sun', 76: 'fast', 1: 'after',
    54: 'cute', 77: 'feet', 4: 'alligator', 87: 'food', 113: 'hello', 93: 'giraffe', 180: 'puzzle', 211: 'table',
    132: 'like', 153: 'no', 122: 'icecream', 67: 'duck', 69: 'elephant', 141: 'many', 18: 'bedroom', 205: 'stay',
    74: 'fall', 246: 'yourself', 183: 'rain', 135: 'listen', 44: 'chocolate', 124: 'into', 11: 'awake', 40: 'chair',
    7: 'any', 155: 'nose', 118: 'home', 161: 'open', 58: 'dog', 50: 'cow', 241: 'will', 149: 'mouth',
    177: 'pretend', 172: 'pizza', 75: 'farm', 163: 'outside', 234: 'water', 81: 'finish', 159: 'old', 121: 'hungry',
    112: 'helicopter', 130: 'lamp', 222: 'tongue', 194: 'shhh', 6: 'another', 103: 'gum', 214: 'thankyou', 128: 'kiss',
    101: 'grass', 64: 'drop', 157: 'now', 233: 'wake', 116: 'hide', 201: 'smile', 226: 'toy', 216: 'there',
    147: 'morning', 10: 'aunt', 102: 'green', 36: 'car', 213: 'taste', 39: 'cereal', 207: 'store', 66: 'dryer',
    162: 'orange', 218: 'thirsty', 83: 'first', 45: 'clean', 3: 'all', 198: 'sick', 129: 'kitty', 96: 'glasswindow',
    202: 'snack', 150: 'nap', 53: 'cut', 73: 'face', 99: 'grandma', 209: 'stuck', 91: 'garbage', 115: 'hesheit',
    95: 'give', 104: 'hair', 125: 'jacket', 165: 'owl', 82: 'fireman', 227: 'tree', 16: 'because', 17: 'bed',
    30: 'brother', 143: 'minemy', 127: 'jump', 245: 'yesterday', 145: 'mom', 111: 'hear', 174: 'police', 223: 'tooth',
    212: 'talk', 224: 'toothbrush', 164: 'owie', 47: 'closet', 169: 'penny', 24: 'black', 85: 'flag', 238: 'white',
    134: 'lips', 231: 'vacuum', 8: 'apple', 105: 'happy', 151: 'napkin', 92: 'gift', 70: 'empty', 46: 'close',
    52: 'cry', 138: 'mad', 49: 'clown', 204: 'stairs', 42: 'child', 173: 'please', 65: 'dry', 72: 'eye',
    235: 'wet', 32: 'bug', 109: 'haveto', 228: 'uncle', 199: 'sleep', 176: 'potty', 29: 'boy', 136: 'look',
    107: 'hate', 71: 'every', 12: 'backyard', 22: 'better', 84: 'fish', 56: 'dance', 139: 'make', 98: 'goose',
    38: 'cat', 232: 'wait', 14: 'balloon', 247: 'yucky', 2: 'airplane', 88: 'for', 126: 'jeans', 154: 'noisy',
    142: 'milk', 239: 'who', 90: 'frog', 35: 'can', 215: 'that', 117: 'high', 244: 'yes', 196: 'shoe',
    108: 'have', 48: 'cloud', 170: 'person', 187: 'ride', 34: 'callonphone', 37: 'carrot', 100: 'grandpa', 120: 'hot',
    131: 'later', 229: 'underwear', 0: 'TV', 140: 'man', 217: 'think', 220: 'time', 80: 'finger', 86: 'flower',
    15: 'bath', 28: 'book', 193: 'see', 208: 'story', 26: 'blue', 78: 'find', 148: 'mouse', 79: 'fine',
    179: 'puppy', 55: 'dad', 21: 'beside', 225: 'touch', 89: 'frenchfries', 188: 'room', 19: 'bee', 27: 'boat',
    156: 'not', 59: 'doll', 97: 'go', 190: 'same', 144: 'mitten', 160: 'on', 57: 'dirty', 182: 'radio',
    197: 'shower', 186: 'refrigerator', 158: 'nuts', 175: 'pool', 242: 'wolf', 243: 'yellow', 110: 'head', 237: 'where',
    33: 'bye', 133: 'lion', 152: 'night', 106: 'hat', 43: 'chin', 68: 'ear', 168: 'pencil', 119: 'horse',
    219: 'tiger', 185: 'red',
}

import tflite_runtime.interpreter as tflite

interpreter = tflite.Interpreter("./model.tflite")
found_signatures = list(interpreter.get_signature_list().keys())
prediction_fn = interpreter.get_signature_runner("serving_default")

output = prediction_fn(inputs=demo_raw_data)
sign = output['outputs'].argmax()
print("PRED : ", ORD2SIGN.get(sign), f'[{sign}]')
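# --- Optional diagnostic sketch: top-5 candidate signs -----------------------------------------
# Assumption: output['outputs'] holds one score per class (raw logits or probabilities); the
# ranking is the same either way, so no softmax is applied here.
_scores = np.asarray(output['outputs']).reshape(-1)
for _i in _scores.argsort()[-5:][::-1]:
    print(f"{ORD2SIGN.get(int(_i))}: {_scores[_i]:.4f}")
# -----------------------------------------------------------------------------------------------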