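"""Webcam sign-language word recognition demo.

Extracts hand, pose, and lip landmarks from a webcam frame with MediaPipe,
classifies them with a bundled TFLite model (250 sign classes), and serves
live predictions through a Gradio interface.
"""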
import os

import cv2
import gradio as gr
import mediapipe as mp
import numpy as np
import tensorflow as tf

# Class-name-to-index mapping used during training (250 sign classes).
class_to_index = {
    "TV": 0, "after": 1, "airplane": 2, "all": 3, "alligator": 4, "animal": 5, "another": 6, "any": 7, "apple": 8,
    "arm": 9, "aunt": 10, "awake": 11, "backyard": 12, "bad": 13, "balloon": 14, "bath": 15, "because": 16, "bed": 17,
    "bedroom": 18, "bee": 19, "before": 20, "beside": 21, "better": 22, "bird": 23, "black": 24, "blow": 25, "blue": 26,
    "boat": 27, "book": 28, "boy": 29, "brother": 30, "brown": 31, "bug": 32, "bye": 33, "callonphone": 34, "can": 35,
    "car": 36, "carrot": 37, "cat": 38, "cereal": 39, "chair": 40, "cheek": 41, "child": 42, "chin": 43,
    "chocolate": 44, "clean": 45, "close": 46, "closet": 47, "cloud": 48, "clown": 49, "cow": 50, "cowboy": 51,
    "cry": 52, "cut": 53, "cute": 54, "dad": 55, "dance": 56, "dirty": 57, "dog": 58, "doll": 59, "donkey": 60,
    "down": 61, "drawer": 62, "drink": 63, "drop": 64, "dry": 65, "dryer": 66, "duck": 67, "ear": 68, "elephant": 69,
    "empty": 70, "every": 71, "eye": 72, "face": 73, "fall": 74, "farm": 75, "fast": 76, "feet": 77, "find": 78,
    "fine": 79, "finger": 80, "finish": 81, "fireman": 82, "first": 83, "fish": 84, "flag": 85, "flower": 86,
    "food": 87, "for": 88, "frenchfries": 89, "frog": 90, "garbage": 91, "gift": 92, "giraffe": 93, "girl": 94,
    "give": 95, "glasswindow": 96, "go": 97, "goose": 98, "grandma": 99, "grandpa": 100, "grass": 101, "green": 102,
    "gum": 103, "hair": 104, "happy": 105, "hat": 106, "hate": 107, "have": 108, "haveto": 109, "head": 110,
    "hear": 111, "helicopter": 112, "hello": 113, "hen": 114, "hesheit": 115, "hide": 116, "high": 117, "home": 118,
    "horse": 119, "hot": 120, "hungry": 121, "icecream": 122, "if": 123, "into": 124, "jacket": 125, "jeans": 126,
    "jump": 127, "kiss": 128, "kitty": 129, "lamp": 130, "later": 131, "like": 132, "lion": 133, "lips": 134,
    "listen": 135, "look": 136, "loud": 137, "mad": 138, "make": 139, "man": 140, "many": 141, "milk": 142,
    "minemy": 143, "mitten": 144, "mom": 145, "moon": 146, "morning": 147, "mouse": 148, "mouth": 149, "nap": 150,
    "napkin": 151, "night": 152, "no": 153, "noisy": 154, "nose": 155, "not": 156, "now": 157, "nuts": 158, "old": 159,
    "on": 160, "open": 161, "orange": 162, "outside": 163, "owie": 164, "owl": 165, "pajamas": 166, "pen": 167,
    "pencil": 168, "penny": 169, "person": 170, "pig": 171, "pizza": 172, "please": 173, "police": 174, "pool": 175,
    "potty": 176, "pretend": 177, "pretty": 178, "puppy": 179, "puzzle": 180, "quiet": 181, "radio": 182, "rain": 183,
    "read": 184, "red": 185, "refrigerator": 186, "ride": 187, "room": 188, "sad": 189, "same": 190, "say": 191,
    "scissors": 192, "see": 193, "shhh": 194, "shirt": 195, "shoe": 196, "shower": 197, "sick": 198, "sleep": 199,
    "sleepy": 200, "smile": 201, "snack": 202, "snow": 203, "stairs": 204, "stay": 205, "sticky": 206, "store": 207,
    "story": 208, "stuck": 209, "sun": 210, "table": 211, "talk": 212, "taste": 213, "thankyou": 214, "that": 215,
    "there": 216, "think": 217, "thirsty": 218, "tiger": 219, "time": 220, "tomorrow": 221, "tongue": 222, "tooth": 223,
    "toothbrush": 224, "touch": 225, "toy": 226, "tree": 227, "uncle": 228, "underwear": 229, "up": 230, "vacuum": 231,
    "wait": 232, "wake": 233, "water": 234, "wet": 235, "weus": 236, "where": 237, "white": 238, "who": 239, "why": 240,
    "will": 241, "wolf": 242, "yellow": 243, "yes": 244, "yesterday": 245, "yourself": 246, "yucky": 247, "zebra": 248,
    "zipper": 249
}

index_to_class = {v: k for k, v in class_to_index.items()}

# Initialize MediaPipe solutions
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_face_mesh = mp.solutions.face_mesh

hands = mp_hands.Hands()
pose = mp_pose.Pose()
face_mesh = mp_face_mesh.FaceMesh()

# The 40 lip landmark indices, derived from MediaPipe's FACEMESH_LIPS
# connection set. Assumption: the model was trained on exactly these points.
LIP_LANDMARK_IDXS = sorted({idx for conn in mp_face_mesh.FACEMESH_LIPS for idx in conn})

# Load the TFLite model that sits next to app.py.
current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "model.tflite")

interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()


# Flatten the MediaPipe landmark objects into a single (115, 3) float32 array:
# 40 lip + 21 + 21 hand + 33 pose landmarks, each as (x, y, z).
def preprocess_landmarks(hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks):
    hand1_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in hand1_landmarks.landmark]
    hand2_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in hand2_landmarks.landmark]
    pose_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in pose_landmarks.landmark]
    lip_landmarks = [[landmark.x, landmark.y, landmark.z] for landmark in lip_landmarks]

    combined_landmarks = lip_landmarks + hand1_landmarks + hand2_landmarks + pose_landmarks

    return np.array(combined_landmarks, dtype=np.float32)


# Function to extract landmarks from the webcam frame
def extract_landmarks(frame):
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)
    pose_results = pose.process(frame_rgb)
    face_results = face_mesh.process(frame_rgb)

    if not results.multi_hand_landmarks or not pose_results.pose_landmarks or not face_results.multi_face_landmarks:
        return None

    hand1_landmarks = results.multi_hand_landmarks[0]
    if len(results.multi_hand_landmarks) > 1:
        hand2_landmarks = results.multi_hand_landmarks[1]
    else:
        # Only one hand detected: duplicate it so the input shape stays fixed.
        hand2_landmarks = hand1_landmarks

    pose_landmarks = pose_results.pose_landmarks
    face_landmarks = face_results.multi_face_landmarks[0]
    lip_landmarks = [face_landmarks.landmark[i] for i in LIP_LANDMARK_IDXS]

    print(f"Number of landmarks for hand1: {len(hand1_landmarks.landmark)}")
    print(f"Number of landmarks for hand2: {len(hand2_landmarks.landmark)}")
    print(f"Number of landmarks for pose: {len(pose_landmarks.landmark)}")
    print(f"Number of landmarks for lip: {len(lip_landmarks)}")

    return hand1_landmarks, hand2_landmarks, pose_landmarks, lip_landmarks


# Run TFLite inference on the flattened landmark array.
def make_prediction(processed_landmarks):
    # Assumption: the model accepts the (num_landmarks, 3) array without a batch dimension.
    inputs = np.array(processed_landmarks, dtype=np.float32)

    # Set the input tensor for the TFLite model
    interpreter.set_tensor(input_details[0]['index'], inputs)

    # Invoke the TFLite interpreter to perform inference
    interpreter.invoke()

    # Get the output tensor of the TFLite model
    output_data = interpreter.get_tensor(output_details[0]['index'])

    # Find the index of the predicted class
    index = np.argmax(output_data)

    # Map the index back to its class label
    prediction = index_to_class[int(index)]

    return prediction


# Gradio interface function: one webcam frame in, one predicted label out.
def predict_with_webcam(frame):
    landmarks = extract_landmarks(frame)
    if landmarks is None:
        return "No hands, pose, or face detected"
    processed_landmarks = preprocess_landmarks(*landmarks)
    return str(make_prediction(processed_landmarks))


# Define the Gradio interface with a webcam input and a label output.
# Written against the current Gradio component API; older releases used
# gr.inputs.Image / gr.outputs.Label and an interpretation argument instead.
webcam_interface = gr.Interface(
    fn=predict_with_webcam,
    inputs=gr.Image(sources=["webcam"]),
    outputs=gr.Label(),
    live=True,
    title="Webcam Landmark Prediction",
    description="Make predictions using landmarks extracted from your webcam stream.",
)




# Launch the Gradio app with the webcam interface. launch() prints a local
# URL to the console; pass share=True for a temporary public link.
if __name__ == "__main__":
    webcam_interface.launch()
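
# Offline smoke test (hypothetical frame path, not shipped with the app):
#   frame = cv2.imread("sample_frame.jpg")  # needs a visible face, pose, and hand
#   print(predict_with_webcam(frame))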