import streamlit as st
import cv2
import numpy as np
import os
import time
import mediapipe as mp
import pyttsx3
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
# MP Holistic:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities
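# Holistic runs the pose, face and hand landmark models in a single pipeline,
# so one process() call yields all four landmark sets used below.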
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                   # Image is no longer writeable
    results = model.process(image)                  # Make prediction
    image.flags.writeable = True                    # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # COLOR CONVERSION RGB 2 BGR
    return image, results
def draw_styled_landmarks(image, results):
    # Draw face connections
    # (FACE_CONNECTIONS was renamed to FACEMESH_TESSELATION in newer MediaPipe releases)
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
                              mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                              mp_drawing.DrawingSpec(color=(80,255,121), thickness=1, circle_radius=1)
                              )
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                              )
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                              )
    # Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                              mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                              mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                              )
# Extract keypoint values
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)  # 468 landmarks * 3
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])
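# Per frame this yields 33*4 + 468*3 + 21*3 + 21*3 = 1662 values; missing
# landmark sets are zero-filled so the vector length stays constant.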
# Load model:
model = tf.keras.models.load_model('./bisindo8kata.h5')
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
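# Note: compile() is only required for further training or evaluate();
# load_model() alone is sufficient for predict().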
# Visualize prediction:
def prob_viz(res, actions, input_frame):
    output_frame = input_frame.copy()
    pred_dict = dict(zip(actions, res))
    # Sort predictions by probability (descending) and keep the top 5
    prediction = sorted(pred_dict.items(), key=lambda x: x[1], reverse=True)[:5]
    for num, pred in enumerate(prediction):
        text = '{}: {}'.format(pred[0], round(float(pred[1]), 4))
        cv2.putText(output_frame, text, (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (255,255,255), 2, cv2.LINE_AA)
    return output_frame
# New detection variables
sequence = []
sentence = []
threshold = 0.9
tts = False
actions = os.listdir('./Dataset')  # class labels; order must match the order used at training time
label_map = {label: num for num, label in enumerate(actions)}
# Text-to-speech config:
engine = pyttsx3.init()
voices = engine.getProperty('voices')
engine.setProperty('voice', voices[0].id)
###############################################################################################
# STREAMLIT #
col1, col2 = st.columns((3, 1))
with col1:
    st.title('BISINDO Recognition')
    st.write('by Zain')
with col2:
    st.image('./bisindo-app-icon.png')
# Checkboxes
st.header('Webcam')
col1, col2, col3 = st.columns(3)
with col1:
    show_webcam = st.checkbox('Show webcam')
with col2:
    show_landmarks = st.checkbox('Show landmarks')
with col3:
    speak = st.checkbox('Speak')
# Webcam
FRAME_WINDOW = st.image([])
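# Placeholder image element, overwritten with each processed frame below.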
cap = cv2.VideoCapture(0)  # default camera (device index 0)
# Mediapipe model
with mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.7) as holistic:
    while show_webcam:
        # Read feed
        ret, frame = cap.read()
        if not ret:
            break  # camera frame not available

        # Make detections
        image, results = mediapipe_detection(frame, holistic)

        # Draw landmarks
        if show_landmarks:
            draw_styled_landmarks(image, results)

        # 2. Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]  # keep a sliding window of the last 30 frames

        if len(sequence) == 30:
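            # The model expects a batch of one 30-frame sequence; with the
            # keypoint layout above, the input shape is (1, 30, 1662).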
            res = model.predict(np.expand_dims(sequence, axis=0))[0]

            # 3. Viz logic
            if res[np.argmax(res)] > threshold:
                if len(sentence) > 0:
                    if actions[np.argmax(res)] != sentence[-1]:
                        # In case the first word is 'Halo':
                        if (sentence[0] == '') and (actions[np.argmax(res)] == 'Halo'):
                            pass
                        else:
                            sentence.append(actions[np.argmax(res)])
                            tts = True
                else:
                    sentence.append(actions[np.argmax(res)])
                    tts = True

            if len(sentence) > 5:
                sentence = sentence[-5:]

            # Viz probabilities
            if show_landmarks:
                image = prob_viz(res, actions, image)

        # Text to speech:
        if speak and tts:
            engine.say(sentence[-1])
            engine.runAndWait()
            # time.sleep(0.5)
            tts = False
        # Show result
        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)

        # Show to screen (Streamlit expects RGB)
        # cv2.imshow('OpenCV Feed', image)
        frameshow = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        FRAME_WINDOW.image(frameshow)

        # Break gracefully (only effective when the OpenCV window above is enabled)
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()