import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
import gradio as gr

# Load the sign language recognition model
model = load_model('isl.h5')
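# The model is assumed to expect input of shape (1, 30, 258): 30 frames of concatenated pose + hand keypoints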

# Initialize Mediapipe
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Define actions
actions = ['hello', 'me', 'no', 'please', 'sorry', 'thank you', 'welcome', 'what', 'yes', 'you']
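# NOTE: this ordering is assumed to match the label indices used when training isl.h5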

# Run Mediapipe Holistic on a single BGR frame and return the frame plus detected landmarks
def mediapipe_detection(image, holistic_model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)   # Mediapipe expects RGB input
    image.flags.writeable = False                    # Mark read-only while processing
    results = holistic_model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)   # Convert back to BGR for OpenCV
    return image, results

# Function to extract keypoints
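# Concatenates 33 pose landmarks (x, y, z, visibility) and 21 landmarks per hand (x, y, z):
# 33*4 + 21*3 + 21*3 = 258 values per frame; missing detections are zero-filled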
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, lh, rh])

# Function to predict sign from video
def predict_sign_from_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    sign = "No sign detected"
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            image, results = mediapipe_detection(frame, holistic)
            keypoints = extract_keypoints(results)
            frames.append(keypoints)
            if len(frames) == 30:
                # Predict on the first 30-frame sequence of keypoints
                sequence = np.array(frames)
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                sign = actions[np.argmax(res)]
                break

    cap.release()
    return sign

examples = [
    ['videos/abvv.webm'],
    ['videos/gdgdh.mp4']
]

# Create Gradio Interface
iface = gr.Interface(
    fn=predict_sign_from_video,
    inputs="video",
    outputs="text",
    title="Sign Speak",
    description="Upload a video and get the predicted sign.",
    examples=examples,
    cache_examples=False,
)
iface.launch(share=True)
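# Note: share=True requests a temporary public Gradio link; omit it to serve locally only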