Spaces:

AreebKhan
/

Sign_Language_Translator2

Sleeping

App Files Files Community

AreebKhan committed on Feb 22

Commit

86ac4de

verified ·

1 Parent(s): 372d1ef

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -17

app.py CHANGED Viewed

@@ -1,51 +1,62 @@
 import gradio as gr
 import torch
 import cv2
 import numpy as np
 from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
-# Load the pretrained checkpoint and its matching image processor once, at module import
 # NOTE(review): the checkpoint id contains "xclip" but is loaded through the VideoMAE classes; confirm the architecture actually matches.
-model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
 model = VideoMAEForVideoClassification.from_pretrained(model_name)
 processor = VideoMAEImageProcessor.from_pretrained(model_name)
-# Read every frame of the video at video_path, classify the clip, and return the result as a display string
-def predict(video_path):
     cap = cv2.VideoCapture(video_path)
     frames = []
     # Keep reading until the capture is no longer open or read() reports that no frame was returned.
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
-        frame = cv2.resize(frame, (224, 224))  # Resize each frame to 224x224 before it goes to the processor
-        frames.append(frame)
     cap.release()
     # No frame was collected: return a message instead of calling the model.
     if len(frames) == 0:
-        return "No frames detected in video!"
-    # Let the processor turn the list of frames into PyTorch tensors
     inputs = processor(images=frames, return_tensors="pt")
     # Inference only, so gradient tracking is switched off.
     with torch.no_grad():
         outputs = model(**inputs)
     logits = outputs.logits
     # NOTE(review): .item() assumes argmax yields a single value, i.e. one clip per call; confirm.
     predicted_class_idx = logits.argmax(-1).item()
-    predicted_label = model.config.id2label[predicted_class_idx]  # Look up the label for the top-scoring index in the model config
-    return f"Predicted Sign: {predicted_label}"
-# Gradio UI: a video input wired to predict(), with the returned string shown in a text box
 iface = gr.Interface(
     fn=predict,
     inputs=gr.Video(),
-    outputs=gr.Textbox(label="Recognized Sign"),
-    title="Sign Language Translator",
-    description="Upload a video of a hand gesture, and the model will predict the corresponding sign."
 )
 # Launch the interface only when this file is executed as a script.
 if __name__ == "__main__":
-    iface.launch(debug=True)

 import gradio as gr
 import torch
 import cv2
+import os
 import numpy as np
 from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
+# Load a lighter pretrained model and its matching image processor once, at module import
 # NOTE(review): "facebook/videomae-base" looks like a base (pretraining) checkpoint; confirm it ships a
 # fine-tuned classification head, otherwise the logits used by predict() come from untrained weights.
+model_name = "facebook/videomae-base"
 model = VideoMAEForVideoClassification.from_pretrained(model_name)
 processor = VideoMAEImageProcessor.from_pretrained(model_name)
+# Sample frames from the video at video_path and return them as a list of frames resized to 224x224
 # The returned list is empty when no frame could be read.
+def preprocess_video(video_path):
     cap = cv2.VideoCapture(video_path)
     frames = []
+    frame_skip = 5  # Keep one frame out of every 5 (frame indices 0, 5, 10, ...) to speed up processing
+    count = 0
     # count is the index of the frame just read; it advances for kept and dropped frames alike.
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
+        if count % frame_skip == 0:
+            frame = cv2.resize(frame, (224, 224))  # Resize to match model input
+            frames.append(frame)
+        count += 1
     cap.release()
     # NOTE(review): OpenCV normally decodes frames in BGR order; confirm whether the processor expects RGB.
     # NOTE(review): the number of frames returned depends on the video length; confirm the model accepts
     # a variable frame count.
+    return frames
+# Classify the sign shown in the video at video_path and return the predicted word as a string
+def predict(video_path):
+    frames = preprocess_video(video_path)
     # No frame was collected: return a message instead of calling the model.
     if len(frames) == 0:
+        return "No frames detected, try a different video."
     inputs = processor(images=frames, return_tensors="pt")
     # Inference only, so gradient tracking is switched off.
     with torch.no_grad():
         outputs = model(**inputs)
     logits = outputs.logits
     # NOTE(review): .item() assumes argmax yields a single value, i.e. one clip per call; confirm.
     predicted_class_idx = logits.argmax(-1).item()
+    # Placeholder list of common words (example only, to be replaced with the model's real labels)
     # NOTE(review): the modulo below folds any class index onto these seven words, so the returned word
     # is not tied to the model's own classes until a real mapping (e.g. model.config.id2label) is used.
+    labels = ["Hello", "Thanks", "Yes", "No", "Goodbye", "Please", "Sorry"]
+    predicted_label = labels[predicted_class_idx % len(labels)]  # Placeholder mapping: index wrapped into the list above
+    return predicted_label
 # Gradio UI: a video input wired to predict(), with the returned word shown in a text box.
 iface = gr.Interface(
     fn=predict,
     inputs=gr.Video(),
+    outputs=gr.Textbox(label="Predicted Sign"),
+    title="Sign Language to Text Converter",
+    description="Upload a video of a hand gesture and get the predicted word."
 )
 # Launch the interface only when this file is executed as a script.
 if __name__ == "__main__":
+    iface.launch()