AreebKhan committed · Commit 86ac4de · verified · 1 Parent(s): 372d1ef

Update app.py

Files changed (1)
  1. app.py +28 -17
app.py CHANGED
@@ -1,51 +1,62 @@
 import gradio as gr
 import torch
 import cv2
+import os
 import numpy as np
 from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
 
-# Load the pretrained model (VideoMAE)
-model_name = "sokaina55/xclip-base-patch32-finetuned-ssl-sign-language-recognition"
+# Load a lighter pretrained model
+model_name = "facebook/videomae-base"
 model = VideoMAEForVideoClassification.from_pretrained(model_name)
 processor = VideoMAEImageProcessor.from_pretrained(model_name)
 
-# Function to process video frames and make predictions
-def predict(video_path):
+# Reduce frames for faster processing
+def preprocess_video(video_path):
     cap = cv2.VideoCapture(video_path)
     frames = []
-
+    frame_skip = 5  # Skip every 5 frames to speed up processing
+
+    count = 0
     while cap.isOpened():
         ret, frame = cap.read()
         if not ret:
             break
-        frame = cv2.resize(frame, (224, 224))  # Resize for model compatibility
-        frames.append(frame)
+        if count % frame_skip == 0:
+            frame = cv2.resize(frame, (224, 224))  # Resize to match model input
+            frames.append(frame)
+        count += 1
 
     cap.release()
+    return frames
+
+# Function to predict sign language words
+def predict(video_path):
+    frames = preprocess_video(video_path)
 
     if len(frames) == 0:
-        return "No frames detected in video!"
+        return "No frames detected, try a different video."
 
-    # Convert frames to tensor
     inputs = processor(images=frames, return_tensors="pt")
-
+
     with torch.no_grad():
         outputs = model(**inputs)
 
     logits = outputs.logits
     predicted_class_idx = logits.argmax(-1).item()
-    predicted_label = model.config.id2label[predicted_class_idx]  # Convert index to label
 
-    return f"Predicted Sign: {predicted_label}"
+    # Mapping to common words (example, update with real labels)
+    labels = ["Hello", "Thanks", "Yes", "No", "Goodbye", "Please", "Sorry"]
+    predicted_label = labels[predicted_class_idx % len(labels)]  # Placeholder mapping
+
+    return predicted_label
 
-# Gradio UI
 iface = gr.Interface(
     fn=predict,
     inputs=gr.Video(),
-    outputs=gr.Textbox(label="Recognized Sign"),
-    title="Sign Language Translator",
-    description="Upload a video of a hand gesture, and the model will predict the corresponding sign."
+    outputs=gr.Textbox(label="Predicted Sign"),
+    title="Sign Language to Text Converter",
+    description="Upload a video of a hand gesture and get the predicted word."
 )
 
 if __name__ == "__main__":
-    iface.launch(debug=True)
+    iface.launch()
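
Two caveats about the new version are worth flagging. facebook/videomae-base is a self-supervised backbone with no fine-tuned classifier, so from_pretrained attaches a randomly initialized head whose outputs are not meaningful sign labels; the placeholder word list only papers over that. VideoMAE also expects a fixed clip length (16 frames by default, exposed as model.config.num_frames), and skipping every fifth frame yields a variable frame count that the model's fixed-size position embeddings cannot absorb. Below is a minimal sketch under those assumptions, not the app's actual implementation: it samples a fixed-length clip uniformly and reads labels from the checkpoint's own id2label map instead of the hardcoded list.

    import cv2
    import numpy as np
    import torch
    from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

    model_name = "facebook/videomae-base"  # same checkpoint as the commit
    model = VideoMAEForVideoClassification.from_pretrained(model_name)
    processor = VideoMAEImageProcessor.from_pretrained(model_name)

    def sample_clip(video_path, num_frames):
        """Uniformly sample exactly num_frames RGB frames from the video."""
        cap = cv2.VideoCapture(video_path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes BGR; the processor expects RGB.
            # No manual resize needed: the processor resizes to 224x224 itself.
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        cap.release()
        if not frames:
            return []
        # Spread the sampled indices evenly across the whole video.
        indices = np.linspace(0, len(frames) - 1, num_frames).astype(int)
        return [frames[i] for i in indices]

    def predict(video_path):
        # model.config.num_frames is 16 for this checkpoint.
        frames = sample_clip(video_path, model.config.num_frames)
        if len(frames) == 0:
            return "No frames detected, try a different video."
        inputs = processor(images=frames, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        # id2label comes from the checkpoint config; for facebook/videomae-base
        # the head is untrained, so these labels stay placeholders until the
        # model is fine-tuned on a sign-language dataset.
        return model.config.id2label[logits.argmax(-1).item()]

Uniform sampling keeps the clip length constant regardless of video duration, which is what the model's position embeddings require; the skip-every-N approach in the commit only works for videos whose length happens to produce a compatible frame count.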