Update README.md
README.md CHANGED
@@ -32,6 +32,89 @@ More information needed
 ## Intended uses & limitations

+## Inference using a phone camera (requires the IP Webcam app from the Play Store)
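+The snippet below streams frames from the phone over HTTP, buffers 16 frames (the clip length VideoMAE expects), runs one classification pass per clip, and overlays the top-3 predicted labels and their confidences on the live feed.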
+```python
+import cv2
+import torch
+import numpy as np
+from transformers import AutoImageProcessor, VideoMAEForVideoClassification
+
+np.random.seed(0)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def preprocess_frames(frames, image_processor):
+    inputs = image_processor(frames, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}  # move tensors to the model's device
+    return inputs
+
+# Initialize the video capture object. Replace the IP address with your phone's
+# local IP, shown in the IP Webcam app.
+cap = cv2.VideoCapture('http://192.168.229.98:8080/video')
+
+# Set the frame size (optional)
+cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
+cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
+
+image_processor = AutoImageProcessor.from_pretrained("archit11/videomae-base-finetuned-ucfcrime-full")
+model = VideoMAEForVideoClassification.from_pretrained("archit11/videomae-base-finetuned-ucfcrime-full")
+
+# Move the model to the GPU if one is available
+model = model.to(device)
+
+frame_buffer = []
+buffer_size = 16  # VideoMAE expects 16-frame clips
+previous_labels = []
+top_confidences = []
+
+while True:
+    ret, frame = cap.read()
+    if not ret:
+        print("Failed to capture frame")
+        break
+
+    # OpenCV captures BGR; convert to RGB before feeding the image processor
+    frame_buffer.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+    # Run inference once we have a full clip
+    if len(frame_buffer) >= buffer_size:
+        inputs = preprocess_frames(frame_buffer, image_processor)
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+
+        # Get the top 3 predicted labels and their confidence scores
+        top_k = 3
+        probs = torch.softmax(logits, dim=-1)
+        top_probs, top_indices = torch.topk(probs, top_k)
+        top_labels = [model.config.id2label[idx.item()] for idx in top_indices[0]]
+        top_confidences = top_probs[0].tolist()
+
+        # Log the prediction whenever it changes
+        if top_labels != previous_labels:
+            previous_labels = top_labels
+            print("Predicted class:", top_labels[0])
+
+        # Clear the buffer and continue with the next clip
+        frame_buffer.clear()
+
+    # Overlay the latest labels and confidence scores on the frame
+    for i, (label, confidence) in enumerate(zip(previous_labels, top_confidences)):
+        label_text = f"{label}: {confidence:.2f}"
+        cv2.putText(frame, label_text, (10, 30 + i * 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)
+
+    # Display the resulting frame
+    cv2.imshow('Video', frame)
+
+    if cv2.waitKey(1) & 0xFF == ord('q'):
+        break
+
+# Release everything when done
+cap.release()
+cv2.destroyAllWindows()
+```
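+Note that the buffer is cleared after each prediction, so consecutive 16-frame clips do not overlap. For smoother output you could keep a sliding window instead; a minimal sketch (hypothetical, not part of the original script) would replace `frame_buffer.clear()` with:
+
+```python
+# Drop only the oldest 4 frames so successive clips share 12 frames
+del frame_buffer[:4]
+```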
+## Simple usage
 Usage:
 ```python
 import av

@@ -87,6 +170,7 @@ def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
 file_path = hf_hub_download(
     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
 )
+# To use any other video, just replace `file_path` with the path to your video
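+# e.g. (hypothetical local file): file_path = "my_video.mp4"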
 container = av.open(file_path)

 # sample 16 frames

@@ -106,6 +190,8 @@ with torch.no_grad():
 predicted_label = logits.argmax(-1).item()
 print(model.config.id2label[predicted_label])
 ```
 ## Training and evaluation data

 More information needed