Update README.md
README.md CHANGED
@@ -32,6 +32,89 @@ More information needed
 ## Intended uses & limitations

+## Inference using a phone camera (requires the IP Webcam app from the Play Store)
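+The snippet below streams frames from the phone over HTTP, buffers 16 frames (the clip length VideoMAE expects), runs one classification pass per clip, and overlays the top-3 predicted labels and their confidences on the live feed.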
+```python
+import cv2
+import torch
+import numpy as np
+from transformers import AutoImageProcessor, VideoMAEForVideoClassification
+
+np.random.seed(0)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+def preprocess_frames(frames, image_processor):
+    inputs = image_processor(frames, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}  # move tensors to the model's device
+    return inputs
+
+# Initialize the video capture object. Replace the IP address with your phone's
+# local IP, shown in the IP Webcam app.
+cap = cv2.VideoCapture('http://192.168.229.98:8080/video')
+
+# Set the frame size (optional)
+cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
+cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
+
+image_processor = AutoImageProcessor.from_pretrained("archit11/videomae-base-finetuned-ucfcrime-full")
+model = VideoMAEForVideoClassification.from_pretrained("archit11/videomae-base-finetuned-ucfcrime-full")
+
+# Move the model to the GPU if one is available
+model = model.to(device)
+
+frame_buffer = []
+buffer_size = 16  # VideoMAE expects 16-frame clips
+previous_labels = []
+top_confidences = []
+
+while True:
+    ret, frame = cap.read()
+    if not ret:
+        print("Failed to capture frame")
+        break
+
+    # OpenCV captures BGR; convert to RGB before feeding the image processor
+    frame_buffer.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+
+    # Run inference once we have a full clip
+    if len(frame_buffer) >= buffer_size:
+        inputs = preprocess_frames(frame_buffer, image_processor)
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+
+        # Get the top 3 predicted labels and their confidence scores
+        top_k = 3
+        probs = torch.softmax(logits, dim=-1)
+        top_probs, top_indices = torch.topk(probs, top_k)
+        top_labels = [model.config.id2label[idx.item()] for idx in top_indices[0]]
+        top_confidences = top_probs[0].tolist()
+
+        # Log the prediction whenever it changes
+        if top_labels != previous_labels:
+            previous_labels = top_labels
+            print("Predicted class:", top_labels[0])
+
+        # Clear the buffer and continue with the next clip
+        frame_buffer.clear()
+
+    # Overlay the latest labels and confidence scores on the frame
+    for i, (label, confidence) in enumerate(zip(previous_labels, top_confidences)):
+        label_text = f"{label}: {confidence:.2f}"
+        cv2.putText(frame, label_text, (10, 30 + i * 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)
+
+    # Display the resulting frame
+    cv2.imshow('Video', frame)
+
+    if cv2.waitKey(1) & 0xFF == ord('q'):
+        break
+
+# Release everything when done
+cap.release()
+cv2.destroyAllWindows()
+```
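+Note that the buffer is cleared after each prediction, so consecutive 16-frame clips do not overlap. For smoother output you could keep a sliding window instead; a minimal sketch (hypothetical, not part of the original script) would replace `frame_buffer.clear()` with:
+
+```python
+# Drop only the oldest 4 frames so successive clips share 12 frames
+del frame_buffer[:4]
+```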
+## Simple usage
 Usage:
 ```python
 import av

@@ -87,6 +170,7 @@ def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
 file_path = hf_hub_download(
     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
 )
+# To use any other video, just replace `file_path` with the path to your video
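+# e.g. (hypothetical local file): file_path = "my_video.mp4"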
 container = av.open(file_path)

 # sample 16 frames

@@ -106,6 +190,8 @@ with torch.no_grad():
 predicted_label = logits.argmax(-1).item()
 print(model.config.id2label[predicted_label])
 ```
 ## Training and evaluation data

 More information needed