File size: 7,537 Bytes
e7a1876 66c91bd 883ce2d e7a1876 d2a2733 ddbaae4 e7a1876 4ddcb94 e7a1876 d1fe899 883ce2d 35a8c91 883ce2d d1fe899 883ce2d d3dda12 883ce2d e7a1876 d2a2733 883ce2d d1fe899 e7a1876 4ddcb94 e7a1876 4ddcb94 e7a1876 66c91bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 |
---
license: cc-by-nc-4.0
base_model: MCG-NJU/videomae-base
tags:
- generated_from_trainer
- vandalism
- video-classification
- ucf-crime
- vandalism-detection
- videomae
metrics:
- accuracy
model-index:
- name: videomae-base-finetuned-ucfcrime-full2
results: []
---
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->
# videomae-base-finetuned-ucfcrime-full2
This model is a fine-tuned version of [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) on the [UCF-CRIME](https://paperswithcode.com/dataset/ucf-crime)
dataset. Code: [GitHub](https://github.com/archit-spec/majorproject)
It achieves the following results on the evaluation set:
- Loss: 2.5014
- Accuracy: 0.225
## Model description
More information needed
## Intended uses & limitations
## Inference using a phone camera (requires the IP Webcam app, installed on the phone from the Play Store)
```python
import cv2
import torch
import numpy as np
from transformers import AutoImageProcessor, VideoMAEForVideoClassification
np.random.seed(0)
def preprocess_frames(frames, image_processor):
    '''
    Turn a list of frames into model inputs placed on the active device.
    Args:
        frames: Frames handed to `image_processor` as a single clip.
        image_processor: Callable processor; invoked with `return_tensors="pt"`.
    Returns:
        dict mapping each processor output name to its tensor, after `.to(device)`.
    Note:
        `device` is the module-level variable defined by the surrounding script
        (GPU when available, otherwise CPU).
    '''
    batch = image_processor(frames, return_tensors="pt")
    on_device = {}
    for name, tensor in batch.items():
        # Model and inputs must live on the same device.
        on_device[name] = tensor.to(device)
    return on_device
# Initialize the video capture object, replace ip addr with the local ip of your phone (will be shown in the ipwebcam app)
cap = cv2.VideoCapture('http://192.168.229.98:8080/video')
# Set the frame size (optional)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

image_processor = AutoImageProcessor.from_pretrained("archit11/videomae-base-finetuned-ucfcrime-full")
model = VideoMAEForVideoClassification.from_pretrained("archit11/videomae-base-finetuned-ucfcrime-full")

# Move the model to GPU when one is available, otherwise stay on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

frame_buffer = []      # RGB copies of the most recent frames, fed to the model
buffer_size = 16       # number of frames per inference clip
top_k = 3              # how many labels to show on screen
previous_labels = []
top_confidences = []   # Initialize top_confidences

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to capture frame")
            break

        # OpenCV delivers frames in BGR order, while the file-based example in
        # this card feeds RGB frames to the same processor. cvtColor also
        # returns a NEW array, so the text drawn on `frame` further down never
        # ends up inside the clip that is sent to the model.
        frame_buffer.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Check if we have enough frames for inference
        if len(frame_buffer) >= buffer_size:
            # Preprocess the frames
            inputs = preprocess_frames(frame_buffer, image_processor)
            with torch.no_grad():
                outputs = model(**inputs)
            logits = outputs.logits

            # Get the top predicted labels and their confidence scores
            probs = torch.softmax(logits, dim=-1)
            top_probs, top_indices = torch.topk(probs, top_k)
            top_labels = [model.config.id2label[idx.item()] for idx in top_indices[0]]
            top_confidences = top_probs[0].tolist()  # Update top_confidences

            # Only log when the predicted labels changed since the last clip
            if top_labels != previous_labels:
                previous_labels = top_labels
                print("Predicted class:", top_labels[0])  # Print the predicted class for debugging

            # Clear the frame buffer and continue from the next frame
            frame_buffer.clear()

        # Display the predicted labels and confidence scores on the frame
        for i, (label, confidence) in enumerate(zip(previous_labels, top_confidences)):
            label_text = f"{label}: {confidence:.2f}"
            cv2.putText(frame, label_text, (10, 30 + i * 30), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)

        # Display the resulting frame
        cv2.imshow('Video', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release everything when done, even if inference raised
    cap.release()
    cv2.destroyAllWindows()
```
## Simple usage
Usage:
```python
import av
import torch
import numpy as np
from transformers import AutoImageProcessor, VideoMAEForVideoClassification
from huggingface_hub import hf_hub_download
np.random.seed(0)
def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode; the first and last
            entries bound the range that is scanned.
    Returns:
        result (np.ndarray): decoded frames stacked along a new first axis,
            each converted with `to_ndarray(format="rgb24")`.
    '''
    selected = []
    # Rewind so frame counting starts at the beginning of the stream.
    container.seek(0)
    first, last = indices[0], indices[-1]
    for position, frame in enumerate(container.decode(video=0)):
        # Nothing past the last requested index is needed.
        if position > last:
            break
        if position < first or position not in indices:
            continue
        selected.append(frame)
    return np.stack([picked.to_ndarray(format="rgb24") for picked in selected])
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.
    A window of `clip_len * frame_sample_rate` frames is placed at a random
    position (drawn from NumPy's global RNG) and `clip_len` evenly spaced
    indices are taken from it.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`np.ndarray` of int64): Sampled frame indices, in ascending order.
    Raises:
        ValueError: if `seg_len` is not larger than `clip_len * frame_sample_rate`,
            i.e. the video is too short for the requested window.
    '''
    converted_len = int(clip_len * frame_sample_rate)
    # np.random.randint(low, high) needs low < high; without this check a short
    # clip (or a frame count reported as 0) fails with the opaque "low >= high".
    if seg_len <= converted_len:
        raise ValueError(
            f"seg_len ({seg_len}) must be greater than clip_len * frame_sample_rate "
            f"({converted_len}); the video is too short for this sampling window"
        )
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes end_idx itself; clip so the last index stays inside the window.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices
# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
# use any other video just replace `file_path` with the video path

# Open the container in a `with` block so the file is closed as soon as the
# frames have been decoded into memory.
with av.open(file_path) as container:
    # sample 16 frames
    indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
    video = read_video_pyav(container, indices)

image_processor = AutoImageProcessor.from_pretrained("archit11/videomae-base-finetuned-ucfcrime-full")
model = VideoMAEForVideoClassification.from_pretrained("archit11/videomae-base-finetuned-ucfcrime-full")

# The processor takes a list of frames; `video` is an array stacked along axis 0.
inputs = image_processor(list(video), return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# model predicts one of the 13 ucf-crime classes
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])
```
# Inference Using
## Training and evaluation data
More information needed
## Training procedure
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 5e-05
- train_batch_size: 8
- eval_batch_size: 8
- seed: 42
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_ratio: 0.1
- training_steps: 700
### Training results
| Training Loss | Epoch | Step | Validation Loss | Accuracy |
|:-------------:|:-----:|:----:|:---------------:|:--------:|
| 2.5836 | 0.13 | 88 | 2.4944 | 0.2080 |
| 2.3212 | 1.13 | 176 | 2.5855 | 0.1773 |
| 2.2333 | 2.13 | 264 | 2.6270 | 0.1046 |
| 1.985 | 3.13 | 352 | 2.4058 | 0.2109 |
| 2.194 | 4.13 | 440 | 2.3654 | 0.2235 |
| 1.9796 | 5.13 | 528 | 2.2609 | 0.2235 |
| 1.8786 | 6.13 | 616 | 2.2725 | 0.2341 |
| 1.71 | 7.12 | 700 | 2.2228 | 0.2226 |
### Framework versions
- Transformers 4.38.1
- Pytorch 2.1.2
- Datasets 2.1.0
- Tokenizers 0.15.2 |