|
import os |
|
import gradio as gr |
|
import pytorchvideo |
|
import torch |
|
import torchvision |
|
import numpy as np |
|
import accelerate |
|
import evaluate |
|
from transformers import TrainingArguments, Trainer |
|
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification |
|
from torchvision.transforms import Compose |
|
from pytorchvideo.data.labeled_video_dataset import LabeledVideoDataset |
|
from pytorchvideo.transforms import ( |
|
ApplyTransformToKey, |
|
Normalize, |
|
RandomShortSideScale, |
|
RemoveKey, |
|
ShortSideScale, |
|
UniformTemporalSubsample, |
|
) |
|
|
|
from torchvision.transforms import ( |
|
Compose, |
|
Lambda, |
|
Resize, |
|
) |
|
|
|
def preprocess_video(video_path, image_processor, model_config):
    """Decode a video file and apply the model's preprocessing pipeline.

    The pipeline uniformly subsamples ``model_config.num_frames`` frames,
    scales pixel values by 1/255, normalizes with the processor's mean/std,
    and resizes to the processor's target size.

    Args:
        video_path: Path of the video file to decode.
        image_processor: Object exposing ``image_mean``, ``image_std`` and a
            ``size`` mapping holding either ``"shortest_edge"`` or both
            ``"height"`` and ``"width"``.
        model_config: Object exposing ``num_frames``.

    Returns:
        The transformed frame tensor. Presumably laid out as
        (channels, frames, height, width), as pytorchvideo clips are
        documented to be -- TODO confirm against the installed version.

    Raises:
        ValueError: If no video frames could be decoded from ``video_path``.
    """
    # Imported explicitly: a bare ``import pytorchvideo`` does not guarantee
    # that the ``data.encoded_video`` submodule is reachable as an attribute.
    from pytorchvideo.data.encoded_video import EncodedVideo

    size = image_processor.size
    if "shortest_edge" in size:
        height = width = size["shortest_edge"]
    else:
        height, width = size["height"], size["width"]

    transform = Compose(
        [
            UniformTemporalSubsample(model_config.num_frames),
            Lambda(lambda x: x / 255.0),
            Normalize(image_processor.image_mean, image_processor.image_std),
            Resize((height, width)),
        ]
    )

    # The transforms operate on a frame tensor, not on the EncodedVideo
    # wrapper, so the whole clip is decoded first and the decoder is released
    # as soon as the frames are in memory.
    video = EncodedVideo.from_path(video_path)
    try:
        clip = video.get_clip(start_sec=0, end_sec=float(video.duration))
    finally:
        video.close()

    frames = None if clip is None else clip.get("video")
    if frames is None:
        raise ValueError(f"No video frames could be decoded from {video_path!r}")

    return transform(frames)
|
|
|
|
|
def run_inference(model, video):
    """Run a forward pass of ``model`` on one preprocessed video.

    Args:
        model: Model object; it is moved with ``model.to(device)`` and called
            with a single ``pixel_values`` keyword argument. Its output must
            expose a ``logits`` attribute.
        video: 4-D tensor that has already been preprocessed. Axes 0 and 1
            are swapped and a leading batch axis is added before the call
            (presumably (C, T, H, W) -> (1, T, C, H, W) -- confirm against
            the preprocessing step).

    Returns:
        The ``logits`` attribute of the model output.
    """
    permuted_video = video.permute(1, 0, 2, 3)

    # No "labels" entry: the previous version read it from an undefined
    # global (``sample_test_video``), which raised NameError on every call,
    # and labels are not needed to obtain logits.
    inputs = {"pixel_values": permuted_video.unsqueeze(0)}

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    model = model.to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.logits
|
|
|
|
|
# Checkpoint identifier shared by the image processor and the classifier.
model_name = "latif98/videomae-base-finetuned-isl-numbers_aug"

# Both objects are loaded from the same checkpoint at import time.
image_processor = VideoMAEImageProcessor.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = VideoMAEForVideoClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
|
|
|
|
|
|
|
|
|
def video_identity(video):
    """Return ``video`` unchanged; used as the pass-through demo handler."""
    passthrough = video
    return passthrough
|
|
|
|
|
# Gradio interface: a video input routed through `video_identity` to a
# "playable_video" output.
demo = gr.Interface(
    fn=video_identity,
    inputs=gr.Video(),
    outputs="playable_video",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|