from decord import VideoReader, cpu
import torch
import numpy as np

from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification
import gradio as gr

# Fix the seed so the random frame sampling below is reproducible.
np.random.seed(0)


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    # Pick a random window of clip_len * frame_sample_rate consecutive frames,
    # then take clip_len evenly spaced indices from it. Assumes the video has
    # more than clip_len * frame_sample_rate frames.
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices
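
# Intuition (hypothetical numbers): with clip_len=16, frame_sample_rate=4 and
# seg_len=300, a random 64-frame window is chosen and 16 evenly spaced,
# sorted frame indices from inside that window are returned.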


def inference(file_path):
    videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))

    # Sample 16 frames at a stride of 4 and decode them to a numpy array
    # of shape (16, height, width, 3).
    videoreader.seek(0)
    indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(videoreader))
    video = videoreader.get_batch(indices).asnumpy()

    # Loading here keeps the demo self-contained; to avoid re-loading on
    # every click, these two lines could be moved to module level.
    feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
    model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")

    inputs = feature_extractor(list(video), return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # The class with the highest logit is the predicted Kinetics-400 label.
    predicted_label = logits.argmax(-1).item()
    return model.config.id2label[predicted_label]
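
# Quick smoke test outside the UI (hypothetical path; any short clip works):
#   print(inference("sample.mp4"))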


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            video = gr.Video()
            btn = gr.Button(value="Run")
        with gr.Column():
            label = gr.Textbox(label="Predicted Label")

    # Run inference on the uploaded video and show the predicted class.
    btn.click(inference, inputs=video, outputs=label)
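
# Optional Gradio niceties: demo.queue() before launch() queues concurrent
# requests while the model runs, and demo.launch(share=True) serves a
# temporary public URL.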

demo.launch()