# Hugging Face Space page residue (status at capture time: "Runtime error").
import io

import av
import numpy as np
import torch
from fastapi import FastAPI, UploadFile, File
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, AutoModel
# ASGI application object for this service.
app = FastAPI()

# Seed NumPy's global RNG once at import time so the random frame sampling
# below starts from the same state on every process start.
np.random.seed(0)
def read_video_pyav(container, indices):
    '''
    Decode the requested frames of a video with the PyAV decoder.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode. A repeated index
            yields the same frame once per occurrence, so the output keeps
            one entry per requested index.

    Returns:
        result (np.ndarray): Decoded RGB frames stacked along axis 0, in the
            order given by `indices`, of shape (num_frames, height, width, 3).
            Indices past the end of the stream are skipped.

    Raises:
        ValueError: If `indices` is empty or none of the requested frames
            could be decoded.
    '''
    wanted = [int(i) for i in indices]
    if not wanted:
        raise ValueError("indices must contain at least one frame index")

    # A set gives O(1) membership tests while walking the decoded stream.
    pending = set(wanted)
    last_index = max(wanted)

    decoded = {}
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i in pending:
            decoded[i] = frame.to_ndarray(format="rgb24")
        if i >= last_index:
            # Nothing beyond the largest requested index is needed, so stop
            # before decoding another frame.
            break

    # Re-expand to one entry per requested index so duplicates are preserved.
    frames = [decoded[i] for i in wanted if i in decoded]
    if not frames:
        raise ValueError("none of the requested frame indices could be decoded")
    return np.stack(frames)
def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.

    A window of `clip_len * frame_sample_rate` frames is placed at a random
    position inside the video and `clip_len` evenly spaced indices are taken
    from it. When the video has no more frames than the window, the window
    covers the whole video instead, so some indices may repeat.

    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Number of frames in the video; every returned index
            is strictly below this value.

    Returns:
        indices (`np.ndarray`): int64 array of `clip_len` sampled frame
            indices in non-decreasing order.

    Raises:
        ValueError: If `seg_len` is not positive.
    '''
    if seg_len <= 0:
        raise ValueError(f"seg_len must be positive, got {seg_len}")

    converted_len = int(clip_len * frame_sample_rate)
    if seg_len > converted_len:
        # Enough frames: pick a random window of converted_len frames.
        end_idx = np.random.randint(converted_len, seg_len)
        start_idx = end_idx - converted_len
    else:
        # np.random.randint(low, high) needs low < high, so short videos
        # cannot use the random window; span the whole video instead.
        start_idx, end_idx = 0, seg_len

    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # end_idx is exclusive, so clamp the last sample back into range.
    return np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
# Hub checkpoint shared by the processor and the model so the two always match.
MODEL_ID = "microsoft/xclip-base-patch32"

# Loaded once at import time; classify_video reads these module-level objects.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)
async def classify_video(file: UploadFile):
    '''
    Score an uploaded video against a fixed set of text labels.

    Args:
        file (`UploadFile`): Uploaded video file.

    Returns:
        dict: `{"classification_probabilities": [[p0, p1, p2]]}` where the
            inner list is the softmax of the model's `logits_per_video` over
            the labels "playing sports", "eating spaghetti" and "go shopping".
    '''
    # NOTE(review): no route decorator is visible on this handler in this
    # view of the file; confirm it is registered on `app` somewhere.
    file_bytes = await file.read()

    # av.open() takes a path or a file-like object, so wrap the raw bytes in
    # BytesIO; the context manager closes the container when decoding is done.
    with av.open(io.BytesIO(file_bytes)) as container:
        indices = sample_frame_indices(
            clip_len=8,
            frame_sample_rate=1,
            seg_len=container.streams.video[0].frames,
        )
        video = read_video_pyav(container, indices)

    inputs = processor(
        text=["playing sports", "eating spaghetti", "go shopping"],
        # Changed list(video) to [video] to avoid error.
        # NOTE(review): the upstream X-CLIP example passes list(video);
        # confirm which form the installed transformers version expects.
        videos=[video],
        return_tensors="pt",
        padding=True,
    )

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    probs = outputs.logits_per_video.softmax(dim=1)
    return {"classification_probabilities": probs.tolist()}