import io

import av
import numpy as np
import torch
from fastapi import FastAPI, File, UploadFile
from transformers import AutoModel, AutoProcessor

app = FastAPI()
np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode the video with the PyAV decoder.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.

    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.

    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.

    Returns:
        indices (`List[int]`): List of sampled frame indices.
    '''
    # Assumes seg_len > clip_len * frame_sample_rate; videos shorter than that will raise here.
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# Load the X-CLIP processor and model once at startup so every request reuses them.
processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")
model.eval()


@app.post("/classify_video/")
async def classify_video(file: UploadFile = File(...)):
    file_bytes = await file.read()
    # av.open() expects a path or a file-like object, not raw bytes.
    container = av.open(io.BytesIO(file_bytes))
    indices = sample_frame_indices(
        clip_len=8,
        frame_sample_rate=1,
        seg_len=container.streams.video[0].frames,
    )
    video = read_video_pyav(container, indices)
    inputs = processor(
        text=["playing sports", "eating spaghetti", "go shopping"],
        videos=[video],  # the processor expects a list of videos, each an array of frames
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_video = outputs.logits_per_video  # video-text similarity scores
    probs = logits_per_video.softmax(dim=1)  # probabilities over the candidate labels
    return {"classification_probabilities": probs.tolist()}
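

# Example client call (a minimal sketch, kept as comments so it is not part of the
# service itself). It assumes the app is served locally, e.g. with `uvicorn main:app`
# on the default port 8000, and that "sample.mp4" is a placeholder path to a short
# video file on disk.
#
#   import requests
#
#   with open("sample.mp4", "rb") as f:
#       resp = requests.post(
#           "http://127.0.0.1:8000/classify_video/",
#           files={"file": ("sample.mp4", f, "video/mp4")},
#       )
#   print(resp.json())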