Spaces:

adpai
/

microsoft-xclip-base-patch32

Runtime error

App Files Files Community

microsoft-xclip-base-patch32 / app.py

adpai

Update app.py

9303daf about 1 year ago

raw

history blame contribute delete

2.53 kB

	import av
	import torch
	import numpy as np
	from fastapi import FastAPI, UploadFile, File
	from transformers import AutoProcessor, AutoModel
	from huggingface_hub import hf_hub_download

	# ASGI application object; the route decorator further down registers
	# its endpoint on this instance.
	app = FastAPI()

	# Seed NumPy's global RNG once at import time so the sequence of random
	# clip windows drawn by sample_frame_indices is the same on every start.
	np.random.seed(0)

	def read_video_pyav(container, indices):
	    '''
	    Decode selected frames of a video with the PyAV decoder.

	    The container is rewound, its first video stream is decoded frame by
	    frame, and every frame whose running position appears in `indices`
	    is kept, in decode order.

	    Args:
	        container (`av.container.input.InputContainer`): PyAV container.
	        indices (`List[int]`): List of frame indices to decode. The first and
	            last entries bound the scan, so the sequence is expected to be
	            sorted in ascending order.
	    Returns:
	        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
	    Raises:
	        IndexError: If `indices` is empty.
	        ValueError: If none of the requested frames could be decoded.
	    '''
	    container.seek(0)
	    start_index = indices[0]
	    end_index = indices[-1]

	    # Hash-based membership instead of scanning `indices` once per decoded
	    # frame; positions outside [start_index, end_index] are never selected.
	    wanted = {idx for idx in indices if start_index <= idx <= end_index}

	    frames = []
	    for i, frame in enumerate(container.decode(video=0)):
	        if i in wanted:
	            frames.append(frame.to_ndarray(format="rgb24"))
	        # Stop right after the last requested position so that no frame
	        # beyond it has to be decoded.
	        if i >= end_index:
	            break

	    if not frames:
	        raise ValueError(
	            f"no frames could be decoded for indices in [{start_index}, {end_index}]"
	        )
	    return np.stack(frames)

	def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
	    '''
	    Sample a given number of frame indices from the video.

	    A window of `clip_len * frame_sample_rate` frames is placed at a random
	    position (drawn from NumPy's global RNG) and `clip_len` evenly spaced
	    indices are taken from it.

	    Args:
	        clip_len (`int`): Total number of frames to sample.
	        frame_sample_rate (`int`): Sample every n-th frame.
	        seg_len (`int`): Maximum allowed index of sample's last frame.
	    Returns:
	        indices (`List[int]`): List of sampled frame indices
	    Raises:
	        ValueError: If `seg_len` is not larger than
	            `clip_len * frame_sample_rate`, i.e. the video is too short for
	            the requested window.
	    '''
	    converted_len = int(clip_len * frame_sample_rate)
	    # np.random.randint(low, high) needs low < high; fail with a message that
	    # names the actual problem instead of NumPy's generic "low >= high".
	    if seg_len <= converted_len:
	        raise ValueError(
	            f"seg_len ({seg_len}) must be greater than clip_len * frame_sample_rate "
	            f"({converted_len}) to sample {clip_len} frames"
	        )
	    end_idx = np.random.randint(converted_len, seg_len)
	    start_idx = end_idx - converted_len
	    indices = np.linspace(start_idx, end_idx, num=clip_len)
	    # linspace includes end_idx itself; clip so the last index stays inside
	    # the window before truncating to integers.
	    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
	    return indices

	# Checkpoint identifier shared by the processor and the model so the two
	# can never drift apart.
	XCLIP_CHECKPOINT = "microsoft/xclip-base-patch32"

	# Both objects are created once at import time; the request handler below
	# reads them as module globals.
	processor = AutoProcessor.from_pretrained(XCLIP_CHECKPOINT)
	model = AutoModel.from_pretrained(XCLIP_CHECKPOINT)

	@app.post("/classify_video/")
	async def classify_video(file: UploadFile):
	    '''
	    Classify an uploaded video against a fixed set of text labels.

	    The upload is read into memory, 8 frames are sampled from its first
	    video stream, and the module-level processor/model pair scores the clip
	    against the three candidate captions.

	    Args:
	        file (`UploadFile`): The uploaded video file.
	    Returns:
	        dict: `{"classification_probabilities": [[p0, p1, p2]]}` - the softmax
	            over the candidate captions, in the order they are listed below.
	    '''
	    # Function-scope import keeps this block self-contained.
	    import io

	    file_bytes = await file.read()

	    # av.open expects a path or a file-like object, not raw bytes, so the
	    # payload is wrapped in BytesIO. The `with` block closes the container
	    # once the frames have been copied out as NumPy arrays.
	    with av.open(io.BytesIO(file_bytes)) as container:
	        indices = sample_frame_indices(
	            clip_len=8,
	            frame_sample_rate=1,
	            seg_len=container.streams.video[0].frames,
	        )
	        video = read_video_pyav(container, indices)

	    inputs = processor(
	        text=["playing sports", "eating spaghetti", "go shopping"],
	        videos=[video],  # Changed list(video) to [video] to avoid error
	        return_tensors="pt",
	        padding=True,
	    )

	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        outputs = model(**inputs)

	    logits_per_video = outputs.logits_per_video
	    probs = logits_per_video.softmax(dim=1)

	    return {"classification_probabilities": probs.tolist()}