Spaces:
Runtime error
Runtime error
File size: 1,861 Bytes
5fb8331 422b42e 5fb8331 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# Importing the requirements
import torch
from transformers import AutoModel, AutoTokenizer
import spaces
from src.utils import encode_video
# Device the model is placed on
device = "cuda"

# Tokenizer for the MiniCPM-V 2.6 checkpoint (the repository ships custom
# code, hence trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    "openbmb/MiniCPM-V-2_6", trust_remote_code=True
)

# Model weights loaded in bfloat16 with SDPA attention, then moved to the
# target device in the same expression
model = AutoModel.from_pretrained(
    "openbmb/MiniCPM-V-2_6",
    trust_remote_code=True,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to(device=device)

# The model is only used for generation, so switch it to evaluation mode
model.eval()
@spaces.GPU(duration=50)
def describe_video(video, question):
    """
    Answers a question about a video with the loaded chat model.

    Args:
        - video (str): The path to the video file.
        - question (str): The question to be answered about the video.

    Returns:
        str: The generated answer, assembled from the pieces yielded by the model.
    """
    # Instruction passed to the model as the system prompt
    system_prompt = "You are an AI assistant specialized in visual content analysis. Given a video and a related question, analyze the video thoroughly and provide a precise and informative answer based on the visible content. Ensure your response is clear, accurate, and directly addresses the question."

    # Generation settings together with the video decode params
    chat_options = {
        "sampling": True,
        "stream": True,
        "top_p": 0.8,
        "top_k": 100,
        "temperature": 0.7,
        "repetition_penalty": 1.05,
        "max_new_tokens": 2048,
        "use_image_id": False,
        "max_slice_nums": 1,  # Use 1 if CUDA OOM and video resolution > 448*448
    }

    # Single user turn: the encoded video followed by the question
    # (encode_video comes from src.utils; presumably it returns a list of frames)
    user_turn = {"role": "user", "content": encode_video(video) + [question]}

    # Ask the model; the result is iterated and its pieces are concatenated
    pieces = model.chat(
        image=None,
        msgs=[user_turn],
        tokenizer=tokenizer,
        system_prompt=system_prompt,
        **chat_options,
    )
    return "".join(pieces)
|