|
|
|
import torch
|
|
from transformers import AutoModel, AutoTokenizer
|
|
import spaces
|
|
|
|
|
|
|
|
device = "cuda"
|
|
|
|
|
|
model = AutoModel.from_pretrained(
|
|
"openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True, torch_dtype=torch.float16
|
|
)
|
|
model = model.to(device=device)
|
|
tokenizer = AutoTokenizer.from_pretrained(
|
|
"openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True
|
|
)
|
|
model.eval()
|
|
|
|
|
|
@spaces.GPU(duration=120)
|
|
def answer_question(image, question):
|
|
"""
|
|
Generates an answer to a given question based on the provided image and question.
|
|
Args:
|
|
image (str): The path to the image file.
|
|
question (str): The question text.
|
|
Returns:
|
|
str: The generated answer to the question.
|
|
"""
|
|
|
|
|
|
msgs = [{"role": "user", "content": question}]
|
|
|
|
|
|
res = model.chat(
|
|
image=image,
|
|
msgs=msgs,
|
|
tokenizer=tokenizer,
|
|
sampling=True,
|
|
temperature=0.7,
|
|
stream=True,
|
|
system_prompt="You are an AI assistant specialized in visual content analysis. Given an image and a related question, analyze the image thoroughly and provide a precise and informative answer based on the visible content. Ensure your response is clear, accurate, and directly addresses the question.",
|
|
)
|
|
|
|
|
|
return "".join(res)
|
|
|