Multi image inference
#7
by
2U1
- opened
Does the model accept multiple images as input?
Yes, it does! Here's an example from the docs:
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
# Checkpoint id; the processor and the model are both loaded from it.
model_id = "CohereForAI/aya-vision-8b"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    device_map="cuda:0",
    torch_dtype=torch.float16,
)

# Example with multiple images in a single message: each image is its own
# "image" entry in the user turn, and the text prompt comes last.
image_urls = (
    "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
)
question = "These images depict two different landmarks. Can you identify them?"

user_content = [{"type": "image", "url": url} for url in image_urls]
user_content.append({"type": "text", "text": question})
messages = [{"role": "user", "content": user_content}]

# Render the chat template straight to tensors and move them to the
# device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    padding=True,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

gen_tokens = model.generate(
    **inputs,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.3,
)

# Skip the first `prompt_length` positions of the output so that only the
# tokens after the prompt are decoded.
prompt_length = inputs.input_ids.shape[1]
new_tokens = gen_tokens[0][prompt_length:]
gen_text = processor.tokenizer.decode(new_tokens, skip_special_tokens=True)
print(gen_text)