File size: 741 Bytes
f098da2 64a6ef4 438833d f098da2 438833d 8d14cb2 64a6ef4 f098da2 438833d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
from io import BytesIO

import requests
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
_VILT_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"

# Checkpoint name -> (processor, model). Loading the checkpoint is by far the
# slowest step, so it is done once per process instead of once per question.
_vilt_cache = {}


def _load_vilt(checkpoint=_VILT_CHECKPOINT):
    """Return the ``(processor, model)`` pair for *checkpoint*, loading it once."""
    if checkpoint not in _vilt_cache:
        _vilt_cache[checkpoint] = (
            ViltProcessor.from_pretrained(checkpoint),
            ViltForQuestionAnswering.from_pretrained(checkpoint),
        )
    return _vilt_cache[checkpoint]


def image(url, text, timeout=30.0):
    """Answer a natural-language question about the image found at *url*.

    The image is downloaded, passed together with the question through the
    ``dandelin/vilt-b32-finetuned-vqa`` ViLT checkpoint, and the label with
    the highest logit is returned. The question, the URL and the predicted
    answer are also printed to stdout.

    Args:
        url: HTTP(S) address of the image to inspect.
        text: The question to ask about the image.
        timeout: Seconds to wait for the image server before giving up.

    Returns:
        The predicted answer, looked up in ``model.config.id2label``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
        PIL.UnidentifiedImageError: If the downloaded payload is not an image.
    """
    # Fail fast on a stalled server or an error page rather than handing
    # garbage bytes to PIL.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Decode from the fully-read body (already content-decoded by requests)
    # and normalise to 3-channel RGB so grayscale/palette/RGBA inputs are
    # handled the same way as ordinary photos.
    picture = Image.open(BytesIO(response.content)).convert("RGB")

    processor, model = _load_vilt()

    # prepare inputs
    encoding = processor(picture, text, return_tensors="pt")

    # forward pass; gradients are never needed for a prediction
    with torch.no_grad():
        outputs = model(**encoding)

    idx = outputs.logits.argmax(-1).item()
    answer = model.config.id2label[idx]

    print("question asked:", text)
    print("image link:", url)
    print("Predicted answer:", answer)
    return answer
# prepare image + question
|