from functools import lru_cache

import requests
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
def image(url, text):
    """Download the image at *url* and answer the question *text* about it.

    Runs the ViLT VQA model ("dandelin/vilt-b32-finetuned-vqa") on the
    image/question pair. The processor/model pair is loaded once and cached
    across calls (loading is by far the most expensive step).

    Args:
        url: HTTP(S) URL of the image to query.
        text: Natural-language question about the image.

    Returns:
        The predicted answer string (the highest-logit label).

    Raises:
        requests.HTTPError: if the image download fails.
    """
    processor, model = _load_vqa()

    # Fail fast on HTTP errors instead of handing an error page to PIL;
    # a timeout prevents hanging forever on a dead host.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    # Force 3-channel RGB: grayscale or RGBA images would otherwise break
    # the processor's feature extraction.
    img = Image.open(response.raw).convert("RGB")

    # prepare inputs
    encoding = processor(img, text, return_tensors="pt")
    # forward pass — inference only, so skip building the autograd graph
    with torch.no_grad():
        outputs = model(**encoding)
    idx = outputs.logits.argmax(-1).item()
    answer = model.config.id2label[idx]

    print("question asked:", text)
    print("image link:", url)
    print("Predicted answer:", answer)
    return answer


@lru_cache(maxsize=1)
def _load_vqa():
    """Load and cache the ViLT VQA processor and model (one-time cost)."""
    processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
    model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
    return processor, model
# Example: image("https://example.com/cats.jpg", "How many cats are there?")