from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from PIL import Image


def image(url, text):
    """Answer a natural-language question about the image at `url` using ViLT fine-tuned for VQA."""
    # prepare image + question
    img = Image.open(requests.get(url, stream=True).raw)

    processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
    model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

    # prepare inputs
    encoding = processor(img, text, return_tensors="pt")

    # forward pass
    outputs = model(**encoding)
    logits = outputs.logits
    idx = logits.argmax(-1).item()

    # print("question asked:", text)
    # print("image link:", url)
    # print("Predicted answer:", model.config.id2label[idx])
    return model.config.id2label[idx]
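

# Example usage (a minimal sketch, not part of the original script): the image URL
# and question below are illustrative assumptions; the function should return the
# model's top predicted answer string for that image/question pair.
if __name__ == "__main__":
    example_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    example_question = "How many cats are there?"
    answer = image(example_url, example_question)
    print("Predicted answer:", answer)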