import os
from functools import cache

import gradio as gr
from transformers import ViltForQuestionAnswering, ViltProcessor

# Single checkpoint ID used for both the processor and the model.
_MODEL_ID = "dandelin/vilt-b32-finetuned-vqa"


@cache
def _load_vilt():
    """Load the ViLT processor/model pair once and memoize it.

    Loading happens lazily on the first `vqa` call instead of on every
    request — `from_pretrained` is far too expensive to run per call.
    """
    processor = ViltProcessor.from_pretrained(_MODEL_ID)
    model = ViltForQuestionAnswering.from_pretrained(_MODEL_ID)
    return processor, model


def vqa(image, text):
    """Answer a natural-language question about an image with ViLT.

    Args:
        image: PIL image to query (Gradio supplies this via `type="pil"`).
        text: The question string.

    Returns:
        A string of the form "<question>: <predicted answer>".
    """
    processor, model = _load_vilt()
    encoding = processor(image, text, return_tensors="pt")
    outputs = model(**encoding)
    # Highest-scoring class index maps to the answer vocabulary.
    idx = outputs.logits.argmax(-1).item()
    return f"{text}: {model.config.id2label[idx]}"


# Example inputs shipped alongside this script.
dogs = os.path.join(os.path.dirname(__file__), "617.jpg")
text = "What are the dogs riding?"

# `vqa` takes two arguments, so the interface needs two input components;
# the original passed only the Image and slotted the question string into
# the `outputs` position, which is not a valid component spec.
demo = gr.Interface(
    vqa,
    [gr.Image(type="pil", value=dogs), gr.Textbox(value=text)],
    "text",
)

if __name__ == "__main__":
    demo.launch()