"""Gradio demo for visual question answering with ViLT fine-tuned on VQAv2."""

import gradio as gr
import torch
from PIL import Image
from transformers import ViltForQuestionAnswering, ViltProcessor

# Hugging Face Hub checkpoint shared by the processor and the model.
MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"

# Load the processor and model once at import time so every request reuses them.
processor = ViltProcessor.from_pretrained(MODEL_NAME)
model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME)


def answer_question(image, text: str) -> str:
    """Return the model's most likely answer to ``text`` about ``image``.

    Args:
        image: Image as a NumPy array, as delivered by the
            ``gr.Image(type="numpy")`` input below, or ``None`` when the
            user submitted without uploading anything.
        text: The natural-language question about the image.

    Returns:
        The answer label with the highest logit, looked up in
        ``model.config.id2label``.

    Raises:
        gr.Error: If no image was provided or the question is blank, so the
            UI shows a readable message instead of an opaque traceback.
    """
    if image is None:
        raise gr.Error("Please upload an image before submitting.")
    if text is None or not text.strip():
        raise gr.Error("Please type a question about the image.")

    # Let Pillow infer the mode from the array shape, then normalise to RGB so
    # grayscale (2-D) and RGBA (4-channel) arrays are handled as well.
    pil_image = Image.fromarray(image.astype("uint8")).convert("RGB")

    # Truncate the question to the model's text position limit; longer
    # questions would otherwise index past the position-embedding table.
    encoding = processor(
        images=pil_image,
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**encoding)

    idx = outputs.logits.argmax(-1).item()
    return model.config.id2label[idx]


# Define Gradio inputs and outputs
image = gr.Image(type="numpy", label="Upload Image")
question = gr.Textbox(lines=2, label="Question")
answer = gr.Textbox(label="Predicted Answer")

# Create the Gradio interface and start serving it.
demo = gr.Interface(
    fn=answer_question,
    inputs=[image, question],
    outputs=answer,
    title="Image Based Visual Question Answering",
    description=(
        "This is a demonstration of ViLT (Vision and Language Transformer) "
        "using Gradio, which has been fine-tuned on VQAv2 to answer questions "
        "based on images. To get a predicted answer, please provide an image "
        "and type in your question, then press the submit button."
    ),
)
demo.launch()