"""Gradio demo that answers free-form questions about an uploaded image.

The script loads the ``Salesforce/blip-vqa-base`` checkpoint once at import
time, exposes :func:`answer_question` as the Gradio callback, and launches
the web interface.
"""

import io

import gradio as gr
from PIL import Image
from transformers import AutoProcessor, BlipForQuestionAnswering

MODEL_NAME = "Salesforce/blip-vqa-base"

# Loaded once at module import so every request reuses the same weights.
model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME)
processor = AutoProcessor.from_pretrained(MODEL_NAME)


def _to_pil_image(image):
    """Return *image* as an RGB ``PIL.Image.Image``.

    Accepts a PIL image, a binary file-like object (anything with
    ``read()``), raw encoded bytes, or an array-like that
    ``Image.fromarray`` understands.
    """
    if isinstance(image, Image.Image):
        pil_image = image
    elif hasattr(image, "read"):
        # The original contract: a file-like object holding encoded image data.
        pil_image = Image.open(io.BytesIO(image.read()))
    elif isinstance(image, (bytes, bytearray)):
        pil_image = Image.open(io.BytesIO(image))
    else:
        # e.g. the NumPy array Gradio passes when the component type is "numpy".
        pil_image = Image.fromarray(image)
    # Normalise the mode so RGBA / palette / greyscale uploads behave alike.
    return pil_image.convert("RGB")


def answer_question(image, question):
    """Answer *question* about *image* using the BLIP VQA model.

    Args:
        image: The picture to inspect; see :func:`_to_pil_image` for the
            accepted representations.
        question: The natural-language question to ask about the image.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.

    Raises:
        gr.Error: If no image was supplied or the question is blank, so the
            UI shows a readable message instead of a traceback.
    """
    if image is None:
        raise gr.Error("Please upload an image.")
    if not question or not question.strip():
        raise gr.Error("Please enter a question.")

    pil_image = _to_pil_image(image)
    inputs = processor(pil_image, question, return_tensors="pt")
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)


# Create Gradio interface
# type="pil" makes Gradio hand the callback a PIL image; with the component's
# default type the callback would receive a NumPy array, which has no .read().
image_input = gr.Image(label="Upload Image", type="pil")
question_input = gr.Textbox(label="Ask a Question", lines=4)
output = gr.Textbox(label="Answer")

interface = gr.Interface(
    fn=answer_question,
    inputs=[image_input, question_input],
    outputs=output,
    title="Multimodal Question Answering",
    description="BlipForQuestionAnswering for Question Answering",
)

interface.launch()