"""Gradio demo: visual question answering with a BLIP checkpoint."""

import gradio as gr
import torch
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor

# Pick the device once so model and inputs always live in the same place and
# the app still starts on a host without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the model and processor.
# NOTE: "Salesforce/blip-vqa-base" can be passed to BlipForQuestionAnswering
# instead to use the stock checkpoint the processor comes from.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained(
    "ManishThota/InstructBlip-VQA"
).to(DEVICE)


def predict_answer(image: Image.Image, question: str) -> str:
    """Return the model's generated answer to *question* about *image*.

    Args:
        image: PIL image to query; it is converted to RGB before use.
        question: Free-form question about the image.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    rgb_image = image.convert("RGB")

    # Cast the inputs to the model's own dtype (instead of a hard-coded
    # float16) so inputs and weights can never disagree on precision.
    encoding = processor(rgb_image, question, return_tensors="pt").to(
        DEVICE, model.dtype
    )

    generated_ids = model.generate(**encoding)
    return processor.decode(generated_ids[0], skip_special_tokens=True)


def gradio_predict(image: Image.Image, question: str) -> str:
    """Gradio callback: validate the UI inputs and answer the question.

    Args:
        image: Image supplied by the ``gr.Image`` component, or ``None``
            when the user submitted without providing one.
        question: Text from the question textbox.

    Returns:
        The answer string shown in the output text area.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Without this guard the missing image surfaces as an opaque
        # AttributeError from ``None.convert``.
        raise gr.Error("Please upload an image before asking a question.")
    return predict_answer(image, question)


# Define the Gradio interface.
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
        gr.Image(type="pil", label="Upload or Drag an Image"),
        gr.Textbox(label="Question", placeholder="e.g. What is this?", scale=4),
    ],
    outputs=gr.TextArea(label="Answer"),
    title="Instruct Visual Question Answering",
    description="Tiny 1B parameter Vision Language Model.",
)

# Launch the app (kept at module level so running the script behaves as before).
iface.queue().launch(debug=True)