# Hugging Face Space: Instruct Visual Question Answering (Space listed as "Paused").
import gradio as gr | |
from PIL import Image | |
import torch | |
from transformers import BlipProcessor, BlipForQuestionAnswering | |
# Initialize the model and processor once at module import time.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Run on the GPU when one is available, otherwise fall back to CPU so the
# app can still start on a CPU-only host (the original unconditional
# .to("cuda") raised at import time without a GPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the weights come from the fine-tuned
# "ManishThota/InstructBlip-VQA" checkpoint while the processor is the base
# "Salesforce/blip-vqa-base" one — presumably the fine-tune kept the base
# tokenizer/image config; confirm against the model card.
model = BlipForQuestionAnswering.from_pretrained("ManishThota/InstructBlip-VQA").to(device)
# Alternative (base) checkpoint, kept for reference:
# model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
def predict_answer(image, question):
    """Answer a natural-language question about an image with the BLIP VQA model.

    Args:
        image: PIL.Image to query; converted to RGB before preprocessing.
        question: Question string, e.g. "What is this?".

    Returns:
        The model's decoded answer string.
    """
    # Ensure a 3-channel RGB image; the BLIP processor is fed the PIL image
    # directly, so modes like RGBA or L must be normalized first.
    image = image.convert("RGB")
    # Move inputs to whatever device the model actually lives on instead of
    # a hard-coded "cuda:0", and keep the default dtype: the original cast
    # inputs to torch.float16 while the model weights were loaded in float32,
    # which makes generate() fail with a dtype mismatch.
    encoding = processor(image, question, return_tensors="pt").to(model.device)
    out = model.generate(**encoding)
    # Drop special tokens ([CLS]/[SEP]/padding) from the generated ids.
    return processor.decode(out[0], skip_special_tokens=True)
def gradio_predict(image, question):
    """Gradio callback: delegate directly to predict_answer."""
    return predict_answer(image, question)
# Build the Gradio UI: an image upload plus a free-text question box,
# with the model's answer shown in a text area.
image_input = gr.Image(type="pil", label="Upload or Drag an Image")
question_input = gr.Textbox(label="Question", placeholder="e.g. What is this?", scale=4)

iface = gr.Interface(
    fn=gradio_predict,
    inputs=[image_input, question_input],
    outputs=gr.TextArea(label="Answer"),
    title="Instruct Visual Question Answering",
    description="Tiny 1B parameter Vision Language Model.",
)
# Enable request queuing, then start the app with debug output enabled.
app = iface.queue()
app.launch(debug=True)