import gradio as gr
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering
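
# Expected dependencies for this app (a sketch; exact version pins may differ):
#   pip install torch transformers gradio pillow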

# Initialize the model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
# Load the model weights in float16 so they match the half-precision
# inputs prepared in predict_answer below (a float32 model would raise
# a dtype-mismatch error on the fp16 pixel values)
model = BlipForQuestionAnswering.from_pretrained("ManishThota/InstructBlip-VQA").to("cuda", torch.float16)
# model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

def predict_answer(image, question):
    # Convert the PIL image to RGB if it is not already
    image = image.convert("RGB")
    # Prepare inputs: move tensors to the GPU and cast the image
    # features to float16 to match the model weights
    encoding = processor(image, question, return_tensors="pt").to("cuda", torch.float16)
    out = model.generate(**encoding)
    generated_text = processor.decode(out[0], skip_special_tokens=True)
    return generated_text

def gradio_predict(image, question):
    answer = predict_answer(image, question)
    return answer
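
# Optional smoke test for the prediction pipeline (a sketch, kept
# commented out so the Space does not run inference at startup; the
# solid-color test image means no local file is needed):
# test_image = Image.new("RGB", (224, 224), color="red")
# print(predict_answer(test_image, "What color is the image?"))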

# Define the Gradio interface
iface = gr.Interface(
    fn=gradio_predict,
    inputs=[
        gr.Image(type="pil", label="Upload or Drag an Image"),
        gr.Textbox(label="Question", placeholder="e.g. What is this?", scale=4),
    ],
    outputs=gr.TextArea(label="Answer"),
    title="Instruct Visual Question Answering",
    description="Tiny 1B parameter Vision Language Model.",
)

# Launch the app with request queuing enabled
iface.queue().launch(debug=True)