# This script creates a simple web application that uses Gradio to perform
# visual question answering (VQA) with the BLIP model from Hugging Face's Transformers library.

# Import necessary libraries
import gradio as gr
import numpy as np
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

# Load the BLIP processor and model
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
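
# Optional (not part of the original script): run inference on a GPU when one
# is available. The model inputs created below would then also need to be
# moved to the same device with `.to(device)` before calling generate().
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)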

# Define the function for visual question answering
def VQA(input_image: np.ndarray, question: str) -> str:
    # Convert the numpy array to a PIL Image and convert it to RGB
    raw_image = Image.fromarray(input_image).convert('RGB')
    # Prepare the image and question as inputs for the model
    inputs = processor(raw_image, question, return_tensors="pt")
    # Generate the answer using the model
    outputs = model.generate(**inputs, max_length=100)
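    # Note (assumption, current transformers API): `max_new_tokens` bounds only
    # the generated answer, independent of the input length, and can be used
    # instead of `max_length`, e.g.:
    # outputs = model.generate(**inputs, max_new_tokens=50)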
    # Decode the generated tokens to text and store the result in `answer`
    answer = processor.decode(outputs[0], skip_special_tokens=True)
    return answer
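
# Quick sanity check outside the UI (illustrative; assumes an image file named
# `example.jpg` exists next to this script):
# img = np.asarray(Image.open("example.jpg"))
# print(VQA(img, "What is shown in the picture?"))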

# Create the Gradio interface
iface = gr.Interface(
    fn=VQA,
    inputs=[
        gr.Image(label="Input image"),
        gr.Textbox(label="Question", placeholder="Type your question here...")
    ],
    outputs="text",
    title="Visual Question Answering",
    description="A simple web app for visual question answering using the BLIP model from Salesforce. Upload an image and ask a question about it."
)
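
# Optional (assumption, standard Gradio feature): seed the UI with sample
# inputs by passing e.g. `examples=[["sample.jpg", "What color is the car?"]]`
# to gr.Interface above; the file path here is illustrative.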

# Launch the Gradio app
iface.launch()
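
# When running in a hosted notebook, Gradio can expose a temporary public URL
# instead (a built-in Gradio option, not part of the original script):
# iface.launch(share=True)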