import streamlit as st
from transformers import ViltProcessor, ViltForQuestionAnswering, BlipForQuestionAnswering, AutoProcessor
from PIL import Image
# Define available models: (processor class, model class, checkpoint name)
models = {
    "ViLT": (ViltProcessor, ViltForQuestionAnswering, "dandelin/vilt-b32-finetuned-vqa"),
    "BLIP": (AutoProcessor, BlipForQuestionAnswering, "Salesforce/blip-vqa-base"),
}
def get_format_response(image, question, selected_model):
    # Load the processor and model for the selected entry
    processor_class, model_class, model_name = models[selected_model]
    processor = processor_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name)
    # Jointly encode the image and the question into model inputs
    encoding = processor(image, question, return_tensors="pt")
    if selected_model == "ViLT":
        # ViLT treats VQA as classification: pick the highest-scoring label
        outputs = model(**encoding)
        logits = outputs.logits
        idx = logits.argmax(-1).item()
        answer = model.config.id2label[idx]
    else:
        # BLIP generates the answer token by token
        outputs = model.generate(**encoding)
        answer = processor.decode(outputs[0], skip_special_tokens=True)
    return answer
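# Example use outside Streamlit (hypothetical image path and question):
#   img = Image.open("example.jpg").convert("RGB")
#   print(get_format_response(img, "What is in the picture?", "ViLT"))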
# Streamlit app
st.title("Simple VQA App")
st.subheader("A demo app showcasing two VQA models: ViLT and BLIP.")
# Sidebar for model selection
selected_model = st.sidebar.selectbox("Select Model", list(models.keys()))
# Image and question input
uploaded_image = st.file_uploader("Upload Image", type=["png", "jpg", "jpeg"])
question = st.text_input("Ask a Question about the Image")
# Process the image and question once both are provided
if uploaded_image and question:
    # Convert to RGB so palette/RGBA uploads don't break the processors
    image = Image.open(uploaded_image).convert("RGB")
    st.image(image, caption="Uploaded Image")
    answer = get_format_response(image, question, selected_model)
    # Display answer
    st.write(f"{selected_model} Answer: {answer}")
# Disclaimer
st.sidebar.markdown("This is a demo app showcasing VQA models. Actual performance may vary.")