"""Streamlit app that answers natural-language questions about an uploaded image.

The app uses the ViLT visual-question-answering checkpoint
``dandelin/vilt-b32-finetuned-vqa`` through Hugging Face ``transformers``.
"""

from io import BytesIO

import requests
import streamlit as st
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

# Set page layout to wide. This stays ahead of every other Streamlit call,
# because Streamlit requires the page config to be set first.
st.set_page_config(layout="wide")

MODEL_NAME = "dandelin/vilt-b32-finetuned-vqa"

# Streamlit re-executes this script on every widget interaction. Cache the
# heavy model objects when the installed Streamlit provides
# ``st.cache_resource``; otherwise fall back to an identity decorator so the
# script behaves exactly as before (load on each run).
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _load_vqa_components():
    """Load and return the ``(processor, model)`` pair for ``MODEL_NAME``."""
    vqa_processor = ViltProcessor.from_pretrained(MODEL_NAME)
    vqa_model = ViltForQuestionAnswering.from_pretrained(MODEL_NAME)
    return vqa_processor, vqa_model


processor, model = _load_vqa_components()


def _predict_answer(image, text):
    """Run the VQA model and return the predicted answer label.

    Args:
        image: Encoded image file contents as ``bytes`` (anything PIL can open).
        text: The question to ask about the image.

    Returns:
        The label in ``model.config.id2label`` with the highest logit.

    Raises:
        Exception: Whatever PIL or the model raises for undecodable images or
            inputs the processor/model rejects. Callers decide how to report it.
    """
    # Convert to RGB so greyscale, palette and alpha-channel images all reach
    # the processor in the same three-channel form.
    img = Image.open(BytesIO(image)).convert("RGB")

    # Prepare inputs
    encoding = processor(img, text, return_tensors="pt")

    # Forward pass
    outputs = model(**encoding)
    idx = outputs.logits.argmax(-1).item()
    return model.config.id2label[idx]


def get_answer(image, text):
    """Return the model's answer to ``text`` about ``image``.

    Kept for backward compatibility: on any failure this returns the exception
    message as a string instead of raising, so the caller cannot tell an
    answer from an error. Use ``_predict_answer`` when that distinction matters.

    Args:
        image: Encoded image file contents as ``bytes``.
        text: The question to ask about the image.

    Returns:
        The predicted answer label, or ``str(exception)`` if prediction failed.
    """
    try:
        return _predict_answer(image, text)
    except Exception as e:
        return str(e)


# Set up the Streamlit app
st.title("VisualSense QA")
st.write("Upload an image and enter a question to get an answer.")

# Create columns for image upload and input fields
col1, col2 = st.columns(2)

# Image upload
with col1:
    uploaded_file = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        st.image(uploaded_file, use_column_width=True)

# Question input
with col2:
    question = st.text_input("Question")

# Process the image and question when both are provided. An empty text input
# yields "" (not None), so test the stripped text for truthiness.
if uploaded_file is not None and question.strip():
    if st.button("Ask Question"):
        # Pass the raw upload bytes straight through: re-encoding as JPEG is
        # lossy and fails outright for images with an alpha channel.
        image_bytes = uploaded_file.getvalue()

        try:
            answer = _predict_answer(image_bytes, question)
        except Exception as exc:
            # UI boundary: report the failure as an error, not as an answer.
            st.error(f"Could not answer the question: {exc}")
        else:
            # Display the answer
            st.success("Answer: " + answer)