import streamlit as st
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image


# Load the processor and model once and cache them so they are not
# re-initialized on every Streamlit rerun
@st.cache_resource
def load_model():
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
    return processor, model


processor, model = load_model()

# Initialize session state to store chat history
if "history" not in st.session_state:
    st.session_state.history = []

st.title("Conversational Image Recognition Chatbot")

# Upload image
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Convert to RGB so PNG uploads with an alpha channel are handled
    # consistently, then display the uploaded image
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Store the uploaded image in session state
    st.session_state.image = image

# Chat interface
user_input = st.text_input("You: ", key="input")

if st.button("Send"):
    if "image" not in st.session_state:
        # Guard against asking a question before any image is uploaded
        st.warning("Please upload an image before asking a question.")
    elif user_input:
        # Process the image and question into model inputs
        inputs = processor(st.session_state.image, user_input, return_tensors="pt")
        output = model.generate(**inputs)
        answer = processor.decode(output[0], skip_special_tokens=True)

        # Add the user question and model answer to the chat history
        st.session_state.history.append({"You": user_input, "chatbot": answer})

# Display the chat history
for chat in st.session_state.history:
    st.write(f"**You:** {chat['You']}")
    st.write(f"**chatbot:** {chat['chatbot']}")
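
# To try this locally (assuming the script is saved as app.py):
#   streamlit run app.py
# The first run downloads the BLIP VQA weights from the Hugging Face Hub;
# subsequent reruns reuse the cached processor and model.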