"""Streamlit app: caption an uploaded image with BLIP, restyle the caption
with GPT-3.5 in a user-chosen personality, and read it aloud with gTTS."""

import os
import io
from io import BytesIO

import streamlit as st
from PIL import Image
from gtts import gTTS
from openai import OpenAI
from transformers import pipeline


@st.cache_resource
def _load_captioner():
    """Load the BLIP image-to-text pipeline once per server process.

    Without caching, Streamlit would re-instantiate the (large) model on
    every widget interaction, since the whole script reruns each time.
    """
    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")


# For explaining what is going on in the image
img_nar = _load_captioner()

# Requires OPENAI_API_KEY to be set in the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

st.header("Image Narrator")

uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

# Conversation history must persist across Streamlit reruns.
if 'history' not in st.session_state:
    st.session_state['history'] = []

personality = st.text_input("Enter a personality")

# Fallback caption used when no image has been uploaded yet.
image_narration = "No narration given"

# Check if an image has been uploaded
if uploaded_image is not None:
    # Decode the uploaded bytes into a PIL image for the captioning pipeline.
    bytes_data = uploaded_image.getvalue()
    pil_image = Image.open(io.BytesIO(bytes_data))

    # Now, use the PIL image with the pipeline
    image_narration = img_nar(pil_image)

    # use_container_width replaces the deprecated use_column_width parameter.
    st.image(pil_image, caption='Uploaded Image.', use_container_width=True)

    # The pipeline returns a list of dicts; keep only the caption text.
    image_narration = image_narration[0]["generated_text"]


def update_and_get_narration(personality, user_input):
    """Ask GPT-3.5 to re-narrate *user_input* in the given *personality*.

    Appends the user message and the assistant's reply to the session-state
    chat history so follow-up narrations keep context.

    Returns:
        The model's narration, or an instructional message when either
        *personality* or *user_input* is empty.
    """
    if personality and user_input:
        st.session_state['history'].append({"role": "user", "content": user_input})
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system",
                 "content": f"You reiterate what is said to you but narrate it like a {personality}."}
            ] + st.session_state['history'],
        )
        gpt_response = response.choices[0].message.content
        st.session_state['history'].append({"role": "assistant", "content": gpt_response})
        return gpt_response
    else:
        return "Please enter both a personality and some image classification text."
# Generate a fresh narration on demand; otherwise show the most recent one.
if st.button('Narrate'):
    spoken_text = update_and_get_narration(personality, image_narration)
    st.write(spoken_text)

    # Synthesize the narration to MP3 fully in memory, then embed a player.
    speech = gTTS(text=spoken_text, lang='en')
    mp3_buffer = BytesIO()
    speech.write_to_fp(mp3_buffer)
    mp3_buffer.seek(0)
    st.audio(mp3_buffer, format='audio/mp3', start_time=0)
else:
    if st.session_state['history']:
        st.write(st.session_state['history'][-1]['content'])
    else:
        st.write("Narration will appear here.")