import os
from io import BytesIO

import requests
import streamlit as st
from PIL import Image

# Hugging Face API token. The original code referenced an undefined `HF`
# variable; reading it from an environment variable is one reasonable way
# to supply it (this sourcing is an assumption, not from the original).
HF = os.environ.get("HF_TOKEN", "")

# Function to make API call to image-to-text model
def img2text(img):
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
    headers = {"Authorization": f"Bearer {HF}"}  # was missing the f-string prefix
    response = requests.post(API_URL, headers=headers, data=img)
    return response.json()

# Function to make API call to text-to-speech model
def tts(payload):
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HF}"}
    response = requests.post(API_URL, headers=headers, json=payload)
    with open("audio.flac", "wb") as file:
        file.write(response.content)

# Main function to run the Streamlit app
def main():
    st.title("Image Description to Audio")

    # Upload image
    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_image:
        st.image(uploaded_image, caption="Uploaded Image", use_column_width=True)
        image = Image.open(uploaded_image)

        # Button to generate audio
        if st.button("Generate Audio"):
            # st.spinner must be used as a context manager to show while work runs
            with st.spinner("Generating audio..."):
                # Convert image to text description; force RGB so PNGs with an
                # alpha channel can still be saved as JPEG
                img_bytes = BytesIO()
                image.convert("RGB").save(img_bytes, format="JPEG")
                img_text_response = img2text(img_bytes.getvalue())

                # Convert text description to audio
                tts_payload = {"text": img_text_response[0]["generated_text"]}
                tts(tts_payload)

                # Play the audio
                st.audio("audio.flac")
                st.success("Audio generated successfully!")

if __name__ == "__main__":
    main()
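
# --- Optional robustness sketch (not part of the original script) ---
# The hosted Inference API typically answers with HTTP 503 and an error JSON
# while a model is cold-starting. The helper below is a hedged sketch of how
# the two `requests.post` calls above could retry in that case; the name
# `post_with_retry` and the retry/wait parameters are assumptions for
# illustration only.
import time

def post_with_retry(url, headers, retries=3, wait=10, **kwargs):
    """POST to the Inference API, retrying while the model is still loading."""
    response = None
    for _ in range(retries):
        response = requests.post(url, headers=headers, **kwargs)
        if response.status_code != 503:  # 503 = model still loading on the hosted API
            return response
        time.sleep(wait)  # give the model time to finish cold-starting
    return response

# Example usage inside img2text (hypothetical drop-in replacement):
#   response = post_with_retry(API_URL, headers, data=img)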