import os
from io import BytesIO

import requests
import streamlit as st
from PIL import Image

# Hugging Face API token. The original code referenced an undefined `HF`
# variable; reading it from an environment variable is one reasonable way
# to supply it (this sourcing is an assumption, not from the original).
HF = os.environ.get("HF_TOKEN", "")

# Function to make API call to image-to-text model
def img2text(img):
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
    headers = {"Authorization": f"Bearer {HF}"}  # was missing the f-string prefix
    response = requests.post(API_URL, headers=headers, data=img)
    return response.json()

# Function to make API call to text-to-speech model
def tts(payload):
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HF}"}
    response = requests.post(API_URL, headers=headers, json=payload)
    with open("audio.flac", "wb") as file:
        file.write(response.content)

# Main function to run the Streamlit app
def main():
    st.title("Image Description to Audio")

    # Upload image
    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_image:
        st.image(uploaded_image, caption="Uploaded Image", use_column_width=True)
        image = Image.open(uploaded_image)

        # Button to generate audio
        if st.button("Generate Audio"):
            # st.spinner must be used as a context manager to show while work runs
            with st.spinner("Generating audio..."):
                # Convert image to text description; force RGB so PNGs with an
                # alpha channel can still be saved as JPEG
                img_bytes = BytesIO()
                image.convert("RGB").save(img_bytes, format="JPEG")
                img_text_response = img2text(img_bytes.getvalue())

                # Convert text description to audio
                tts_payload = {"text": img_text_response[0]["generated_text"]}
                tts(tts_payload)

                # Play the audio
                st.audio("audio.flac")
                st.success("Audio generated successfully!")

if __name__ == "__main__":
    main()
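
# --- Optional robustness sketch (not part of the original script) ---
# The hosted Inference API typically answers with HTTP 503 and an error JSON
# while a model is cold-starting. The helper below is a hedged sketch of how
# the two `requests.post` calls above could retry in that case; the name
# `post_with_retry` and the retry/wait parameters are assumptions for
# illustration only.
import time

def post_with_retry(url, headers, retries=3, wait=10, **kwargs):
    """POST to the Inference API, retrying while the model is still loading."""
    response = None
    for _ in range(retries):
        response = requests.post(url, headers=headers, **kwargs)
        if response.status_code != 503:  # 503 = model still loading on the hosted API
            return response
        time.sleep(wait)  # give the model time to finish cold-starting
    return response

# Example usage inside img2text (hypothetical drop-in replacement):
#   response = post_with_retry(API_URL, headers, data=img)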