# demo-img2tts / app.py
# NOTE(review): lines below were Hugging Face web-UI scrape residue
# ("raw / history blame / No virus / 1.77 kB") and are not part of the
# program; kept as comments so the file parses as valid Python.
# di-mitris — Update app.py — dfe2080 verified
import os
from io import BytesIO

import requests
import streamlit as st
from PIL import Image
from playsound import playsound
# Function to make API call to image-to-text model
def img2text(img):
    """Caption an image via the BLIP model on the HF Inference API.

    Parameters:
        img: raw encoded image bytes (e.g. JPEG) posted as the request body.

    Returns:
        Parsed JSON response. On success this is a list like
        [{"generated_text": "..."}]; on failure the API returns a dict
        with an "error" key — callers should check before indexing.
    """
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
    # Bug fix: the original header was the plain string "Bearer {HF}"
    # (missing f-prefix), so the literal text "{HF}" was sent as the token.
    # HF was also never defined in this file; read the secret from the
    # environment instead (the conventional Space-secret name is "HF").
    token = os.environ.get("HF", "")
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.post(API_URL, headers=headers, data=img)
    return response.json()
# Function to make API call to text-to-speech model
def tts(payload):
    """Synthesize speech for *payload* via the HF Inference API.

    Parameters:
        payload: dict like {"text": "..."} sent as the JSON request body.

    Side effects:
        Writes the returned audio bytes to 'audio.flac' in the current
        working directory.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        Previously the JSON error body was silently written into
        audio.flac, producing an unplayable file.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    # Bug fix: HF was referenced here but never defined anywhere in this
    # file (NameError at call time); read the token from the environment.
    token = os.environ.get("HF", "")
    headers = {"Authorization": f"Bearer {token}"}
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()
    with open('audio.flac', 'wb') as file:
        file.write(response.content)
# Main function to run the Streamlit app
def main():
    """Streamlit UI: upload an image, caption it, then speak the caption."""
    st.title("Image Description to Audio")

    # Upload image
    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

    if uploaded_image:
        st.image(uploaded_image, caption="Uploaded Image", use_column_width=True)
        image = Image.open(uploaded_image)

        # Button to generate audio
        if st.button("Generate Audio"):
            # Bug fix: st.spinner(...) was called as a bare statement, so
            # the spinner never displayed — it must be used as a context
            # manager wrapping the slow work.
            with st.spinner("Generating audio..."):
                # Convert image to JPEG bytes for the captioning API.
                img_bytes = BytesIO()
                # Bug fix: PNG uploads may carry an alpha channel (RGBA),
                # which JPEG cannot encode; convert to RGB first.
                image.convert("RGB").save(img_bytes, format="JPEG")
                img_text_response = img2text(img_bytes.getvalue())

                # The API returns a list of captions on success, or a dict
                # with an "error" key (e.g. while the model loads) on
                # failure — guard before indexing [0]['generated_text'].
                if isinstance(img_text_response, list) and img_text_response:
                    # Convert text description to audio
                    tts({"text": img_text_response[0]['generated_text']})
                    # Play the audio
                    st.audio('audio.flac')
                    st.success("Audio generated successfully!")
                else:
                    st.error(f"Captioning failed: {img_text_response}")

if __name__ == "__main__":
    main()