Spaces:
Sleeping
Sleeping
File size: 1,768 Bytes
14a204c dfe2080 14a204c dfe2080 14a204c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import streamlit as st
import requests
from PIL import Image
from io import BytesIO
from playsound import playsound
# Function to make API call to image-to-text model
def img2text(img):
    """Send raw image bytes to the BLIP captioning model and return parsed JSON.

    Args:
        img: Raw encoded image bytes (e.g. JPEG) posted as the request body.

    Returns:
        Parsed JSON response; on success a list like
        [{"generated_text": "..."}].

    Raises:
        requests.HTTPError: if the inference API returns an error status.
    """
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
    # BUG FIX: original used the plain string "Bearer {HF}", which sent the
    # literal text "{HF}" instead of the token value. Use an f-string, matching
    # the tts() function below. (HF is the API token, presumably defined
    # elsewhere in the app — e.g. st.secrets; confirm.)
    headers = {"Authorization": f"Bearer {HF}"}
    response = requests.post(API_URL, headers=headers, data=img)
    # Surface HTTP errors instead of trying to parse an error page as a caption.
    response.raise_for_status()
    return response.json()
# Function to make API call to text-to-speech model
def tts(payload):
    """Call the VITS text-to-speech model and save the audio to 'audio.flac'.

    Args:
        payload: JSON body for the inference API, e.g. {"text": "..."}.

    Side effects:
        Writes the returned FLAC bytes to 'audio.flac' in the working directory.

    Raises:
        requests.HTTPError: if the inference API returns an error status.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HF}"}
    response = requests.post(API_URL, headers=headers, json=payload)
    # BUG FIX: previously an API error body (JSON/HTML text) was written to
    # 'audio.flac' unconditionally, producing a corrupt "audio" file. Fail loudly
    # instead of saving garbage.
    response.raise_for_status()
    with open('audio.flac', 'wb') as file:
        file.write(response.content)
# Main function to run the Streamlit app
def main():
    """Streamlit app: caption an uploaded image, then speak the caption aloud."""
    st.title("Image Description to Audio")
    # Upload image
    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_image:
        st.image(uploaded_image, caption="Uploaded Image", use_column_width=True)
        image = Image.open(uploaded_image)
        # Button to generate audio
        if st.button("Generate Audio"):
            # BUG FIX: st.spinner is a context manager; the original called it
            # bare, so the spinner never appeared. Wrap the slow work in 'with'.
            with st.spinner("Generating audio..."):
                # BUG FIX: PNG uploads are often RGBA or palette mode, which
                # Pillow refuses to save as JPEG. Normalize to RGB first.
                if image.mode != "RGB":
                    image = image.convert("RGB")
                # Convert image to text description
                img_bytes = BytesIO()
                image.save(img_bytes, format="JPEG")
                img_text_response = img2text(img_bytes.getvalue())
                # Convert text description to audio
                tts_payload = {"text": img_text_response[0]['generated_text']}
                tts(tts_payload)
                # Play the audio
                st.audio('audio.flac')
            st.success("Audio generated successfully!")

if __name__ == "__main__":
    main()
|