# NOTE: lines removed here were a scraped Hugging Face Spaces status banner
# ("Spaces: Sleeping"), not part of the program.
from io import BytesIO

import streamlit as st
import requests
from PIL import Image
from playsound import playsound
# Function to make API call to image-to-text model
def img2text(img):
    """Send raw image bytes to the BLIP captioning model and return the JSON reply.

    Args:
        img: Raw encoded image bytes (e.g. a JPEG byte string).

    Returns:
        The decoded JSON response; on success a list like
        ``[{"generated_text": "..."}]``.

    Raises:
        requests.HTTPError: If the API responds with an error status code.
    """
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
    # BUG FIX: the original used a plain string "Bearer {HF}", which sent the
    # literal text "{HF}" as the token instead of its value.  Use an f-string,
    # matching the tts() function below.
    # NOTE(review): HF (the API token) must be defined elsewhere in the app
    # (e.g. st.secrets) — confirm.
    headers = {"Authorization": f"Bearer {HF}"}
    response = requests.post(API_URL, headers=headers, data=img)
    # Fail loudly on HTTP errors instead of silently returning an error payload
    # that the caller would then index as a caption list.
    response.raise_for_status()
    return response.json()
# Function to make API call to text-to-speech model
def tts(payload):
    """Synthesize speech for *payload* and write the audio to 'audio.flac'.

    Args:
        payload: JSON-serializable dict for the TTS model,
            e.g. ``{"text": "a caption"}``.

    Raises:
        requests.HTTPError: If the API responds with an error status code.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    # NOTE(review): HF (the API token) must be defined elsewhere — confirm.
    headers = {"Authorization": f"Bearer {HF}"}
    response = requests.post(API_URL, headers=headers, json=payload)
    # BUG FIX: without this check a failed request silently wrote the JSON
    # error body into audio.flac, producing an unplayable "audio" file.
    response.raise_for_status()
    with open('audio.flac', 'wb') as file:
        file.write(response.content)
# Main function to run the Streamlit app
def main():
    """Streamlit UI: upload an image, caption it, and play the caption as audio."""
    st.title("Image Description to Audio")
    # Upload image
    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_image:
        st.image(uploaded_image, caption="Uploaded Image", use_column_width=True)
        image = Image.open(uploaded_image)
        # Button to generate audio
        if st.button("Generate Audio"):
            # BUG FIX: st.spinner() is a context manager; calling it without
            # `with` displays nothing.  Wrap the slow work so the spinner
            # actually shows while the API calls run.
            with st.spinner("Generating audio..."):
                # Re-encode the upload as JPEG bytes for the captioning API.
                img_bytes = BytesIO()
                # BUG FIX: PNG uploads are often RGBA/P mode, which the JPEG
                # encoder rejects with OSError; normalize to RGB first.
                image.convert("RGB").save(img_bytes, format="JPEG")
                # Convert image to text description
                img_text_response = img2text(img_bytes.getvalue())
                # Convert text description to audio
                tts_payload = {"text": img_text_response[0]['generated_text']}
                tts(tts_payload)
            # Play the audio
            st.audio('audio.flac')
            st.success("Audio generated successfully!")
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()