from dotenv import find_dotenv, load_dotenv
from transformers import pipeline
import requests
import os
import streamlit as st

# Load the Hugging Face API token from a local .env file.
load_dotenv(find_dotenv())
api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")


# img2text: caption an image with a local BLIP image-captioning pipeline.
def img2text(url):
    image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    text = image_to_text(url)[0]["generated_text"]
    return text


# text2speech: synthesize speech via the Hugging Face Inference API and
# save the result as audio.flac.
def text2speech(message):
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    # API_URL = "https://api-inference.huggingface.co/models/microsoft/speecht5_tts"
    headers = {"Authorization": f"Bearer {api_token}"}
    payload = {"inputs": message}
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()  # surface API errors instead of writing an invalid audio file
    with open("audio.flac", "wb") as file:
        file.write(response.content)


def main():
    st.title("Image to text to audio by 🤖")
    st.header("Turn an image into an audio podcast!!!")
    st.caption("Sample picture...")
    st.image("beachboat.jpg")

    uploaded_file = st.file_uploader(
        "Choose your image or simply drag the sample image given above", type="jpg"
    )
    if uploaded_file is not None:
        # Save the upload to disk so the captioning pipeline can read it by path.
        bytes_data = uploaded_file.getvalue()
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)

        st.image(uploaded_file, caption="Uploaded image.", use_column_width=True)
        scenario = img2text(uploaded_file.name)
        text2speech(scenario)

        with st.expander("Scenario"):
            st.write(scenario)
        st.audio("audio.flac")


if __name__ == "__main__":
    main()
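
# Usage (a minimal sketch; the filenames app.py and .env below are assumptions,
# not part of the original script):
#
#   pip install python-dotenv transformers requests streamlit torch
#   echo "HUGGINGFACEHUB_API_TOKEN=hf_..." > .env   # your Hugging Face token
#   streamlit run app.py
#
# The sample image beachboat.jpg must sit in the same directory for the
# preview on the landing page to render.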