"""Streamlit app that converts typed or uploaded text to speech with SpeechT5."""

import time
from datetime import datetime
from io import StringIO

import numpy as np
import soundfile as sf
import streamlit as st
import torch
from transformers import (
    SpeechT5ForSpeechToSpeech,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)


# Improved Styling
def local_css(file_name):
    """Inject the contents of a CSS file into the page.

    Args:
        file_name: Path of the CSS file to read.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_name) as f:
        # NOTE(review): the <style> wrapper was reconstructed; the original
        # f-string was empty, so nothing was ever injected. Confirm intent.
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


local_css("style.css")  # Assuming a CSS file named 'style.css' in the same directory

# Streamlined Layout
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")


# Load models outside of function calls for efficiency.
# st.cache_resource is used because st.cache_data does not accept
# `allow_output_mutation`; cache_resource hands back the cached object itself
# rather than a copy.
@st.cache_resource
def load_models():
    """Load the SpeechT5 TTS model, its processor and the HiFi-GAN vocoder.

    Returns:
        A ``(model, processor, vocoder)`` tuple.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder


model, processor, vocoder = load_models()


# Load speaker embeddings
@st.cache_resource
def get_speaker_embeddings():
    """Load the speaker embedding from disk as a tensor.

    Returns:
        The array stored in ``cmu_us_slt_arctic-wav-arctic_a0508.npy`` as a
        ``torch.Tensor`` with an extra leading dimension added.
    """
    speaker_embeddings = np.load("cmu_us_slt_arctic-wav-arctic_a0508.npy")
    return torch.tensor(speaker_embeddings).unsqueeze(0)


speaker_embeddings = get_speaker_embeddings()

# Text Input
text = st.text_area("Type your text or upload a text file below.")


# Function to convert text to speech
def text_to_speech(text):
    """Synthesize ``text`` and write the audio to ``speech.wav``.

    Args:
        text: The text to convert.

    Returns:
        The path of the written WAV file (always ``"speech.wav"``).
    """
    inputs = processor(text=text, return_tensors="pt")
    # Inference only, so gradient tracking is disabled for both stages.
    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
        speech = vocoder(spectrogram)
    sf.write("speech.wav", speech.numpy(), samplerate=16000)
    return "speech.wav"


def _play_audio(audio_path):
    """Read the WAV file at ``audio_path`` and render it in an audio player."""
    # Use a context manager so the file handle is closed after reading.
    with open(audio_path, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')


# Convert Button
if st.button("Convert"):
    if text:
        _play_audio(text_to_speech(text))
    else:
        st.error("Please enter some text to convert.")

# File Uploader
uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
if uploaded_file is not None:
    text = uploaded_file.getvalue().decode("utf-8")
    _play_audio(text_to_speech(text))