# Utils
import os
import timeit
import soundfile as sf

# Streamlit
import streamlit as st

# Custom elements
from elements.component import (
    centered_text,
)


def generate_voice(input_text):
    """Synthesize `input_text` with the session's TTS model, play the audio, and report timing stats."""
    # TTS inference: tokenize the input text, then run the model to generate audio
    start_time = timeit.default_timer()
    c, c_length, phoneme = st.session_state.TTS.tokenize(input_text)
    tok_time = timeit.default_timer() - start_time

    start_time = timeit.default_timer()
    voice = st.session_state.TTS.vocalize(c, c_length)
    tts_time = timeit.default_timer() - start_time

    # Time stats (audio is generated at 22,050 Hz)
    total_infer_time = tok_time + tts_time
    audio_time = voice.shape[-1] / 22050
    rtf = total_infer_time / audio_time
    rt_ratio = 1 / rtf

    # Save audio to a temporary file (bug in Streamlit: it can't play the numpy array directly)
    wav_path = f"cache_sound/{st.session_state.random_str}.wav"
    sf.write(wav_path, voice[0, 0], 22050)

    # Play audio, then remove the temporary file
    st.audio(wav_path, format="audio/wav")
    os.remove(wav_path)
    st.caption("Generated Voice")

    st.code(
        f"💬 Output Audio: {audio_time:.4f} sec.\n\n"
        f"⏳ Elapsed time for:\n"
        f" => Tokenization: {tok_time:.4f} sec.\n"
        f" => Model Inference: {tts_time:.4f} sec.\n\n"
        f"⏰ Real-time Factor (RTF): {rtf:.4f}\n\n"
        f"🏃 The model runs {rt_ratio:.4f} x faster than real-time",
        language="bash",
    )
    st.caption("Elapsed Time Stats")
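
# Example usage (a minimal sketch, not part of the module above): a Streamlit
# page script could call generate_voice() once a loaded TTS model and a unique
# cache key have been placed in session state. The module name `utils`, the way
# st.session_state.TTS and st.session_state.random_str are initialized, and the
# widget labels are assumptions for illustration; such a page would be launched
# with `streamlit run app.py`.
#
#     import streamlit as st
#     from utils import generate_voice
#
#     input_text = st.text_area("Text to synthesize", "Hello, world!")
#     if st.button("Generate") and input_text:
#         generate_voice(input_text)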