File size: 1,434 Bytes
b546670
efaf417
b546670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efaf417
b546670
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Utils
import os
import timeit
import soundfile as sf

# Streamlit
import streamlit as st

# Custom elements
from elements.component import (
    centered_text,
)

def generate_voice(
    input_text,
    sample_rate=22050,
) -> None:
    """Synthesize speech for ``input_text``, play it, and show timing stats.

    The text is tokenized and vocalized by the object stored in
    ``st.session_state.TTS``. The resulting audio is written to a temporary
    WAV file under ``cache_sound/`` (named after
    ``st.session_state.random_str``), handed to ``st.audio``, and then
    deleted. Finally a summary of the elapsed times and the real-time factor
    is rendered with ``st.code``.

    Args:
        input_text: Text passed unchanged to ``TTS.tokenize``.
        sample_rate: Sampling rate in Hz used both to compute the audio
            duration and to write the WAV file. Defaults to 22050, the
            value that was previously hard-coded.

    Raises:
        ValueError: If ``sample_rate`` is not positive.
    """
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate!r}")

    tts = st.session_state.TTS

    # TTS inference; tokenization and synthesis are timed separately.
    start_time = timeit.default_timer()
    # The third value returned by tokenize() is not needed here.
    c, c_length, _ = tts.tokenize(input_text)
    tok_time = timeit.default_timer() - start_time

    start_time = timeit.default_timer()
    voice = tts.vocalize(c, c_length)
    tts_time = timeit.default_timer() - start_time

    # Time stats.
    # NOTE(review): the last axis of `voice` is treated as the sample axis and
    # voice[0, 0] as the waveform, presumably (batch, channel, samples);
    # confirm against the TTS implementation.
    total_infer_time = tts_time + tok_time
    audio_time = voice.shape[-1] / sample_rate
    # Guard both ratios so that empty audio (or a zero timer delta) yields
    # inf / 0.0 instead of raising ZeroDivisionError.
    rtf = total_infer_time / audio_time if audio_time > 0 else float("inf")
    rt_ratio = (
        audio_time / total_infer_time if total_infer_time > 0 else float("inf")
    )

    # The audio goes through a temporary WAV file because, per the original
    # author, Streamlit could not play the numpy array directly.
    os.makedirs("cache_sound", exist_ok=True)
    wav_path = f"cache_sound/{st.session_state.random_str}.wav"
    try:
        sf.write(wav_path, voice[0, 0], sample_rate)
        st.audio(wav_path, format="audio/wav")
    finally:
        # Always clean up, even if writing or playback raised; the file may
        # not exist if sf.write itself failed.
        try:
            os.remove(wav_path)
        except FileNotFoundError:
            pass
    st.caption("Generated Voice")

    # Fixed-point formatting instead of str(x)[:6]: slicing the repr drops
    # the exponent of values such as 1.2e-05 and shows a wrong number.
    stats = (
        f"💬 Output Audio: {audio_time:.4f} sec.\n\n"
        "⏳ Elapsed time for:\n"
        f"   => Tokenization: {tok_time:.4f} sec.\n"
        f"   => Model Inference: {tts_time:.4f} sec.\n\n"
        f"⏰ Real-time Factor (RTF): {rtf:.4f}\n\n"
        f"🏃 The model runs {rt_ratio:.4f} x faster than real-time"
    )
    st.code(stats, language="bash")
    st.caption("Elapsed Time Stats")