File size: 5,418 Bytes
0d03c1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import streamlit as st
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import AutoProcessor

st.title("Text-to-Speech Synthesis")

# Every text2mel + vocoder pairing offered in the sidebar, in display order.
_MODEL_CHOICES = [
    "Tacotron2 + MelGAN",
    "Tacotron2 + MelGAN-STFT",
    "Tacotron2 + MB-MelGAN",
    "FastSpeech + MB-MelGAN",
    "FastSpeech + MelGAN-STFT",
    "FastSpeech + MelGAN",
    "FastSpeech2 + MB-MelGAN",
    "FastSpeech2 + MelGAN-STFT",
    "FastSpeech2 + MelGAN",
]

# Sentence pre-filled in the text area so the demo works without typing.
_DEFAULT_TEXT = (
    "Bill got in the habit of asking himself “Is that thought true?” "
    "And if he wasn’t absolutely certain it was, he just let it go."
)

# Sidebar: which model pairing to run.
model_selection = st.sidebar.selectbox("Select Model", _MODEL_CHOICES)

# Main area: the text to be synthesized.
input_text = st.text_area("Enter Text", value=_DEFAULT_TEXT)

# Load models and configurations
@st.cache_resource
def _load_models():
    """Load every pretrained model, the MelGAN-STFT config and the processor.

    Streamlit re-executes this script on each widget interaction; wrapping the
    loading in ``st.cache_resource`` keeps a single copy of the loaded objects
    instead of calling ``from_pretrained`` again on every rerun.

    Returns:
        dict: loaded objects keyed by the module-level name they are bound to
        below.
    """
    stft_config = AutoConfig.from_pretrained(
        'TensorFlowTTS/examples/melgan_stft/conf/melgan_stft.v1.yaml'
    )
    return {
        "tacotron2": TFAutoModel.from_pretrained(
            "tensorspeech/tts-tacotron2-ljspeech-en", name="tacotron2"
        ),
        "fastspeech": TFAutoModel.from_pretrained(
            "tensorspeech/tts-fastspeech-ljspeech-en", name="fastspeech"
        ),
        "fastspeech2": TFAutoModel.from_pretrained(
            "tensorspeech/tts-fastspeech2-ljspeech-en", name="fastspeech2"
        ),
        "melgan": TFAutoModel.from_pretrained(
            "tensorspeech/tts-melgan-ljspeech-en", name="melgan"
        ),
        "melgan_stft_config": stft_config,
        # MelGAN-STFT is built from a local config plus a local weights file
        # rather than a hub identifier like the other models.
        "melgan_stft": TFAutoModel.from_pretrained(
            config=stft_config,
            pretrained_path="melgan.stft-2M.h5",
            name="melgan_stft",
        ),
        "mb_melgan": TFAutoModel.from_pretrained(
            "tensorspeech/tts-mb_melgan-ljspeech-en", name="mb_melgan"
        ),
        "processor": AutoProcessor.from_pretrained(
            "tensorspeech/tts-tacotron2-ljspeech-en"
        ),
    }


# Re-bind the cached objects to the module-level names used by the rest of
# the script.
_loaded = _load_models()
tacotron2 = _loaded["tacotron2"]
fastspeech = _loaded["fastspeech"]
fastspeech2 = _loaded["fastspeech2"]
melgan = _loaded["melgan"]
melgan_stft_config = _loaded["melgan_stft_config"]
melgan_stft = _loaded["melgan_stft"]
mb_melgan = _loaded["mb_melgan"]
processor = _loaded["processor"]

# Names accepted by do_synthesis for each stage of the pipeline.
_TEXT2MEL_NAMES = ("TACOTRON", "FASTSPEECH", "FASTSPEECH2")
_VOCODER_NAMES = ("MELGAN", "MELGAN-STFT", "MB-MELGAN")


def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name):
    """Synthesize a mel spectrogram and a waveform from text.

    The text is converted to an id sequence with the module-level
    ``processor``, passed through ``text2mel_model.inference`` to obtain mel
    outputs, and the mel outputs are then passed to ``vocoder_model``.

    Args:
        input_text: Text to synthesize.
        text2mel_model: Model whose ``inference`` method produces mel outputs.
        vocoder_model: Callable model that turns mel outputs into audio.
        text2mel_name: One of "TACOTRON", "FASTSPEECH", "FASTSPEECH2"; selects
            the ``inference`` call signature used for ``text2mel_model``.
        vocoder_name: One of "MELGAN", "MELGAN-STFT", "MB-MELGAN".

    Returns:
        tuple: ``(mel_outputs, audio)``, both converted with ``.numpy()``.

    Raises:
        ValueError: If ``text2mel_name`` or ``vocoder_name`` is not supported.
            Both names are checked before any model is run.
    """
    # Validate both names up front so a bad vocoder name does not cost a full
    # text2mel inference pass before the error surfaces.
    if text2mel_name not in _TEXT2MEL_NAMES:
        raise ValueError("Only TACOTRON, FASTSPEECH, FASTSPEECH2 are supported on text2mel_name")
    if vocoder_name not in _VOCODER_NAMES:
        raise ValueError("Only MELGAN, MELGAN-STFT and MB-MELGAN are supported on vocoder_name")

    input_ids = processor.text_to_sequence(input_text)
    # Add a leading batch axis: the models are fed a batch of one sequence.
    batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
    speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)

    if text2mel_name == "TACOTRON":
        _, mel_outputs, _, _ = text2mel_model.inference(
            batched_ids,
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            speaker_ids,
        )
    elif text2mel_name == "FASTSPEECH":
        _, mel_outputs, _ = text2mel_model.inference(
            input_ids=batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
    else:  # "FASTSPEECH2" (guaranteed by the validation above)
        _, mel_outputs, _, _, _ = text2mel_model.inference(
            batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )

    # All supported vocoders are invoked the same way. The [0, :, 0] index
    # keeps the first item on the first and last axes (presumably batch and
    # channel; confirm against the vocoder's output shape).
    # NOTE(review): upstream TensorFlowTTS examples call
    # `vocoder_model.inference(...)`; confirm that calling the model directly
    # is equivalent for every vocoder used here.
    audio = vocoder_model(mel_outputs)[0, :, 0]

    return mel_outputs.numpy(), audio.numpy()

if st.button("Synthesize"):
    # Each sidebar label has the form "<text2mel> + <vocoder>". Map each half
    # to the loaded model and the name do_synthesis expects for it.
    text2mel_options = {
        "Tacotron2": (tacotron2, "TACOTRON"),
        "FastSpeech": (fastspeech, "FASTSPEECH"),
        "FastSpeech2": (fastspeech2, "FASTSPEECH2"),
    }
    vocoder_options = {
        "MelGAN": (melgan, "MELGAN"),
        "MelGAN-STFT": (melgan_stft, "MELGAN-STFT"),
        "MB-MelGAN": (mb_melgan, "MB-MELGAN"),
    }

    # Perform synthesis based on selected model
    text2mel_label, vocoder_label = model_selection.split(" + ")
    text2mel_model, text2mel_name = text2mel_options[text2mel_label]
    vocoder_model, vocoder_name = vocoder_options[vocoder_label]
    mel_outputs, audio = do_synthesis(
        input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name
    )

    # Visualize mel spectrogram
    mels = np.reshape(mel_outputs, [-1, 80])
    fig = plt.figure(figsize=(10, 8))
    ax1 = fig.add_subplot(311)
    ax1.set_title('Predicted Mel-after-Spectrogram')
    im = ax1.imshow(np.rot90(mels), aspect='auto', interpolation='none')
    fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates until it is closed;
    # close it so figures do not pile up across reruns.
    plt.close(fig)

    # Display the audio using the specified sample rate
    st.audio(audio, format="audio/wav", sample_rate=22050, start_time=0)

# No explicit entry point is needed: Streamlit executes this script from top to
# bottom on every rerun. The `app()` call that used to be here raised NameError
# because no `app` is defined in this file.