vishred18 committed on
Commit
0d03c1b
1 Parent(s): 35f9ec2

Upload streamlit.py

Browse files
Files changed (1) hide show
  1. streamlit.py +109 -0
streamlit.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tensorflow as tf
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from tensorflow_tts.inference import TFAutoModel
6
+ from tensorflow_tts.inference import AutoConfig
7
+ from tensorflow_tts.inference import AutoProcessor
8
+
9
st.title("Text-to-Speech Synthesis")

# Sidebar: pick which text2mel + vocoder pairing to run.
model_selection = st.sidebar.selectbox("Select Model", [
    "Tacotron2 + MelGAN",
    "Tacotron2 + MelGAN-STFT",
    "Tacotron2 + MB-MelGAN",
    "FastSpeech + MB-MelGAN",
    "FastSpeech + MelGAN-STFT",
    "FastSpeech + MelGAN",
    "FastSpeech2 + MB-MelGAN",
    "FastSpeech2 + MelGAN-STFT",
    "FastSpeech2 + MelGAN",
])

input_text = st.text_area("Enter Text", value="Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.")


@st.cache_resource
def _load_models():
    """Load every pretrained model and the text processor exactly once.

    Streamlit re-executes this script from the top on each widget
    interaction. Caching the loader keeps the loaded objects alive across
    reruns instead of re-reading every checkpoint each time a widget changes.

    Returns:
        dict mapping the names used by the rest of the script
        ("tacotron2", "fastspeech", "fastspeech2", "melgan", "melgan_stft",
        "mb_melgan", "processor") to the loaded objects.
    """
    # MelGAN-STFT is the only model built from a local config + weights file;
    # both paths are resolved relative to the working directory.
    melgan_stft_config = AutoConfig.from_pretrained('TensorFlowTTS/examples/melgan_stft/conf/melgan_stft.v1.yaml')
    return {
        "tacotron2": TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en", name="tacotron2"),
        "fastspeech": TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech-ljspeech-en", name="fastspeech"),
        "fastspeech2": TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en", name="fastspeech2"),
        "melgan": TFAutoModel.from_pretrained("tensorspeech/tts-melgan-ljspeech-en", name="melgan"),
        "melgan_stft": TFAutoModel.from_pretrained(
            config=melgan_stft_config,
            pretrained_path="melgan.stft-2M.h5",
            name="melgan_stft",
        ),
        "mb_melgan": TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en", name="mb_melgan"),
        "processor": AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en"),
    }


# Load models and configurations (served from the cache after the first run).
_models = _load_models()
tacotron2 = _models["tacotron2"]
fastspeech = _models["fastspeech"]
fastspeech2 = _models["fastspeech2"]
melgan = _models["melgan"]
melgan_stft = _models["melgan_stft"]
mb_melgan = _models["mb_melgan"]
processor = _models["processor"]
39
+
40
def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name):
    """Synthesize speech: text -> mel spectrogram -> waveform.

    Args:
        input_text: Text to synthesize; tokenized with the module-level
            ``processor``.
        text2mel_model: Model whose ``inference`` method matches the call
            convention selected by ``text2mel_name``.
        vocoder_model: Callable applied to the mel output; its result is
            indexed as ``[0, :, 0]`` to obtain the waveform.
        text2mel_name: One of "TACOTRON", "FASTSPEECH", "FASTSPEECH2".
        vocoder_name: One of "MELGAN", "MELGAN-STFT", "MB-MELGAN".

    Returns:
        Tuple ``(mel_outputs, audio)``, both converted with ``.numpy()``.

    Raises:
        ValueError: If ``text2mel_name`` or ``vocoder_name`` is unsupported.
    """
    # Validate both names up front so a bad vocoder name fails before the
    # (slow) text2mel inference runs rather than after it.
    if text2mel_name not in ("TACOTRON", "FASTSPEECH", "FASTSPEECH2"):
        raise ValueError("Only TACOTRON, FASTSPEECH, FASTSPEECH2 are supported on text2mel_name")
    if vocoder_name not in ("MELGAN", "MELGAN-STFT", "MB-MELGAN"):
        raise ValueError("Only MELGAN, MELGAN-STFT and MB-MELGAN are supported on vocoder_name")

    input_ids = processor.text_to_sequence(input_text)

    # Inputs shared by all three text2mel call conventions.
    batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
    speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)

    if text2mel_name == "TACOTRON":
        # Tacotron takes positional (ids, input lengths, speaker ids).
        _, mel_outputs, _, _ = text2mel_model.inference(
            batched_ids,
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            speaker_ids,
        )
    elif text2mel_name == "FASTSPEECH":
        _, mel_outputs, _ = text2mel_model.inference(
            input_ids=batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
    else:  # "FASTSPEECH2" -- guaranteed by the validation above
        _, mel_outputs, _, _, _ = text2mel_model.inference(
            batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )

    # Every supported vocoder is invoked the same way; keep the first batch
    # item and first channel of its output.
    audio = vocoder_model(mel_outputs)[0, :, 0]

    return mel_outputs.numpy(), audio.numpy()
74
+
75
if st.button("Synthesize"):
    # Each sidebar choice maps to the arguments do_synthesis() expects:
    # (text2mel model, vocoder model, text2mel name, vocoder name).
    pipelines = {
        "Tacotron2 + MelGAN": (tacotron2, melgan, "TACOTRON", "MELGAN"),
        "Tacotron2 + MelGAN-STFT": (tacotron2, melgan_stft, "TACOTRON", "MELGAN-STFT"),
        "Tacotron2 + MB-MelGAN": (tacotron2, mb_melgan, "TACOTRON", "MB-MELGAN"),
        "FastSpeech + MB-MelGAN": (fastspeech, mb_melgan, "FASTSPEECH", "MB-MELGAN"),
        "FastSpeech + MelGAN-STFT": (fastspeech, melgan_stft, "FASTSPEECH", "MELGAN-STFT"),
        "FastSpeech + MelGAN": (fastspeech, melgan, "FASTSPEECH", "MELGAN"),
        "FastSpeech2 + MB-MelGAN": (fastspeech2, mb_melgan, "FASTSPEECH2", "MB-MELGAN"),
        "FastSpeech2 + MelGAN-STFT": (fastspeech2, melgan_stft, "FASTSPEECH2", "MELGAN-STFT"),
        "FastSpeech2 + MelGAN": (fastspeech2, melgan, "FASTSPEECH2", "MELGAN"),
    }

    # Perform synthesis based on selected model
    text2mel_model, vocoder_model, text2mel_name, vocoder_name = pipelines[model_selection]
    mel_outputs, audio = do_synthesis(
        input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name
    )

    # Visualize mel spectrogram (flattened to rows of 80 values for plotting)
    mels = np.reshape(mel_outputs, [-1, 80])
    fig = plt.figure(figsize=(10, 8))
    ax1 = fig.add_subplot(311)
    ax1.set_title('Predicted Mel-after-Spectrogram')
    im = ax1.imshow(np.rot90(mels), aspect='auto', interpolation='none')
    fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until closed; release it so
    # figures do not pile up across reruns.
    plt.close(fig)

    # Display the audio using the specified sample rate
    st.audio(audio, format="audio/wav", sample_rate=22050, start_time=0)

# NOTE: the former `if __name__ == '__main__': app()` guard was removed. No
# `app` function is defined in this file, so the call raised NameError at the
# end of every run; the script body above already is the whole application.