StevenLimcorn committed on
Commit
346f0e5
•
1 Parent(s): 6d5e27b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -29
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import numpy as np
2
  import soundfile as sf
3
  import yaml
@@ -6,40 +7,86 @@ import tensorflow as tf
6
 
7
  from tensorflow_tts.inference import TFAutoModel
8
  from tensorflow_tts.inference import AutoProcessor
 
9
  import gradio as gr
10
 
11
- # initialize fastspeech2 model.
12
- fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
 
 
13
 
14
- # initialize mb_melgan model
15
- mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en")
 
 
 
 
 
 
 
 
 
16
 
 
 
 
 
 
 
 
17
 
18
- # inference
19
- processor = AutoProcessor.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en")
20
 
21
- def inference(text):
22
- input_ids = processor.text_to_sequence(text)
23
- # fastspeech inference
24
-
25
- mel_before, mel_after, duration_outputs, _, _ = fastspeech2.inference(
26
- input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
27
- speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
28
- speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
29
- f0_ratios =tf.convert_to_tensor([1.0], dtype=tf.float32),
30
- energy_ratios =tf.convert_to_tensor([1.0], dtype=tf.float32),
31
- )
32
-
33
- # melgan inference
34
- audio_before = mb_melgan.inference(mel_before)[0, :, 0]
35
- audio_after = mb_melgan.inference(mel_after)[0, :, 0]
36
-
37
- # save to file
38
- sf.write('./audio_before.wav', audio_before, 22050, "PCM_16")
39
- sf.write('./audio_after.wav', audio_after, 22050, "PCM_16")
40
- return './audio_after.wav'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- inputs = gr.inputs.Textbox(lines=5, label="Input Text")
 
 
 
 
43
  outputs = gr.outputs.Audio(type="file", label="Output Audio")
44
 
45
 
@@ -48,8 +95,8 @@ description = "Gradio demo for TensorFlowTTS: Real-Time State-of-the-art Speech
48
  article = "<p style='text-align: center'><a href='https://tensorspeech.github.io/TensorFlowTTS/'>TensorFlowTTS: Real-Time State-of-the-art Speech Synthesis for Tensorflow 2</a> | <a href='https://github.com/TensorSpeech/TensorFlowTTS'>Github Repo</a></p>"
49
 
50
  examples = [
51
- ["TensorFlowTTS provides real-time state-of-the-art speech synthesis architectures such as Tacotron-2, Melgan, Multiband-Melgan, FastSpeech, FastSpeech2 based-on TensorFlow 2."],
52
- ["With Tensorflow 2, we can speed-up training/inference progress, optimizer further by using fake-quantize aware and pruning, make TTS models can be run faster than real-time and be able to deploy on mobile devices or embedded systems."]
53
  ]
54
 
55
  gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
 
1
+ from matplotlib.pyplot import text
2
  import numpy as np
3
  import soundfile as sf
4
  import yaml
 
7
 
8
  from tensorflow_tts.inference import TFAutoModel
9
  from tensorflow_tts.inference import AutoProcessor
10
+ from tensorflow_tts.inference import AutoConfig
11
  import gradio as gr
12
 
13
+ MODEL_NAMES = [
14
+ "Fastspeech2 + Melgan",
15
+ "Tacotron2 + Melgan",
16
+ ]
17
 
18
+ fastspeech = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech-ljspeech-en", name="fastspeech")
19
+ fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en", name="fastspeech2")
20
+ tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en", name="tacotron2")
21
+ melgan = TFAutoModel.from_pretrained("tensorspeech/tts-melgan-ljspeech-en", name="melgan")
22
+ mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en", name="mb_melgan")
23
+ melgan_stft_config = AutoConfig.from_pretrained('TensorFlowTTS/examples/melgan_stft/conf/melgan_stft.v1.yaml')
24
+ melgan_stft = TFAutoModel.from_pretrained(
25
+ config=melgan_stft_config,
26
+ pretrained_path="melgan.stft-2M.h5",
27
+ name="melgan_stft"
28
+ )
29
 
30
+ MODEL_DICT = {
31
+ "Fastspeech2" : fastspeech2,
32
+ "Tacotron2" : tacotron2,
33
+ "Melgan": melgan,
34
+ "MB-Melgan": mb_melgan,
35
+ "Melgan-STFT": melgan_stft
36
+ }
37
 
38
# Hub repositories that provide the text processor for each text2mel model.
# These are the same repository ids the corresponding models are loaded from.
_PROCESSOR_REPOS = {
    "Fastspeech": "tensorspeech/tts-fastspeech-ljspeech-en",
    "Fastspeech2": "tensorspeech/tts-fastspeech2-ljspeech-en",
    "Tacotron2": "tensorspeech/tts-tacotron2-ljspeech-en",
}


def inference(input, model_type=None):
    """Synthesize speech for a text with the selected text2mel + vocoder pair.

    Args:
        input: The text to synthesize. For backward compatibility it may also
            be a two-item sequence ``(text, model_type)`` when ``model_type``
            is omitted.
        model_type: A label of the form ``"<text2mel> + <vocoder>"`` (for
            example one of ``MODEL_NAMES``). Gradio passes one positional
            argument per input component, so this receives the Radio value.

    Returns:
        The path of the WAV file that was written (``'./audio_after.wav'``).

    Raises:
        ValueError: If ``model_type`` is malformed, or names a text2mel model
            or vocoder that is not supported.
    """
    if model_type is None:
        # Legacy calling convention: a single (text, model_type) pair.
        input_text, model_type = input[0], input[1]
    else:
        input_text = input

    try:
        text2mel_name, vocoder_name = model_type.split(" + ")
    except (AttributeError, ValueError) as err:
        raise ValueError(
            f"model_type must look like '<text2mel> + <vocoder>', got {model_type!r}"
        ) from err

    text2mel_model = MODEL_DICT.get(text2mel_name)
    processor_repo = _PROCESSOR_REPOS.get(text2mel_name)
    if text2mel_model is None or processor_repo is None:
        raise ValueError("Only TACOTRON, FASTSPEECH, FASTSPEECH2 are supported on text2mel_name")

    vocoder_model = MODEL_DICT.get(vocoder_name)
    if vocoder_model is None or vocoder_name not in ("Melgan", "Melgan-STFT", "MB-Melgan"):
        raise ValueError("Only MELGAN, MELGAN-STFT and MB_MELGAN are supported on vocoder_name")

    # The processor must be loaded from a repository id, not from the
    # display label shown in the UI.
    processor = AutoProcessor.from_pretrained(processor_repo)
    input_ids = processor.text_to_sequence(input_text)
    batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
    speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)

    # text2mel part
    if text2mel_name == "Tacotron2":
        _, mel_outputs, _, _ = text2mel_model.inference(
            batched_ids,
            tf.convert_to_tensor([len(input_ids)], tf.int32),
            speaker_ids,
        )
    elif text2mel_name == "Fastspeech":
        _, mel_outputs, _ = text2mel_model.inference(
            input_ids=batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )
    else:  # "Fastspeech2" (the only remaining key of _PROCESSOR_REPOS)
        _, mel_outputs, _, _, _ = text2mel_model.inference(
            batched_ids,
            speaker_ids=speaker_ids,
            speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
            energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        )

    # vocoder part: use the models' inference entry point, as the earlier
    # revision of this demo did, and keep the first item of the batch.
    audio = vocoder_model.inference(mel_outputs)[0, :, 0]

    sf.write('./audio_after.wav', audio, 22050, "PCM_16")
    return './audio_after.wav'
84
 
85
+ inputs = [
86
+ gr.inputs.Textbox(lines=5, label="Input Text"),
87
+ gr.inputs.Radio(label="Pick a TTS Model",choices=MODEL_NAMES,)
88
+ ]
89
+
90
  outputs = gr.outputs.Audio(type="file", label="Output Audio")
91
 
92
 
 
95
  article = "<p style='text-align: center'><a href='https://tensorspeech.github.io/TensorFlowTTS/'>TensorFlowTTS: Real-Time State-of-the-art Speech Synthesis for Tensorflow 2</a> | <a href='https://github.com/TensorSpeech/TensorFlowTTS'>Github Repo</a></p>"
96
 
97
  examples = [
98
+ ["TensorFlowTTS provides real-time state-of-the-art speech synthesis architectures such as Tacotron-2, Melgan, Multiband-Melgan, FastSpeech, FastSpeech2 based-on TensorFlow 2."],
99
+ ["With Tensorflow 2, we can speed-up training/inference progress, optimizer further by using fake-quantize aware and pruning, make TTS models can be run faster than real-time and be able to deploy on mobile devices or embedded systems."]
100
  ]
101
 
102
  gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()