Pushpak77 and StevenLimcorn committed on
Commit c26e397 (0 parents)

Duplicate from StevenLimcorn/fastspeech2-TTS


Co-authored-by: Steven Limcorn <StevenLimcorn@users.noreply.huggingface.co>

Files changed (5)
  1. .gitattributes +27 -0
  2. README.md +13 -0
  3. app.py +95 -0
  4. packages.txt +1 -0
  5. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Fastspeech2 TTS
+ emoji: 🚀
+ colorFrom: pink
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 2.8.13
+ app_file: app.py
+ pinned: false
+ duplicated_from: StevenLimcorn/fastspeech2-TTS
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,90 @@
+ import soundfile as sf
+ import tensorflow as tf
+ import gradio as gr
+
+ from tensorflow_tts.inference import TFAutoModel
+ from tensorflow_tts.inference import AutoProcessor
+
+ # Model combinations offered in the UI: a text2mel model plus a vocoder.
+ MODEL_NAMES = [
+     "Fastspeech2 + Melgan",
+     "Tacotron2 + Melgan",
+     "Tacotron2 + MB-Melgan",
+     "Fastspeech2 + MB-Melgan",
+ ]
+
+ # Pre-load every text2mel model and vocoder from the Hugging Face Hub at startup.
+ fastspeech = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech-ljspeech-en", name="fastspeech")
+ fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en", name="fastspeech2")
+ tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en", name="tacotron2")
+ melgan = TFAutoModel.from_pretrained("tensorspeech/tts-melgan-ljspeech-en", name="melgan")
+ mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en", name="mb_melgan")
+
+ MODEL_DICT = {
+     "Fastspeech": fastspeech,
+     "Fastspeech2": fastspeech2,
+     "Tacotron2": tacotron2,
+     "Melgan": melgan,
+     "MB-Melgan": mb_melgan,
+ }
+
+ def inference(input_text, model_type):
+     # The radio choice looks like "Fastspeech2 + MB-Melgan".
+     text2mel_name, vocoder_name = model_type.split(" + ")
+     text2mel_model, vocoder_model = MODEL_DICT[text2mel_name], MODEL_DICT[vocoder_name]
+     processor = AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en")
+     input_ids = processor.text_to_sequence(input_text)
+
+     # Text -> mel spectrogram.
+     if text2mel_name == "Tacotron2":
+         _, mel_outputs, stop_token_prediction, alignment_history = text2mel_model.inference(
+             tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
+             tf.convert_to_tensor([len(input_ids)], tf.int32),
+             tf.convert_to_tensor([0], dtype=tf.int32),
+         )
+     elif text2mel_name == "Fastspeech":
+         mel_before, mel_outputs, duration_outputs = text2mel_model.inference(
+             input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
+             speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
+             speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
+         )
+     elif text2mel_name == "Fastspeech2":
+         mel_before, mel_outputs, duration_outputs, _, _ = text2mel_model.inference(
+             tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
+             speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
+             speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
+             f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
+             energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
+         )
+     else:
+         raise ValueError("Only Tacotron2, Fastspeech and Fastspeech2 are supported as text2mel models")
+
+     # Mel spectrogram -> waveform.
+     if vocoder_name == "Melgan":
+         audio = vocoder_model(mel_outputs)[0, :, 0]
+     elif vocoder_name == "MB-Melgan":
+         audio = vocoder_model(mel_outputs)[0, :, 0]
+     else:
+         raise ValueError("Only Melgan and MB-Melgan are supported as vocoders")
+
+     # LJSpeech checkpoints generate 22.05 kHz audio.
+     sf.write('./audio_after.wav', audio, 22050, "PCM_16")
+     return './audio_after.wav'
+
+ inputs = [
+     gr.inputs.Textbox(lines=5, label="Input Text"),
+     gr.inputs.Radio(label="Pick a TTS Model", choices=MODEL_NAMES),
+ ]
+
+ outputs = gr.outputs.Audio(type="file", label="Output Audio")
+
+ title = "Tensorflow TTS"
+ description = "Gradio demo for TensorFlowTTS: Real-Time State-of-the-art Speech Synthesis for Tensorflow 2. To use it, simply enter your text, or click one of the examples to load it. Read more at the links below."
+ article = "<p style='text-align: center'><a href='https://tensorspeech.github.io/TensorFlowTTS/'>TensorFlowTTS: Real-Time State-of-the-art Speech Synthesis for Tensorflow 2</a> | <a href='https://github.com/TensorSpeech/TensorFlowTTS'>Github Repo</a></p><p>An extension of akhaliq's implementation: <a href='https://huggingface.co/spaces/akhaliq/TensorFlowTTS'>akhaliq/TensorFlowTTS</a></p>"
+
+ examples = [
+     ["Once upon a time there was an old mother pig who had three little pigs and not enough food to feed them. So when they were old enough, she sent them out into the world to seek their fortunes."],
+     ["How much wood would a woodchuck chuck if a woodchuck could chuck wood?"]
+ ]
+
+ gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
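
For quick local testing outside the Gradio UI, the sketch below condenses the same pipeline as app.py (processor -> text2mel -> vocoder -> WAV file) for one combination, Fastspeech2 + MB-Melgan. It is a minimal sketch that only reuses calls already present in app.py; the example sentence comes from the demo's examples, and the output filename is an arbitrary choice.

import soundfile as sf
import tensorflow as tf
from tensorflow_tts.inference import AutoProcessor, TFAutoModel

# Same pretrained checkpoints that app.py loads from the Hugging Face Hub.
processor = AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-ljspeech-en")
fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-ljspeech-en", name="fastspeech2")
mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-ljspeech-en", name="mb_melgan")

# Text -> token IDs -> mel spectrogram -> waveform.
input_ids = processor.text_to_sequence("How much wood would a woodchuck chuck if a woodchuck could chuck wood?")
_, mel_outputs, _, _, _ = fastspeech2.inference(
    tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
    speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
    speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
)
audio = mb_melgan(mel_outputs)[0, :, 0]
sf.write("woodchuck.wav", audio.numpy(), 22050, "PCM_16")  # LJSpeech checkpoints are 22.05 kHz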
packages.txt ADDED
@@ -0,0 +1 @@
+ libsndfile1
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ git+https://github.com/stevenlimcorn/TensorFlowTTS.git
+ gradio
+ numpy
+ SoundFile
+ git+https://github.com/repodiac/german_transliterate