rushic24 committed on
Commit
bcfd9f0
1 Parent(s): 3289731

first commit

Files changed (42)
  1. .gitignore +2 -0
  2. README.md +1 -1
  3. __pycache__/synthesize.cpython-38.pyc +0 -0
  4. api_app.py +34 -0
  5. app.py +46 -0
  6. checkpoints/checkpoint_0.zip +3 -0
  7. checkpoints/checkpoint_9000.zip +3 -0
  8. flagged/log.csv +2 -0
  9. synthesis/synthesize.py +233 -0
  10. synthesis/vocoders/__init__.py +1 -0
  11. synthesis/vocoders/__pycache__/__init__.cpython-38.pyc +0 -0
  12. synthesis/vocoders/__pycache__/hifigan.cpython-38.pyc +0 -0
  13. synthesis/vocoders/__pycache__/hifigan_model.cpython-38.pyc +0 -0
  14. synthesis/vocoders/__pycache__/vocoder.cpython-38.pyc +0 -0
  15. synthesis/vocoders/hifigan.py +42 -0
  16. synthesis/vocoders/hifigan_model.py +377 -0
  17. synthesis/vocoders/vocoder.py +27 -0
  18. synthesize.py +233 -0
  19. training/__init__.py +6 -0
  20. training/__pycache__/__init__.cpython-38.pyc +0 -0
  21. training/__pycache__/clean_text.cpython-38.pyc +0 -0
  22. training/clean_text.py +113 -0
  23. training/tacotron2_model/__init__.py +4 -0
  24. training/tacotron2_model/__pycache__/__init__.cpython-38.pyc +0 -0
  25. training/tacotron2_model/__pycache__/audio_processing.cpython-38.pyc +0 -0
  26. training/tacotron2_model/__pycache__/collate.cpython-38.pyc +0 -0
  27. training/tacotron2_model/__pycache__/layers.cpython-38.pyc +0 -0
  28. training/tacotron2_model/__pycache__/loss.cpython-38.pyc +0 -0
  29. training/tacotron2_model/__pycache__/model.cpython-38.pyc +0 -0
  30. training/tacotron2_model/__pycache__/stft.cpython-38.pyc +0 -0
  31. training/tacotron2_model/__pycache__/utils.cpython-38.pyc +0 -0
  32. training/tacotron2_model/audio_processing.py +123 -0
  33. training/tacotron2_model/collate.py +78 -0
  34. training/tacotron2_model/layers.py +128 -0
  35. training/tacotron2_model/loss.py +49 -0
  36. training/tacotron2_model/model.py +609 -0
  37. training/tacotron2_model/stft.py +187 -0
  38. training/tacotron2_model/utils.py +90 -0
  39. weights/custom_pctest/config.json +37 -0
  40. weights/custom_pctest/model.pt +3 -0
  41. weights/hifiganvocoderdemo/config.json +37 -0
  42. weights/hifiganvocoderdemo/model.pt +3 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ audio.wav
+ *.png
README.md CHANGED
@@ -10,4 +10,4 @@ pinned: false
  license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
+ Text-to-Speech (TTS) model for Priyanka Chopra's voice
__pycache__/synthesize.cpython-38.pyc ADDED
Binary file (7.17 kB).
 
api_app.py ADDED
@@ -0,0 +1,34 @@
+ from flask import Flask, request
+ from synthesize import synthesize, load_model
+ from synthesis.vocoders import Hifigan
+ import sounddevice as sd
+ import soundfile as sf
+
+ model = load_model("checkpoints/checkpoint_9000.zip")
+ vocoder = Hifigan("weights/custom_pctest/model.pt", "weights/custom_pctest/config.json")
+
+ def inference(text: str):
+     synthesize(
+         model=model,
+         text=text,
+         graph_path="graph.png",
+         audio_path="audio.wav",
+         vocoder=vocoder,
+     )
+     return "audio.wav"
+
+ app = Flask(__name__)
+
+ @app.route('/process', methods=['POST'])
+ def process():
+     if request.method == 'POST':
+         text = request.json['text']
+         inference(text)
+         data, fs = sf.read("audio.wav", dtype='float32')
+         sd.play(data, fs)
+         sd.wait()  # Wait until the file has finished playing
+         return {'success': True}
+
+ if __name__ == '__main__':
+     app.run(debug=True)
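For reference, a minimal sketch of how this endpoint could be called once the Flask server is running; the localhost port (Flask's default 5000) and the payload key are assumptions taken from the handler above, not part of this commit.

# Hypothetical client for the /process endpoint in api_app.py.
# Assumes the dev server is running locally on Flask's default port 5000.
import requests

response = requests.post(
    "http://127.0.0.1:5000/process",
    json={"text": "Hello, this is a synthesized voice."},  # the handler reads request.json['text']
)
print(response.json())  # expected: {'success': True}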
app.py ADDED
@@ -0,0 +1,46 @@
+ import gradio as gr
+ from synthesize import synthesize, load_model
+ from synthesis.vocoders import Hifigan
+
+ model = load_model("checkpoints/checkpoint_9000.zip")
+ vocoder = Hifigan("weights/custom_pctest/model.pt", "weights/custom_pctest/config.json")
+
+ title = "Text-to-Speech (TTS) model for Priyanka Chopra's voice"
+ description = "Generate English speech from text using a Tacotron2 model"
+
+ article = """<p style='text-align: center'>
+ <a href='https://rushichaudhari.github.io/posts/2022-01-12-lets-clone-the-voice-of-priyanka-chopra-jonas/'
+ target='_blank'
+ class='footer'>Blog</a> |
+ <a href='https://github.com/eugenesiow/practical-ml' target='_blank'
+ class='footer'>Github Repo</a></p>"""
+ examples = ["Generate English speech from text using a Tacotron2 model.",
+             ""]
+
+ def inference(text: str):
+     synthesize(
+         model=model,
+         text=text,
+         graph_path="graph.png",
+         audio_path="audio.wav",
+         vocoder=vocoder,
+     )
+     return "audio.wav"
+
+ gr.Interface(
+     fn=inference,
+     inputs=[
+         gr.inputs.Textbox(
+             label="Input",
+             default="How are you? I am fine.",
+         ),
+     ],
+     outputs=gr.outputs.Audio(label="Output"),
+     title=title,
+     description=description,
+     article=article,
+     examples=examples,
+     enable_queue=True,
+     allow_flagging=False,
+ ).launch(debug=False)
checkpoints/checkpoint_0.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18607124e1a417d9cb60f8d52c360cfda530ead09c1bb5940c2e8b3c9fcd10d1
+ size 338411959
checkpoints/checkpoint_9000.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ab42a5e478da1131a4eeed118addb9dbe945d747068ea3cb1cb1ef4584a468b
+ size 338412023
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
+ 'name','output','flag','username','timestamp'
+ '','','','','2022-05-17 13:28:37.822200'
synthesis/synthesize.py ADDED
@@ -0,0 +1,233 @@
1
+ import argparse
2
+ import os
3
+ import matplotlib.pyplot as plt
4
+ import torch
5
+ import numpy as np
6
+ import matplotlib
7
+ from scipy.io.wavfile import write
8
+ from os.path import dirname, abspath
9
+ import sys
10
+
11
+ import nltk
12
+
13
+ nltk.download("punkt")
14
+
15
+ sys.path.append(dirname(dirname(abspath(__file__))))
16
+ matplotlib.use("Agg")
17
+
18
+ from training.tacotron2_model import Tacotron2
19
+ from training.clean_text import clean_text
20
+ from training import DEFAULT_ALPHABET
21
+ from synthesis.vocoders import Hifigan
22
+
23
+
24
+ def load_model(model_path):
25
+ """
26
+ Loads the Tacotron2 model.
27
+ Uses GPU if available, otherwise uses CPU.
28
+
29
+ Parameters
30
+ ----------
31
+ model_path : str
32
+ Path to tacotron2 model
33
+
34
+ Returns
35
+ -------
36
+ Tacotron2
37
+ Loaded tacotron2 model
38
+ """
39
+ if torch.cuda.is_available():
40
+ model = Tacotron2().cuda()
41
+ model.load_state_dict(torch.load(model_path)["state_dict"])
42
+ _ = model.cuda().eval().half()
43
+ else:
44
+ model = Tacotron2()
45
+ model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu"))["state_dict"])
46
+ return model
47
+
48
+
49
+ def generate_graph(alignments, filepath, heading=""):
50
+ """
51
+ Generates synthesis alignment graph image.
52
+
53
+ Parameters
54
+ ----------
55
+ alignments : list
56
+ Numpy alignment data
57
+ filepath : str
58
+ Path to save image to
59
+ heading : str (optional)
60
+ Graph heading
61
+ """
62
+ data = alignments.float().data.cpu().numpy()[0].T
63
+ plt.imshow(data, aspect="auto", origin="lower", interpolation="none")
64
+ if heading:
65
+ plt.title(heading)
66
+ plt.savefig(filepath)
67
+
68
+
69
+ def text_to_sequence(text, symbols):
70
+ """
71
+ Generates text sequence for audio file
72
+
73
+ Parameters
74
+ ----------
75
+ text : str
76
+ Text to synthesize
77
+ symbols : list
78
+ List of valid symbols
79
+ """
80
+ symbol_to_id = {s: i for i, s in enumerate(symbols)}
81
+ sequence = np.array([[symbol_to_id[s] for s in text if s in symbol_to_id]])
82
+ if torch.cuda.is_available():
83
+ return torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
84
+ else:
85
+ return torch.autograd.Variable(torch.from_numpy(sequence)).cpu().long()
86
+
87
+
88
+ def join_alignment_graphs(alignments):
89
+ """
90
+ Joins multiple alignment graphs.
91
+
92
+ Parameters
93
+ ----------
94
+ alignments : list
95
+ List of alignment Tensors
96
+
97
+ Returns
98
+ -------
99
+ Tensor
100
+ Combined alignment tensor
101
+ """
102
+ alignment_sizes = [a.size() for a in alignments]
103
+ joined = torch.zeros((1, sum([a[1] for a in alignment_sizes]), sum([a[2] for a in alignment_sizes])))
104
+ current_x = 0
105
+ current_y = 0
106
+ for alignment in alignments:
107
+ joined[:, current_x : current_x + alignment.size()[1], current_y : current_y + alignment.size()[2]] = alignment
108
+ current_x += alignment.size()[1]
109
+ current_y += alignment.size()[2]
110
+ return joined
111
+
112
+
113
+ def synthesize(
114
+ model,
115
+ text,
116
+ symbols=DEFAULT_ALPHABET,
117
+ graph_path=None,
118
+ audio_path=None,
119
+ vocoder=None,
120
+ silence_padding=0.15,
121
+ sample_rate=22050,
122
+ max_decoder_steps=1000,
123
+ split_text=False,
124
+ ):
125
+ """
126
+ Synthesise text for a given model.
127
+ Produces graph and/or audio file when given.
128
+ Supports multi-line synthesis (separated by \n).
129
+
130
+ Parameters
131
+ ----------
132
+ model : Tacotron2
133
+ Tacotron2 model
134
+ text : str/list
135
+ Text to synthesize (or list of lines to synthesize)
136
+ symbols : list
137
+ List of symbols (default is English)
138
+ graph_path : str (optional)
139
+ Path to save alignment graph to
140
+ audio_path : str (optional)
141
+ Path to save audio file to
142
+ vocoder : Object (optional)
143
+ Vocoder model (required if generating audio)
144
+ silence_padding : float (optional)
145
+ Seconds of silence to separate each clip by with multi-line synthesis (default is 0.15)
146
+ sample_rate : int (optional)
147
+ Audio sample rate (default is 22050)
148
+ max_decoder_steps : int (optional)
149
+ Max decoder steps controls sequence length and memory usage during inference.
150
+ Increasing this will use more memory but may allow for longer sentences. (default is 1000)
151
+ split_text : bool (optional)
152
+ Whether to use the split text tool to convert a block of text into multiple shorter sentences
153
+ to synthesize (default is False)
154
+
155
+ Raises
156
+ -------
157
+ AssertionError
158
+ If audio_path is given without a vocoder
159
+ """
160
+ if audio_path:
161
+ assert vocoder, "Missing vocoder"
162
+
163
+ if not isinstance(text, list) and split_text:
164
+ # Split text into multiple lines
165
+ text = nltk.tokenize.sent_tokenize(text)
166
+
167
+ if isinstance(text, list):
168
+ # Multi-lines given
169
+ text = [line.strip() for line in text if line.strip()]
170
+ mels = []
171
+ alignments = []
172
+ for line in text:
173
+ text = clean_text(line, symbols)
174
+ sequence = text_to_sequence(text, symbols)
175
+ _, mel_outputs_postnet, _, alignment = model.inference(sequence, max_decoder_steps)
176
+ mels.append(mel_outputs_postnet)
177
+ alignments.append(alignment)
178
+
179
+ if graph_path:
180
+ generate_graph(join_alignment_graphs(alignments), graph_path)
181
+
182
+ if audio_path:
183
+ silence = np.zeros(int(silence_padding * sample_rate)).astype("int16")
184
+ audio_segments = []
185
+ for i in range(len(mels)):
186
+ audio_segments.append(vocoder.generate_audio(mels[i]))
187
+ if i != len(mels) - 1:
188
+ audio_segments.append(silence)
189
+
190
+ audio = np.concatenate(audio_segments)
191
+ write(audio_path, sample_rate, audio)
192
+ else:
193
+ # Single sentence
194
+ text = clean_text(text.strip(), symbols)
195
+ sequence = text_to_sequence(text, symbols)
196
+ _, mel_outputs_postnet, _, alignment = model.inference(sequence, max_decoder_steps)
197
+
198
+ if graph_path:
199
+ generate_graph(alignment, graph_path)
200
+
201
+ if audio_path:
202
+ audio = vocoder.generate_audio(mel_outputs_postnet)
203
+ write(audio_path, sample_rate, audio)
204
+
205
+
206
+ if __name__ == "__main__":
207
+ """Synthesize audio using model and vocoder"""
208
+ parser = argparse.ArgumentParser(description="Synthesize audio using model and vocoder")
209
+ parser.add_argument("-m", "--model_path", type=str, help="tacotron2 model path", required=True)
210
+ parser.add_argument("-vm", "--vocoder_model_path", type=str, help="vocoder model path", required=True)
211
+ parser.add_argument("-hc", "--hifigan_config_path", type=str, help="hifigan_config path", required=True)
212
+ parser.add_argument("-t", "--text", type=str, help="text to synthesize", required=True)
213
+ parser.add_argument("-g", "--graph_output_path", type=str, help="path to save alignment graph to", required=False)
214
+ parser.add_argument("-a", "--audio_output_path", type=str, help="path to save output audio to", required=False)
215
+ parser.add_argument("--silence_padding", type=float, help="Padding between sentences in seconds", default=0.15)
216
+ parser.add_argument("--sample_rate", type=int, help="Audio sample rate", default=22050)
217
+ args = parser.parse_args()
218
+
219
+ assert os.path.isfile(args.model_path), "Model not found"
220
+ assert os.path.isfile(args.vocoder_model_path), "vocoder model not found"
221
+
222
+ model = load_model(args.model_path)
223
+ vocoder = Hifigan(args.vocoder_model_path, args.hifigan_config_path)
224
+
225
+ synthesize(
226
+ model=model,
227
+ text=args.text,
228
+ graph_path=args.graph_output_path,
229
+ audio_path=args.audio_output_path,
230
+ vocoder=vocoder,
231
+ silence_padding=args.silence_padding,
232
+ sample_rate=args.sample_rate,
233
+ )
synthesis/vocoders/__init__.py ADDED
@@ -0,0 +1 @@
+ from synthesis.vocoders.hifigan import Hifigan  # noqa
synthesis/vocoders/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (209 Bytes).
 
synthesis/vocoders/__pycache__/hifigan.cpython-38.pyc ADDED
Binary file (1.75 kB).
 
synthesis/vocoders/__pycache__/hifigan_model.cpython-38.pyc ADDED
Binary file (9.11 kB).
 
synthesis/vocoders/__pycache__/vocoder.cpython-38.pyc ADDED
Binary file (879 Bytes).
 
synthesis/vocoders/hifigan.py ADDED
@@ -0,0 +1,42 @@
+ import json
+ import torch
+
+ from synthesis.vocoders.hifigan_model import Generator
+ from synthesis.vocoders.vocoder import Vocoder, MAX_WAV_VALUE
+
+
+ class AttrDict(dict):
+     """
+     Credit: https://github.com/jik876/hifi-gan
+     """
+
+     def __init__(self, *args, **kwargs):
+         super(AttrDict, self).__init__(*args, **kwargs)
+         self.__dict__ = self
+
+
+ class Hifigan(Vocoder):
+     def __init__(self, model_path, config_path):
+         with open(config_path) as f:
+             data = f.read()
+
+         # Use GPU if available
+         device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+         h = AttrDict(json.loads(data))
+         self.model = Generator(h).to(device)
+
+         checkpoint_dict = torch.load(model_path, map_location=device)
+         self.model.load_state_dict(checkpoint_dict["generator"])
+         self.model.eval()
+         self.model.remove_weight_norm()
+
+     def generate_audio(self, mel_output):
+         with torch.no_grad():
+             if torch.cuda.is_available():
+                 mel_output = mel_output.type(torch.cuda.FloatTensor)
+
+             y_g_hat = self.model(mel_output)
+             audio = y_g_hat.squeeze()
+             audio = audio * MAX_WAV_VALUE
+             audio = audio.cpu().numpy().astype("int16")
+             return audio
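As a quick shape check, the wrapper above can be exercised with a placeholder mel spectrogram; a real mel comes from Tacotron2.inference(). This is only a sketch, using the hifiganvocoderdemo weights committed below and a random tensor instead of real speech.

# Sketch: run the Hifigan wrapper on a placeholder mel to verify shapes and dtypes.
import torch
from scipy.io.wavfile import write
from synthesis.vocoders import Hifigan

vocoder = Hifigan("weights/hifiganvocoderdemo/model.pt", "weights/hifiganvocoderdemo/config.json")
dummy_mel = torch.randn(1, 80, 200)        # (batch, n_mel_channels, frames); placeholder, not real speech
audio = vocoder.generate_audio(dummy_mel)  # int16 numpy array
write("check.wav", 22050, audio)           # 22050 Hz matches the sample rate used elsewhere in this commit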
synthesis/vocoders/hifigan_model.py ADDED
@@ -0,0 +1,377 @@
1
+ # Credit: https://github.com/jik876/hifi-gan
2
+ #
3
+ # MIT License
4
+ #
5
+ # Copyright (c) 2020 Jungil Kong
6
+ #
7
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ # of this software and associated documentation files (the "Software"), to deal
9
+ # in the Software without restriction, including without limitation the rights
10
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ # copies of the Software, and to permit persons to whom the Software is
12
+ # furnished to do so, subject to the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be included in all
15
+ # copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+ # SOFTWARE.
24
+
25
+
26
+ import torch
27
+ import torch.nn.functional as F
28
+ import torch.nn as nn
29
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
30
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
31
+
32
+ LRELU_SLOPE = 0.1
33
+
34
+
35
+ def init_weights(m, mean=0.0, std=0.01):
36
+ classname = m.__class__.__name__
37
+ if classname.find("Conv") != -1:
38
+ m.weight.data.normal_(mean, std)
39
+
40
+
41
+ def get_padding(kernel_size, dilation=1):
42
+ return int((kernel_size * dilation - dilation) / 2)
43
+
44
+
45
+ class ResBlock1(torch.nn.Module):
46
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
47
+ super(ResBlock1, self).__init__()
48
+ self.h = h
49
+ self.convs1 = nn.ModuleList(
50
+ [
51
+ weight_norm(
52
+ Conv1d(
53
+ channels,
54
+ channels,
55
+ kernel_size,
56
+ 1,
57
+ dilation=dilation[0],
58
+ padding=get_padding(kernel_size, dilation[0]),
59
+ )
60
+ ),
61
+ weight_norm(
62
+ Conv1d(
63
+ channels,
64
+ channels,
65
+ kernel_size,
66
+ 1,
67
+ dilation=dilation[1],
68
+ padding=get_padding(kernel_size, dilation[1]),
69
+ )
70
+ ),
71
+ weight_norm(
72
+ Conv1d(
73
+ channels,
74
+ channels,
75
+ kernel_size,
76
+ 1,
77
+ dilation=dilation[2],
78
+ padding=get_padding(kernel_size, dilation[2]),
79
+ )
80
+ ),
81
+ ]
82
+ )
83
+ self.convs1.apply(init_weights)
84
+
85
+ self.convs2 = nn.ModuleList(
86
+ [
87
+ weight_norm(
88
+ Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))
89
+ ),
90
+ weight_norm(
91
+ Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))
92
+ ),
93
+ weight_norm(
94
+ Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))
95
+ ),
96
+ ]
97
+ )
98
+ self.convs2.apply(init_weights)
99
+
100
+ def forward(self, x):
101
+ for c1, c2 in zip(self.convs1, self.convs2):
102
+ xt = F.leaky_relu(x, LRELU_SLOPE)
103
+ xt = c1(xt)
104
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
105
+ xt = c2(xt)
106
+ x = xt + x
107
+ return x
108
+
109
+ def remove_weight_norm(self):
110
+ for l in self.convs1:
111
+ remove_weight_norm(l)
112
+ for l in self.convs2:
113
+ remove_weight_norm(l)
114
+
115
+
116
+ class ResBlock2(torch.nn.Module):
117
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
118
+ super(ResBlock2, self).__init__()
119
+ self.h = h
120
+ self.convs = nn.ModuleList(
121
+ [
122
+ weight_norm(
123
+ Conv1d(
124
+ channels,
125
+ channels,
126
+ kernel_size,
127
+ 1,
128
+ dilation=dilation[0],
129
+ padding=get_padding(kernel_size, dilation[0]),
130
+ )
131
+ ),
132
+ weight_norm(
133
+ Conv1d(
134
+ channels,
135
+ channels,
136
+ kernel_size,
137
+ 1,
138
+ dilation=dilation[1],
139
+ padding=get_padding(kernel_size, dilation[1]),
140
+ )
141
+ ),
142
+ ]
143
+ )
144
+ self.convs.apply(init_weights)
145
+
146
+ def forward(self, x):
147
+ for c in self.convs:
148
+ xt = F.leaky_relu(x, LRELU_SLOPE)
149
+ xt = c(xt)
150
+ x = xt + x
151
+ return x
152
+
153
+ def remove_weight_norm(self):
154
+ for l in self.convs:
155
+ remove_weight_norm(l)
156
+
157
+
158
+ class Generator(torch.nn.Module):
159
+ def __init__(self, h):
160
+ super(Generator, self).__init__()
161
+ self.h = h
162
+ self.num_kernels = len(h.resblock_kernel_sizes)
163
+ self.num_upsamples = len(h.upsample_rates)
164
+ self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
165
+ resblock = ResBlock1 if h.resblock == "1" else ResBlock2
166
+
167
+ self.ups = nn.ModuleList()
168
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
169
+ self.ups.append(
170
+ weight_norm(
171
+ ConvTranspose1d(
172
+ h.upsample_initial_channel // (2**i),
173
+ h.upsample_initial_channel // (2 ** (i + 1)),
174
+ k,
175
+ u,
176
+ padding=(k - u) // 2,
177
+ )
178
+ )
179
+ )
180
+
181
+ self.resblocks = nn.ModuleList()
182
+ for i in range(len(self.ups)):
183
+ ch = h.upsample_initial_channel // (2 ** (i + 1))
184
+ for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
185
+ self.resblocks.append(resblock(h, ch, k, d))
186
+
187
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
188
+ self.ups.apply(init_weights)
189
+ self.conv_post.apply(init_weights)
190
+
191
+ def forward(self, x):
192
+ x = self.conv_pre(x)
193
+ for i in range(self.num_upsamples):
194
+ x = F.leaky_relu(x, LRELU_SLOPE)
195
+ x = self.ups[i](x)
196
+ xs = None
197
+ for j in range(self.num_kernels):
198
+ if xs is None:
199
+ xs = self.resblocks[i * self.num_kernels + j](x)
200
+ else:
201
+ xs += self.resblocks[i * self.num_kernels + j](x)
202
+ x = xs / self.num_kernels
203
+ x = F.leaky_relu(x)
204
+ x = self.conv_post(x)
205
+ x = torch.tanh(x)
206
+
207
+ return x
208
+
209
+ def remove_weight_norm(self):
210
+ for l in self.ups:
211
+ remove_weight_norm(l)
212
+ for l in self.resblocks:
213
+ l.remove_weight_norm()
214
+ remove_weight_norm(self.conv_pre)
215
+ remove_weight_norm(self.conv_post)
216
+
217
+
218
+ class DiscriminatorP(torch.nn.Module):
219
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
220
+ super(DiscriminatorP, self).__init__()
221
+ self.period = period
222
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
223
+ self.convs = nn.ModuleList(
224
+ [
225
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
226
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
227
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
228
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
229
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
230
+ ]
231
+ )
232
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
233
+
234
+ def forward(self, x):
235
+ fmap = []
236
+
237
+ # 1d to 2d
238
+ b, c, t = x.shape
239
+ if t % self.period != 0: # pad first
240
+ n_pad = self.period - (t % self.period)
241
+ x = F.pad(x, (0, n_pad), "reflect")
242
+ t = t + n_pad
243
+ x = x.view(b, c, t // self.period, self.period)
244
+
245
+ for l in self.convs:
246
+ x = l(x)
247
+ x = F.leaky_relu(x, LRELU_SLOPE)
248
+ fmap.append(x)
249
+ x = self.conv_post(x)
250
+ fmap.append(x)
251
+ x = torch.flatten(x, 1, -1)
252
+
253
+ return x, fmap
254
+
255
+
256
+ class MultiPeriodDiscriminator(torch.nn.Module):
257
+ def __init__(self):
258
+ super(MultiPeriodDiscriminator, self).__init__()
259
+ self.discriminators = nn.ModuleList(
260
+ [
261
+ DiscriminatorP(2),
262
+ DiscriminatorP(3),
263
+ DiscriminatorP(5),
264
+ DiscriminatorP(7),
265
+ DiscriminatorP(11),
266
+ ]
267
+ )
268
+
269
+ def forward(self, y, y_hat):
270
+ y_d_rs = []
271
+ y_d_gs = []
272
+ fmap_rs = []
273
+ fmap_gs = []
274
+ for i, d in enumerate(self.discriminators):
275
+ y_d_r, fmap_r = d(y)
276
+ y_d_g, fmap_g = d(y_hat)
277
+ y_d_rs.append(y_d_r)
278
+ fmap_rs.append(fmap_r)
279
+ y_d_gs.append(y_d_g)
280
+ fmap_gs.append(fmap_g)
281
+
282
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
283
+
284
+
285
+ class DiscriminatorS(torch.nn.Module):
286
+ def __init__(self, use_spectral_norm=False):
287
+ super(DiscriminatorS, self).__init__()
288
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
289
+ self.convs = nn.ModuleList(
290
+ [
291
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
292
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
293
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
294
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
295
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
296
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
297
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
298
+ ]
299
+ )
300
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
301
+
302
+ def forward(self, x):
303
+ fmap = []
304
+ for l in self.convs:
305
+ x = l(x)
306
+ x = F.leaky_relu(x, LRELU_SLOPE)
307
+ fmap.append(x)
308
+ x = self.conv_post(x)
309
+ fmap.append(x)
310
+ x = torch.flatten(x, 1, -1)
311
+
312
+ return x, fmap
313
+
314
+
315
+ class MultiScaleDiscriminator(torch.nn.Module):
316
+ def __init__(self):
317
+ super(MultiScaleDiscriminator, self).__init__()
318
+ self.discriminators = nn.ModuleList(
319
+ [
320
+ DiscriminatorS(use_spectral_norm=True),
321
+ DiscriminatorS(),
322
+ DiscriminatorS(),
323
+ ]
324
+ )
325
+ self.meanpools = nn.ModuleList([AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)])
326
+
327
+ def forward(self, y, y_hat):
328
+ y_d_rs = []
329
+ y_d_gs = []
330
+ fmap_rs = []
331
+ fmap_gs = []
332
+ for i, d in enumerate(self.discriminators):
333
+ if i != 0:
334
+ y = self.meanpools[i - 1](y)
335
+ y_hat = self.meanpools[i - 1](y_hat)
336
+ y_d_r, fmap_r = d(y)
337
+ y_d_g, fmap_g = d(y_hat)
338
+ y_d_rs.append(y_d_r)
339
+ fmap_rs.append(fmap_r)
340
+ y_d_gs.append(y_d_g)
341
+ fmap_gs.append(fmap_g)
342
+
343
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
344
+
345
+
346
+ def feature_loss(fmap_r, fmap_g):
347
+ loss = 0
348
+ for dr, dg in zip(fmap_r, fmap_g):
349
+ for rl, gl in zip(dr, dg):
350
+ loss += torch.mean(torch.abs(rl - gl))
351
+
352
+ return loss * 2
353
+
354
+
355
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
356
+ loss = 0
357
+ r_losses = []
358
+ g_losses = []
359
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
360
+ r_loss = torch.mean((1 - dr) ** 2)
361
+ g_loss = torch.mean(dg**2)
362
+ loss += r_loss + g_loss
363
+ r_losses.append(r_loss.item())
364
+ g_losses.append(g_loss.item())
365
+
366
+ return loss, r_losses, g_losses
367
+
368
+
369
+ def generator_loss(disc_outputs):
370
+ loss = 0
371
+ gen_losses = []
372
+ for dg in disc_outputs:
373
+ l = torch.mean((1 - dg) ** 2)
374
+ gen_losses.append(l)
375
+ loss += l
376
+
377
+ return loss, gen_losses
synthesis/vocoders/vocoder.py ADDED
@@ -0,0 +1,27 @@
+ from abc import ABC, abstractmethod
+
+
+ MAX_WAV_VALUE = 32768.0
+
+
+ class Vocoder(ABC):
+     """
+     Produces audio data for tacotron2 mel spectrogram output
+     """
+
+     @abstractmethod
+     def generate_audio(self, mel_output):
+         """
+         Produces wav audio data for a given mel output.
+
+         Parameters
+         ----------
+         mel_output : Tensor
+             Mel spectrogram output
+
+         Returns
+         -------
+         np.array
+             Generated audio data
+         """
+         pass
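The abstract class defines the entire vocoder contract: implement generate_audio(mel) and return an int16 numpy array. A hypothetical stub implementation (not part of this commit) would look like this:

# Hypothetical stub showing the Vocoder contract; useful for tests, not shipped in this commit.
import numpy as np
from synthesis.vocoders.vocoder import Vocoder

class SilentVocoder(Vocoder):
    def generate_audio(self, mel_output):
        # Return half a second of silence at 22050 Hz regardless of the input mel.
        return np.zeros(11025, dtype=np.int16)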
synthesize.py ADDED
@@ -0,0 +1,233 @@
1
+ import argparse
2
+ import os
3
+ import matplotlib.pyplot as plt
4
+ import torch
5
+ import numpy as np
6
+ import matplotlib
7
+ from scipy.io.wavfile import write
8
+ from os.path import dirname, abspath
9
+ import sys
10
+
11
+ import nltk
12
+
13
+ nltk.download("punkt")
14
+
15
+ sys.path.append(dirname(dirname(abspath(__file__))))
16
+ matplotlib.use("Agg")
17
+
18
+ from training.tacotron2_model import Tacotron2
19
+ from training.clean_text import clean_text
20
+ from training import DEFAULT_ALPHABET
21
+ from synthesis.vocoders import Hifigan
22
+
23
+
24
+ def load_model(model_path):
25
+ """
26
+ Loads the Tacotron2 model.
27
+ Uses GPU if available, otherwise uses CPU.
28
+
29
+ Parameters
30
+ ----------
31
+ model_path : str
32
+ Path to tacotron2 model
33
+
34
+ Returns
35
+ -------
36
+ Tacotron2
37
+ Loaded tacotron2 model
38
+ """
39
+ if torch.cuda.is_available():
40
+ model = Tacotron2().cuda()
41
+ model.load_state_dict(torch.load(model_path)["state_dict"])
42
+ _ = model.cuda().eval().half()
43
+ else:
44
+ model = Tacotron2()
45
+ model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu"))["state_dict"])
46
+ return model
47
+
48
+
49
+ def generate_graph(alignments, filepath, heading=""):
50
+ """
51
+ Generates synthesis alignment graph image.
52
+
53
+ Parameters
54
+ ----------
55
+ alignments : list
56
+ Numpy alignment data
57
+ filepath : str
58
+ Path to save image to
59
+ heading : str (optional)
60
+ Graph heading
61
+ """
62
+ data = alignments.float().data.cpu().numpy()[0].T
63
+ plt.imshow(data, aspect="auto", origin="lower", interpolation="none")
64
+ if heading:
65
+ plt.title(heading)
66
+ plt.savefig(filepath)
67
+
68
+
69
+ def text_to_sequence(text, symbols):
70
+ """
71
+ Generates text sequence for audio file
72
+
73
+ Parameters
74
+ ----------
75
+ text : str
76
+ Text to synthesize
77
+ symbols : list
78
+ List of valid symbols
79
+ """
80
+ symbol_to_id = {s: i for i, s in enumerate(symbols)}
81
+ sequence = np.array([[symbol_to_id[s] for s in text if s in symbol_to_id]])
82
+ if torch.cuda.is_available():
83
+ return torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
84
+ else:
85
+ return torch.autograd.Variable(torch.from_numpy(sequence)).cpu().long()
86
+
87
+
88
+ def join_alignment_graphs(alignments):
89
+ """
90
+ Joins multiple alignment graphs.
91
+
92
+ Parameters
93
+ ----------
94
+ alignments : list
95
+ List of alignment Tensors
96
+
97
+ Returns
98
+ -------
99
+ Tensor
100
+ Combined alignment tensor
101
+ """
102
+ alignment_sizes = [a.size() for a in alignments]
103
+ joined = torch.zeros((1, sum([a[1] for a in alignment_sizes]), sum([a[2] for a in alignment_sizes])))
104
+ current_x = 0
105
+ current_y = 0
106
+ for alignment in alignments:
107
+ joined[:, current_x : current_x + alignment.size()[1], current_y : current_y + alignment.size()[2]] = alignment
108
+ current_x += alignment.size()[1]
109
+ current_y += alignment.size()[2]
110
+ return joined
111
+
112
+
113
+ def synthesize(
114
+ model,
115
+ text,
116
+ symbols=DEFAULT_ALPHABET,
117
+ graph_path=None,
118
+ audio_path=None,
119
+ vocoder=None,
120
+ silence_padding=0.15,
121
+ sample_rate=22050,
122
+ max_decoder_steps=1000,
123
+ split_text=False,
124
+ ):
125
+ """
126
+ Synthesise text for a given model.
127
+ Produces graph and/or audio file when given.
128
+ Supports multi-line synthesis (separated by \n).
129
+
130
+ Parameters
131
+ ----------
132
+ model : Tacotron2
133
+ Tacotron2 model
134
+ text : str/list
135
+ Text to synthesize (or list of lines to synthesize)
136
+ symbols : list
137
+ List of symbols (default is English)
138
+ graph_path : str (optional)
139
+ Path to save alignment graph to
140
+ audio_path : str (optional)
141
+ Path to save audio file to
142
+ vocoder : Object (optional)
143
+ Vocoder model (required if generating audio)
144
+ silence_padding : float (optional)
145
+ Seconds of silence to separate each clip by with multi-line synthesis (default is 0.15)
146
+ sample_rate : int (optional)
147
+ Audio sample rate (default is 22050)
148
+ max_decoder_steps : int (optional)
149
+ Max decoder steps controls sequence length and memory usage during inference.
150
+ Increasing this will use more memory but may allow for longer sentences. (default is 1000)
151
+ split_text : bool (optional)
152
+ Whether to use the split text tool to convert a block of text into multiple shorter sentences
153
+ to synthesize (default is False)
154
+
155
+ Raises
156
+ -------
157
+ AssertionError
158
+ If audio_path is given without a vocoder
159
+ """
160
+ if audio_path:
161
+ assert vocoder, "Missing vocoder"
162
+
163
+ if not isinstance(text, list) and split_text:
164
+ # Split text into multiple lines
165
+ text = nltk.tokenize.sent_tokenize(text)
166
+
167
+ if isinstance(text, list):
168
+ # Multi-lines given
169
+ text = [line.strip() for line in text if line.strip()]
170
+ mels = []
171
+ alignments = []
172
+ for line in text:
173
+ text = clean_text(line, symbols)
174
+ sequence = text_to_sequence(text, symbols)
175
+ _, mel_outputs_postnet, _, alignment = model.inference(sequence, max_decoder_steps)
176
+ mels.append(mel_outputs_postnet)
177
+ alignments.append(alignment)
178
+
179
+ if graph_path:
180
+ generate_graph(join_alignment_graphs(alignments), graph_path)
181
+
182
+ if audio_path:
183
+ silence = np.zeros(int(silence_padding * sample_rate)).astype("int16")
184
+ audio_segments = []
185
+ for i in range(len(mels)):
186
+ audio_segments.append(vocoder.generate_audio(mels[i]))
187
+ if i != len(mels) - 1:
188
+ audio_segments.append(silence)
189
+
190
+ audio = np.concatenate(audio_segments)
191
+ write(audio_path, sample_rate, audio)
192
+ else:
193
+ # Single sentence
194
+ text = clean_text(text.strip(), symbols)
195
+ sequence = text_to_sequence(text, symbols)
196
+ _, mel_outputs_postnet, _, alignment = model.inference(sequence, max_decoder_steps)
197
+
198
+ if graph_path:
199
+ generate_graph(alignment, graph_path)
200
+
201
+ if audio_path:
202
+ audio = vocoder.generate_audio(mel_outputs_postnet)
203
+ write(audio_path, sample_rate, audio)
204
+
205
+
206
+ if __name__ == "__main__":
207
+ """Synthesize audio using model and vocoder"""
208
+ parser = argparse.ArgumentParser(description="Synthesize audio using model and vocoder")
209
+ parser.add_argument("-m", "--model_path", type=str, help="tacotron2 model path", required=True)
210
+ parser.add_argument("-vm", "--vocoder_model_path", type=str, help="vocoder model path", required=True)
211
+ parser.add_argument("-hc", "--hifigan_config_path", type=str, help="hifigan_config path", required=True)
212
+ parser.add_argument("-t", "--text", type=str, help="text to synthesize", required=True)
213
+ parser.add_argument("-g", "--graph_output_path", type=str, help="path to save alignment graph to", required=False)
214
+ parser.add_argument("-a", "--audio_output_path", type=str, help="path to save output audio to", required=False)
215
+ parser.add_argument("--silence_padding", type=float, help="Padding between sentences in seconds", default=0.15)
216
+ parser.add_argument("--sample_rate", type=int, help="Audio sample rate", default=22050)
217
+ args = parser.parse_args()
218
+
219
+ assert os.path.isfile(args.model_path), "Model not found"
220
+ assert os.path.isfile(args.vocoder_model_path), "vocoder model not found"
221
+
222
+ model = load_model(args.model_path)
223
+ vocoder = Hifigan(args.vocoder_model_path, args.hifigan_config_path)
224
+
225
+ synthesize(
226
+ model=model,
227
+ text=args.text,
228
+ graph_path=args.graph_output_path,
229
+ audio_path=args.audio_output_path,
230
+ vocoder=vocoder,
231
+ silence_padding=args.silence_padding,
232
+ sample_rate=args.sample_rate,
233
+ )
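Besides the CLI entry point, the module can be used programmatically; a minimal sketch follows (the paths mirror those used in app.py and are assumptions, not requirements).

# Sketch of multi-sentence synthesis through the module API.
from synthesize import load_model, synthesize
from synthesis.vocoders import Hifigan

model = load_model("checkpoints/checkpoint_9000.zip")
vocoder = Hifigan("weights/custom_pctest/model.pt", "weights/custom_pctest/config.json")

synthesize(
    model=model,
    text="This is the first sentence. This is the second one.",
    graph_path="alignment.png",  # optional alignment plot
    audio_path="speech.wav",     # audio output requires a vocoder
    vocoder=vocoder,
    split_text=True,             # sentence-tokenize and join clips with silence_padding seconds of silence
)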
training/__init__.py ADDED
@@ -0,0 +1,6 @@
+ SEED = 1234
+ PUNCTUATION = list("_-!'(),.:;?")
+ BASE_SYMBOLS = PUNCTUATION + [" "]
+ DEFAULT_ALPHABET = list("_-!'(),.:;? ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
+ TRAIN_FILE = "trainlist.txt"
+ VALIDATION_FILE = "vallist.txt"
training/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (387 Bytes).
 
training/__pycache__/clean_text.cpython-38.pyc ADDED
Binary file (3.18 kB).
 
training/clean_text.py ADDED
@@ -0,0 +1,113 @@
1
+ import argparse
2
+ import re
3
+
4
+ import inflect
5
+ from training import DEFAULT_ALPHABET
6
+
7
+ INFLECT_ENGINE = inflect.engine()
8
+ COMMA_NUMBER_RE = re.compile(r"([0-9][0-9\,]+[0-9])")
9
+ DECIMAL_NUMBER_RE = re.compile(r"([0-9]+\.[0-9]+)")
10
+ NUMBER_RE = re.compile(r"[0-9]+")
11
+ ORDINALS = re.compile(r"([0-9]+[st|nd|rd|th]+)")
12
+ CURRENCY = re.compile(r"([£|$|€]+[0-9]+)")
13
+ WHITESPACE_RE = re.compile(r"\s+")
14
+ ALLOWED_CHARACTERS_RE = re.compile("[^a-z ,.!?'-]+")
15
+ MONETARY_REPLACEMENT = {"$": " dollars", "£": " pounds", "€": " euros"}
16
+ ABBREVIATION_REPLACEMENT = {
17
+ "mr.": "mister",
18
+ "mrs.": "misess",
19
+ "dr.": "doctor",
20
+ "no.": "number",
21
+ "st.": "saint",
22
+ "co.": "company",
23
+ "jr.": "junior",
24
+ "maj.": "major",
25
+ "gen.": "general",
26
+ "drs.": "doctors",
27
+ "rev.": "reverend",
28
+ "lt.": "lieutenant",
29
+ "hon.": "honorable",
30
+ "sgt.": "sergeant",
31
+ "capt.": "captain",
32
+ "esq.": "esquire",
33
+ "ltd.": "limited",
34
+ "col.": "colonel",
35
+ "ft.": "fort",
36
+ }
37
+
38
+
39
+ def clean_text(text, symbols=DEFAULT_ALPHABET, remove_invalid_characters=True):
40
+ """
41
+ Cleans text. This includes:
42
+ - Replacing monetary terms (i.e. $ -> dollars)
43
+ - Converting ordinals to full words (i.e. 1st -> first)
44
+ - Converting numbers to their full word format (i.e. 100 -> one hundred)
45
+ - Replacing abbreviations (i.e. dr. -> doctor)
46
+ - Removing invalid characters (non utf-8 or invalid punctuation)
47
+
48
+ Parameters
49
+ ----------
50
+ text : str
51
+ Text to clean
52
+ symbols : list (optional)
53
+ List of valid symbols in text (default is English alphabet & punctuation)
54
+ remove_invalid_characters : bool (optional)
55
+ Whether to remove characters not in symbols list (default is True)
56
+
57
+ Returns
58
+ -------
59
+ str
60
+ Cleaned text
61
+ """
62
+ text = text.strip()
63
+ text = text.lower()
64
+ # Convert currency to words
65
+ money = re.findall(CURRENCY, text)
66
+ for amount in money:
67
+ for key, value in MONETARY_REPLACEMENT.items():
68
+ if key in amount:
69
+ text = text.replace(amount, amount[1:] + value)
70
+ # Convert ordinals to words
71
+ ordinals = re.findall(ORDINALS, text)
72
+ for ordinal in ordinals:
73
+ text = text.replace(ordinal, INFLECT_ENGINE.number_to_words(ordinal))
74
+ # Convert comma & decimal numbers to words
75
+ numbers = re.findall(COMMA_NUMBER_RE, text) + re.findall(DECIMAL_NUMBER_RE, text)
76
+ for number in numbers:
77
+ text = text.replace(number, INFLECT_ENGINE.number_to_words(number))
78
+ # Convert standard numbers to words
79
+ numbers = re.findall(NUMBER_RE, text)
80
+ for number in numbers:
81
+ text = text.replace(number, INFLECT_ENGINE.number_to_words(number))
82
+ # Replace abbreviations
83
+ for key, value in ABBREVIATION_REPLACEMENT.items():
84
+ text = text.replace(" " + key + " ", " " + value + " ")
85
+ # Collapse whitespace
86
+ text = re.sub(WHITESPACE_RE, " ", text)
87
+ # Remove banned characters
88
+ if remove_invalid_characters:
89
+ text = "".join([c for c in text if c in symbols])
90
+ return text
91
+
92
+
93
+ if __name__ == "__main__":
94
+ """Script to clean text for training"""
95
+ parser = argparse.ArgumentParser(description="Clean & improve text for training")
96
+ parser.add_argument("-f", "--file", help="Text file path", type=str, required=True)
97
+ parser.add_argument("-o", "--output", help="Output text file path", type=str, required=True)
98
+ args = parser.parse_args()
99
+
100
+ with open(args.file) as f:
101
+ rows = f.readlines()
102
+
103
+ cleaned_text = []
104
+
105
+ for row in rows:
106
+ filename, text = row.split("|")
107
+ text = clean_text(text)
108
+ cleaned_text.append(f"{filename}|{text}")
109
+
110
+ with open(args.output, "w") as f:
111
+ for line in cleaned_text:
112
+ f.write(line)
113
+ f.write("\n")
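As a rough illustration of the normalisation rules above (currency, ordinals, plain numbers, lowercasing); the exact wording of the output depends on inflect, so the expected result shown in the comment is approximate.

# Illustrative only; the exact output depends on inflect's number_to_words.
from training.clean_text import clean_text

print(clean_text("He paid $100 for the 3rd ticket."))
# -> roughly: "he paid one hundred dollars for the third ticket."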
training/tacotron2_model/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from training.tacotron2_model.model import Tacotron2  # noqa
+ from training.tacotron2_model.loss import Tacotron2Loss  # noqa
+ from training.tacotron2_model.collate import TextMelCollate  # noqa
+ from training.tacotron2_model.stft import TacotronSTFT  # noqa
training/tacotron2_model/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (425 Bytes).
 
training/tacotron2_model/__pycache__/audio_processing.cpython-38.pyc ADDED
Binary file (4.33 kB).
 
training/tacotron2_model/__pycache__/collate.cpython-38.pyc ADDED
Binary file (3.37 kB).
 
training/tacotron2_model/__pycache__/layers.cpython-38.pyc ADDED
Binary file (5.01 kB).
 
training/tacotron2_model/__pycache__/loss.cpython-38.pyc ADDED
Binary file (2.5 kB).
 
training/tacotron2_model/__pycache__/model.cpython-38.pyc ADDED
Binary file (17 kB).
 
training/tacotron2_model/__pycache__/stft.cpython-38.pyc ADDED
Binary file (6.47 kB).
 
training/tacotron2_model/__pycache__/utils.cpython-38.pyc ADDED
Binary file (3.78 kB).
 
training/tacotron2_model/audio_processing.py ADDED
@@ -0,0 +1,123 @@
1
+ """
2
+ BSD 3-Clause License
3
+
4
+ Copyright (c) 2018, NVIDIA Corporation
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ """
32
+ import torch
33
+ import numpy as np
34
+ from scipy.signal import get_window
35
+ import librosa.util as librosa_util
36
+
37
+
38
+ def window_sumsquare(window, n_frames, hop_length=200, win_length=800, n_fft=800, dtype=np.float32, norm=None):
39
+ """
40
+ # from librosa 0.6
41
+ Compute the sum-square envelope of a window function at a given hop length.
42
+
43
+ This is used to estimate modulation effects induced by windowing
44
+ observations in short-time fourier transforms.
45
+
46
+ Parameters
47
+ ----------
48
+ window : string, tuple, number, callable, or list-like
49
+ Window specification, as in `get_window`
50
+
51
+ n_frames : int > 0
52
+ The number of analysis frames
53
+
54
+ hop_length : int > 0
55
+ The number of samples to advance between frames
56
+
57
+ win_length : [optional]
58
+ The length of the window function. By default, this matches `n_fft`.
59
+
60
+ n_fft : int > 0
61
+ The length of each analysis frame.
62
+
63
+ dtype : np.dtype
64
+ The data type of the output
65
+
66
+ Returns
67
+ -------
68
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
69
+ The sum-squared envelope of the window function
70
+ """
71
+ if win_length is None:
72
+ win_length = n_fft
73
+
74
+ n = n_fft + hop_length * (n_frames - 1)
75
+ x = np.zeros(n, dtype=dtype)
76
+
77
+ # Compute the squared window at the desired length
78
+ win_sq = get_window(window, win_length, fftbins=True)
79
+ win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
80
+ win_sq = librosa_util.pad_center(win_sq, n_fft)
81
+
82
+ # Fill the envelope
83
+ for i in range(n_frames):
84
+ sample = i * hop_length
85
+ x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
86
+ return x
87
+
88
+
89
+ def griffin_lim(magnitudes, stft_fn, n_iters=30):
90
+ """
91
+ PARAMS
92
+ ------
93
+ magnitudes: spectrogram magnitudes
94
+ stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
95
+ """
96
+
97
+ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
98
+ angles = angles.astype(np.float32)
99
+ angles = torch.autograd.Variable(torch.from_numpy(angles))
100
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
101
+
102
+ for i in range(n_iters):
103
+ _, angles = stft_fn.transform(signal)
104
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
105
+ return signal
106
+
107
+
108
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
109
+ """
110
+ PARAMS
111
+ ------
112
+ C: compression factor
113
+ """
114
+ return torch.log(torch.clamp(x, min=clip_val) * C)
115
+
116
+
117
+ def dynamic_range_decompression(x, C=1):
118
+ """
119
+ PARAMS
120
+ ------
121
+ C: compression factor used to compress
122
+ """
123
+ return torch.exp(x) / C
training/tacotron2_model/collate.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ BSD 3-Clause License
3
+
4
+ Copyright (c) 2018, NVIDIA Corporation
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ """
32
+ import torch
33
+
34
+
35
+ class TextMelCollate:
36
+ """Zero-pads model inputs and targets based on number of frames per setep"""
37
+
38
+ def __init__(self):
39
+ self.n_frames_per_step = 1
40
+
41
+ def __call__(self, batch):
42
+ """Collate's training batch from normalized text and mel-spectrogram
43
+ PARAMS
44
+ ------
45
+ batch: [text_normalized, mel_normalized]
46
+ """
47
+ # Right zero-pad all one-hot text sequences to max input length
48
+ input_lengths, ids_sorted_decreasing = torch.sort(
49
+ torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True
50
+ )
51
+ max_input_len = input_lengths[0]
52
+
53
+ text_padded = torch.LongTensor(len(batch), max_input_len)
54
+ text_padded.zero_()
55
+ for i in range(len(ids_sorted_decreasing)):
56
+ text = batch[ids_sorted_decreasing[i]][0]
57
+ text_padded[i, : text.size(0)] = text
58
+
59
+ # Right zero-pad mel-spec
60
+ num_mels = batch[0][1].size(0)
61
+ max_target_len = max([x[1].size(1) for x in batch])
62
+ if max_target_len % self.n_frames_per_step != 0:
63
+ max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
64
+ assert max_target_len % self.n_frames_per_step == 0
65
+
66
+ # include mel padded and gate padded
67
+ mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
68
+ mel_padded.zero_()
69
+ gate_padded = torch.FloatTensor(len(batch), max_target_len)
70
+ gate_padded.zero_()
71
+ output_lengths = torch.LongTensor(len(batch))
72
+ for i in range(len(ids_sorted_decreasing)):
73
+ mel = batch[ids_sorted_decreasing[i]][1]
74
+ mel_padded[i, :, : mel.size(1)] = mel
75
+ gate_padded[i, mel.size(1) - 1 :] = 1
76
+ output_lengths[i] = mel.size(1)
77
+
78
+ return text_padded, input_lengths, mel_padded, gate_padded, output_lengths
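TextMelCollate is intended as the collate_fn of a DataLoader over (text_sequence, mel_spectrogram) pairs; a toy sketch with random tensors follows (the tiny in-memory dataset is hypothetical, only there to make the example self-contained).

# Toy example: pad a batch of variable-length (text, mel) pairs with TextMelCollate.
import torch
from torch.utils.data import DataLoader
from training.tacotron2_model import TextMelCollate

toy_data = [
    (torch.randint(0, 60, (12,)), torch.randn(80, 150)),  # (text ids, mel with 150 frames)
    (torch.randint(0, 60, (7,)), torch.randn(80, 90)),    # shorter sample, will be zero-padded
]
loader = DataLoader(toy_data, batch_size=2, collate_fn=TextMelCollate())
text_padded, input_lengths, mel_padded, gate_padded, output_lengths = next(iter(loader))
print(text_padded.shape, mel_padded.shape)  # torch.Size([2, 12]) torch.Size([2, 80, 150])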
training/tacotron2_model/layers.py ADDED
@@ -0,0 +1,128 @@
1
+ """
2
+ BSD 3-Clause License
3
+
4
+ Copyright (c) 2018, NVIDIA Corporation
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ """
32
+ import torch
33
+ from librosa.filters import mel as librosa_mel_fn
34
+ from training.tacotron2_model.audio_processing import dynamic_range_compression
35
+ from training.tacotron2_model.audio_processing import dynamic_range_decompression
36
+ from training.tacotron2_model.stft import STFT
37
+
38
+
39
+ class LinearNorm(torch.nn.Module):
40
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain="linear"):
41
+ super(LinearNorm, self).__init__()
42
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
43
+
44
+ torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
45
+
46
+ def forward(self, x):
47
+ return self.linear_layer(x)
48
+
49
+
50
+ class ConvNorm(torch.nn.Module):
51
+ def __init__(
52
+ self,
53
+ in_channels,
54
+ out_channels,
55
+ kernel_size=1,
56
+ stride=1,
57
+ padding=None,
58
+ dilation=1,
59
+ bias=True,
60
+ w_init_gain="linear",
61
+ ):
62
+ super(ConvNorm, self).__init__()
63
+ if padding is None:
64
+ assert kernel_size % 2 == 1
65
+ padding = int(dilation * (kernel_size - 1) / 2)
66
+
67
+ self.conv = torch.nn.Conv1d(
68
+ in_channels,
69
+ out_channels,
70
+ kernel_size=kernel_size,
71
+ stride=stride,
72
+ padding=padding,
73
+ dilation=dilation,
74
+ bias=bias,
75
+ )
76
+
77
+ torch.nn.init.xavier_uniform_(self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
78
+
79
+ def forward(self, signal):
80
+ conv_signal = self.conv(signal)
81
+ return conv_signal
82
+
83
+
84
+ class TacotronSTFT(torch.nn.Module):
85
+ def __init__(
86
+ self,
87
+ filter_length=1024,
88
+ hop_length=256,
89
+ win_length=1024,
90
+ n_mel_channels=80,
91
+ sampling_rate=22050,
92
+ mel_fmin=0.0,
93
+ mel_fmax=8000.0,
94
+ ):
95
+ super(TacotronSTFT, self).__init__()
96
+ self.n_mel_channels = n_mel_channels
97
+ self.sampling_rate = sampling_rate
98
+ self.stft_fn = STFT(filter_length, hop_length, win_length)
99
+ mel_basis = librosa_mel_fn(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
100
+ mel_basis = torch.from_numpy(mel_basis).float()
101
+ self.register_buffer("mel_basis", mel_basis)
102
+
103
+ def spectral_normalize(self, magnitudes):
104
+ output = dynamic_range_compression(magnitudes)
105
+ return output
106
+
107
+ def spectral_de_normalize(self, magnitudes):
108
+ output = dynamic_range_decompression(magnitudes)
109
+ return output
110
+
111
+ def mel_spectrogram(self, y):
112
+ """Computes mel-spectrograms from a batch of waves
113
+ PARAMS
114
+ ------
115
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
116
+
117
+ RETURNS
118
+ -------
119
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
120
+ """
121
+ assert torch.min(y.data) >= -1
122
+ assert torch.max(y.data) <= 1
123
+
124
+ magnitudes, phases = self.stft_fn.transform(y)
125
+ magnitudes = magnitudes.data
126
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
127
+ mel_output = self.spectral_normalize(mel_output)
128
+ return mel_output
training/tacotron2_model/loss.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ BSD 3-Clause License
3
+
4
+ Copyright (c) 2018, NVIDIA Corporation
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ """
32
+ from torch import nn
33
+
34
+
35
+ class Tacotron2Loss(nn.Module):
36
+ def __init__(self):
37
+ super(Tacotron2Loss, self).__init__()
38
+
39
+ def forward(self, model_output, targets):
40
+ mel_target, gate_target = targets[0], targets[1]
41
+ mel_target.requires_grad = False
42
+ gate_target.requires_grad = False
43
+ gate_target = gate_target.view(-1, 1)
44
+
45
+ mel_out, mel_out_postnet, gate_out, _ = model_output
46
+ gate_out = gate_out.view(-1, 1)
47
+ mel_loss = nn.MSELoss()(mel_out, mel_target) + nn.MSELoss()(mel_out_postnet, mel_target)
48
+ gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
49
+ return mel_loss + gate_loss
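
For orientation, a small sketch of the shapes this loss expects, using dummy tensors rather than real model output: mel tensors of shape (B, n_mel_channels, T), gate logits of shape (B, T), and a gate target of the same shape.

import torch
from training.tacotron2_model.loss import Tacotron2Loss

B, n_mels, T = 2, 80, 100
mel_out = torch.randn(B, n_mels, T)
mel_out_postnet = torch.randn(B, n_mels, T)
gate_out = torch.randn(B, T)        # raw logits; BCEWithLogitsLoss applies the sigmoid
alignments = torch.randn(B, T, 40)  # ignored by the loss

mel_target = torch.randn(B, n_mels, T)
gate_target = (torch.rand(B, T) > 0.9).float()  # 1.0 marks stop frames

criterion = Tacotron2Loss()
loss = criterion((mel_out, mel_out_postnet, gate_out, alignments), (mel_target, gate_target))
print(loss.item())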
training/tacotron2_model/model.py ADDED
@@ -0,0 +1,609 @@
1
+ """
2
+ BSD 3-Clause License
3
+
4
+ Copyright (c) 2018, NVIDIA Corporation
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ """
32
+ from math import sqrt
33
+ import torch
34
+ from torch.autograd import Variable
35
+ from torch import nn
36
+ from torch.nn import functional as F
37
+ from training.tacotron2_model.layers import ConvNorm, LinearNorm
38
+ from training.tacotron2_model.utils import to_gpu, get_mask_from_lengths, get_x
39
+
40
+
41
+ class LocationLayer(nn.Module):
42
+ def __init__(self, attention_n_filters, attention_kernel_size, attention_dim):
43
+ super(LocationLayer, self).__init__()
44
+ padding = int((attention_kernel_size - 1) / 2)
45
+ self.location_conv = ConvNorm(
46
+ 2, attention_n_filters, kernel_size=attention_kernel_size, padding=padding, bias=False, stride=1, dilation=1
47
+ )
48
+ self.location_dense = LinearNorm(attention_n_filters, attention_dim, bias=False, w_init_gain="tanh")
49
+
50
+ def forward(self, attention_weights_cat):
51
+ processed_attention = self.location_conv(attention_weights_cat)
52
+ processed_attention = processed_attention.transpose(1, 2)
53
+ processed_attention = self.location_dense(processed_attention)
54
+ return processed_attention
55
+
56
+
57
+ class Attention(nn.Module):
58
+ def __init__(
59
+ self,
60
+ attention_rnn_dim,
61
+ embedding_dim,
62
+ attention_dim,
63
+ attention_location_n_filters,
64
+ attention_location_kernel_size,
65
+ ):
66
+ super(Attention, self).__init__()
67
+ self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, bias=False, w_init_gain="tanh")
68
+ self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, w_init_gain="tanh")
69
+ self.v = LinearNorm(attention_dim, 1, bias=False)
70
+ self.location_layer = LocationLayer(attention_location_n_filters, attention_location_kernel_size, attention_dim)
71
+ self.score_mask_value = -float("inf")
72
+
73
+ def get_alignment_energies(self, query, processed_memory, attention_weights_cat):
74
+ """
75
+ PARAMS
76
+ ------
77
+ query: decoder output (batch, n_mel_channels * n_frames_per_step)
78
+ processed_memory: processed encoder outputs (B, T_in, attention_dim)
79
+ attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
80
+
81
+ RETURNS
82
+ -------
83
+ alignment (batch, max_time)
84
+ """
85
+
86
+ processed_query = self.query_layer(query.unsqueeze(1))
87
+ processed_attention_weights = self.location_layer(attention_weights_cat)
88
+ energies = self.v(torch.tanh(processed_query + processed_attention_weights + processed_memory))
89
+
90
+ energies = energies.squeeze(-1)
91
+ return energies
92
+
93
+ def forward(self, attention_hidden_state, memory, processed_memory, attention_weights_cat, mask):
94
+ """
95
+ PARAMS
96
+ ------
97
+ attention_hidden_state: attention rnn last output
98
+ memory: encoder outputs
99
+ processed_memory: processed encoder outputs
100
+ attention_weights_cat: previous and cumulative attention weights
101
+ mask: binary mask for padded data
102
+ """
103
+ alignment = self.get_alignment_energies(attention_hidden_state, processed_memory, attention_weights_cat)
104
+
105
+ if mask is not None:
106
+ alignment.data.masked_fill_(mask, self.score_mask_value)
107
+
108
+ attention_weights = F.softmax(alignment, dim=1)
109
+ attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
110
+ attention_context = attention_context.squeeze(1)
111
+
112
+ return attention_context, attention_weights
113
+
114
+
115
+ class Prenet(nn.Module):
116
+ def __init__(self, in_dim, sizes):
117
+ super(Prenet, self).__init__()
118
+ in_sizes = [in_dim] + sizes[:-1]
119
+ self.layers = nn.ModuleList(
120
+ [LinearNorm(in_size, out_size, bias=False) for (in_size, out_size) in zip(in_sizes, sizes)]
121
+ )
122
+
123
+ def forward(self, x):
124
+ for linear in self.layers:
125
+ x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
126
+ return x
127
+
128
+
129
+ class Postnet(nn.Module):
130
+ """Postnet
131
+ - Five 1-d convolutions with 512 channels and kernel size 5
132
+ """
133
+
134
+ def __init__(self, n_mel_channels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolutions):
135
+ super(Postnet, self).__init__()
136
+ self.convolutions = nn.ModuleList()
137
+
138
+ self.convolutions.append(
139
+ nn.Sequential(
140
+ ConvNorm(
141
+ n_mel_channels,
142
+ postnet_embedding_dim,
143
+ kernel_size=postnet_kernel_size,
144
+ stride=1,
145
+ padding=int((postnet_kernel_size - 1) / 2),
146
+ dilation=1,
147
+ w_init_gain="tanh",
148
+ ),
149
+ nn.BatchNorm1d(postnet_embedding_dim),
150
+ )
151
+ )
152
+
153
+ for i in range(1, postnet_n_convolutions - 1):
154
+ self.convolutions.append(
155
+ nn.Sequential(
156
+ ConvNorm(
157
+ postnet_embedding_dim,
158
+ postnet_embedding_dim,
159
+ kernel_size=postnet_kernel_size,
160
+ stride=1,
161
+ padding=int((postnet_kernel_size - 1) / 2),
162
+ dilation=1,
163
+ w_init_gain="tanh",
164
+ ),
165
+ nn.BatchNorm1d(postnet_embedding_dim),
166
+ )
167
+ )
168
+
169
+ self.convolutions.append(
170
+ nn.Sequential(
171
+ ConvNorm(
172
+ postnet_embedding_dim,
173
+ n_mel_channels,
174
+ kernel_size=postnet_kernel_size,
175
+ stride=1,
176
+ padding=int((postnet_kernel_size - 1) / 2),
177
+ dilation=1,
178
+ w_init_gain="linear",
179
+ ),
180
+ nn.BatchNorm1d(n_mel_channels),
181
+ )
182
+ )
183
+
184
+ def forward(self, x):
185
+ for i in range(len(self.convolutions) - 1):
186
+ x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
187
+ x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
188
+
189
+ return x
190
+
191
+
192
+ class Encoder(nn.Module):
193
+ """Encoder module:
194
+ - Three 1-d convolution banks
195
+ - Bidirectional LSTM
196
+ """
197
+
198
+ def __init__(self, encoder_kernel_size, encoder_n_convolutions, encoder_embedding_dim):
199
+ super(Encoder, self).__init__()
200
+
201
+ convolutions = []
202
+ for _ in range(encoder_n_convolutions):
203
+ conv_layer = nn.Sequential(
204
+ ConvNorm(
205
+ encoder_embedding_dim,
206
+ encoder_embedding_dim,
207
+ kernel_size=encoder_kernel_size,
208
+ stride=1,
209
+ padding=int((encoder_kernel_size - 1) / 2),
210
+ dilation=1,
211
+ w_init_gain="relu",
212
+ ),
213
+ nn.BatchNorm1d(encoder_embedding_dim),
214
+ )
215
+ convolutions.append(conv_layer)
216
+ self.convolutions = nn.ModuleList(convolutions)
217
+
218
+ self.lstm = nn.LSTM(
219
+ encoder_embedding_dim, int(encoder_embedding_dim / 2), 1, batch_first=True, bidirectional=True
220
+ )
221
+
222
+ def forward(self, x, input_lengths):
223
+ for conv in self.convolutions:
224
+ x = F.dropout(F.relu(conv(x)), 0.5, self.training)
225
+
226
+ x = x.transpose(1, 2)
227
+
228
+ # pytorch tensors are not reversible, hence the conversion
229
+ input_lengths = input_lengths.cpu().numpy()
230
+ x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True)
231
+
232
+ self.lstm.flatten_parameters()
233
+ outputs, _ = self.lstm(x)
234
+
235
+ outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
236
+
237
+ return outputs
238
+
239
+ def inference(self, x):
240
+ for conv in self.convolutions:
241
+ x = F.dropout(F.relu(conv(x)), 0.5, self.training)
242
+
243
+ x = x.transpose(1, 2)
244
+
245
+ self.lstm.flatten_parameters()
246
+ outputs, _ = self.lstm(x)
247
+
248
+ return outputs
249
+
250
+
251
+ class Decoder(nn.Module):
252
+ def __init__(
253
+ self,
254
+ n_mel_channels,
255
+ n_frames_per_step,
256
+ encoder_embedding_dim,
257
+ attention_dim,
258
+ attention_rnn_dim,
259
+ attention_location_n_filters,
260
+ attention_location_kernel_size,
261
+ decoder_rnn_dim,
262
+ prenet_dim,
263
+ max_decoder_steps,
264
+ gate_threshold,
265
+ p_attention_dropout,
266
+ p_decoder_dropout,
267
+ ):
268
+ super(Decoder, self).__init__()
269
+ self.n_mel_channels = n_mel_channels
270
+ self.n_frames_per_step = n_frames_per_step
271
+ self.encoder_embedding_dim = encoder_embedding_dim
272
+ self.attention_rnn_dim = attention_rnn_dim
273
+ self.decoder_rnn_dim = decoder_rnn_dim
274
+ self.prenet_dim = prenet_dim
275
+ self.max_decoder_steps = max_decoder_steps
276
+ self.gate_threshold = gate_threshold
277
+ self.p_attention_dropout = p_attention_dropout
278
+ self.p_decoder_dropout = p_decoder_dropout
279
+
280
+ self.prenet = Prenet(n_mel_channels * n_frames_per_step, [prenet_dim, prenet_dim])
281
+
282
+ self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim, attention_rnn_dim)
283
+
284
+ self.attention_layer = Attention(
285
+ attention_rnn_dim,
286
+ encoder_embedding_dim,
287
+ attention_dim,
288
+ attention_location_n_filters,
289
+ attention_location_kernel_size,
290
+ )
291
+
292
+ self.decoder_rnn = nn.LSTMCell(attention_rnn_dim + encoder_embedding_dim, decoder_rnn_dim, 1)
293
+
294
+ self.linear_projection = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, n_mel_channels * n_frames_per_step)
295
+
296
+ self.gate_layer = LinearNorm(decoder_rnn_dim + encoder_embedding_dim, 1, bias=True, w_init_gain="sigmoid")
297
+
298
+ def get_go_frame(self, memory):
299
+ """Gets all zeros frames to use as first decoder input
300
+ PARAMS
301
+ ------
302
+ memory: encoder outputs
303
+
304
+ RETURNS
305
+ -------
306
+ decoder_input: all zeros frames
307
+ """
308
+ B = memory.size(0)
309
+ decoder_input = Variable(memory.data.new(B, self.n_mel_channels * self.n_frames_per_step).zero_())
310
+ return decoder_input
311
+
312
+ def initialize_decoder_states(self, memory, mask):
313
+ """Initializes attention rnn states, decoder rnn states, attention
314
+ weights, attention cumulative weights, attention context, stores memory
315
+ and stores processed memory
316
+ PARAMS
317
+ ------
318
+ memory: Encoder outputs
319
+ mask: Mask for padded data if training, expects None for inference
320
+ """
321
+ B = memory.size(0)
322
+ MAX_TIME = memory.size(1)
323
+
324
+ self.attention_hidden = Variable(memory.data.new(B, self.attention_rnn_dim).zero_())
325
+ self.attention_cell = Variable(memory.data.new(B, self.attention_rnn_dim).zero_())
326
+
327
+ self.decoder_hidden = Variable(memory.data.new(B, self.decoder_rnn_dim).zero_())
328
+ self.decoder_cell = Variable(memory.data.new(B, self.decoder_rnn_dim).zero_())
329
+
330
+ self.attention_weights = Variable(memory.data.new(B, MAX_TIME).zero_())
331
+ self.attention_weights_cum = Variable(memory.data.new(B, MAX_TIME).zero_())
332
+ self.attention_context = Variable(memory.data.new(B, self.encoder_embedding_dim).zero_())
333
+
334
+ self.memory = memory
335
+ self.processed_memory = self.attention_layer.memory_layer(memory)
336
+ self.mask = mask
337
+
338
+ def parse_decoder_inputs(self, decoder_inputs):
339
+ """Prepares decoder inputs, i.e. mel outputs
340
+ PARAMS
341
+ ------
342
+ decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs
345
+
346
+ RETURNS
347
+ -------
348
+ inputs: processed decoder inputs
349
+
350
+ """
351
+ # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
352
+ decoder_inputs = decoder_inputs.transpose(1, 2)
353
+ decoder_inputs = decoder_inputs.view(
354
+ decoder_inputs.size(0), int(decoder_inputs.size(1) / self.n_frames_per_step), -1
355
+ )
356
+ # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
357
+ decoder_inputs = decoder_inputs.transpose(0, 1)
358
+ return decoder_inputs
359
+
360
+ def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
361
+ """Prepares decoder outputs for output
362
+ PARAMS
363
+ ------
364
+ mel_outputs:
365
+ gate_outputs: gate output energies
366
+ alignments:
367
+
368
+ RETURNS
369
+ -------
370
+ mel_outputs:
371
+ gate_outputs: gate output energies
372
+ alignments:
373
+ """
374
+ # (T_out, B) -> (B, T_out)
375
+ alignments = torch.stack(alignments).transpose(0, 1)
376
+ # (T_out, B) -> (B, T_out)
377
+ gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
378
+ gate_outputs = gate_outputs.contiguous()
379
+ # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
380
+ mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
381
+ # decouple frames per step
382
+ mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, self.n_mel_channels)
383
+ # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
384
+ mel_outputs = mel_outputs.transpose(1, 2)
385
+
386
+ return mel_outputs, gate_outputs, alignments
387
+
388
+ def decode(self, decoder_input):
389
+ """Decoder step using stored states, attention and memory
390
+ PARAMS
391
+ ------
392
+ decoder_input: previous mel output
393
+
394
+ RETURNS
395
+ -------
396
+ mel_output:
397
+ gate_output: gate output energies
398
+ attention_weights:
399
+ """
400
+ cell_input = torch.cat((decoder_input, self.attention_context), -1)
401
+ self.attention_hidden, self.attention_cell = self.attention_rnn(
402
+ cell_input, (self.attention_hidden, self.attention_cell)
403
+ )
404
+ self.attention_hidden = F.dropout(self.attention_hidden, self.p_attention_dropout, self.training)
405
+
406
+ attention_weights_cat = torch.cat(
407
+ (self.attention_weights.unsqueeze(1), self.attention_weights_cum.unsqueeze(1)), dim=1
408
+ )
409
+ self.attention_context, self.attention_weights = self.attention_layer(
410
+ self.attention_hidden, self.memory, self.processed_memory, attention_weights_cat, self.mask
411
+ )
412
+
413
+ self.attention_weights_cum += self.attention_weights
414
+ decoder_input = torch.cat((self.attention_hidden, self.attention_context), -1)
415
+ self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
416
+ decoder_input, (self.decoder_hidden, self.decoder_cell)
417
+ )
418
+ self.decoder_hidden = F.dropout(self.decoder_hidden, self.p_decoder_dropout, self.training)
419
+
420
+ decoder_hidden_attention_context = torch.cat((self.decoder_hidden, self.attention_context), dim=1)
421
+ decoder_output = self.linear_projection(decoder_hidden_attention_context)
422
+
423
+ gate_prediction = self.gate_layer(decoder_hidden_attention_context)
424
+ return decoder_output, gate_prediction, self.attention_weights
425
+
426
+ def forward(self, memory, decoder_inputs, memory_lengths, device):
427
+ """Decoder forward pass for training
428
+ PARAMS
429
+ ------
430
+ memory: Encoder outputs
431
+ decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
432
+ memory_lengths: Encoder output lengths for attention masking.
433
+
434
+ RETURNS
435
+ -------
436
+ mel_outputs: mel outputs from the decoder
437
+ gate_outputs: gate outputs from the decoder
438
+ alignments: sequence of attention weights from the decoder
439
+ """
440
+
441
+ decoder_input = self.get_go_frame(memory).unsqueeze(0)
442
+ decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
443
+ decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
444
+ decoder_inputs = self.prenet(decoder_inputs)
445
+
446
+ self.initialize_decoder_states(memory, mask=~get_mask_from_lengths(memory_lengths, device))
447
+
448
+ mel_outputs, gate_outputs, alignments = [], [], []
449
+ while len(mel_outputs) < decoder_inputs.size(0) - 1:
450
+ decoder_input = decoder_inputs[len(mel_outputs)]
451
+ mel_output, gate_output, attention_weights = self.decode(decoder_input)
452
+ mel_outputs += [mel_output.squeeze(1)]
453
+ gate_outputs += [gate_output.squeeze(1)]
454
+ alignments += [attention_weights]
455
+
456
+ mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)
457
+
458
+ return mel_outputs, gate_outputs, alignments
459
+
460
+ def inference(self, memory, max_decoder_steps=None):
461
+ """Decoder inference
462
+ PARAMS
463
+ ------
464
+ memory: Encoder outputs
465
+
466
+ RETURNS
467
+ -------
468
+ mel_outputs: mel outputs from the decoder
469
+ gate_outputs: gate outputs from the decoder
470
+ alignments: sequence of attention weights from the decoder
471
+ """
472
+ if not max_decoder_steps:
473
+ # Use default max decoder steps if not given
474
+ max_decoder_steps = self.max_decoder_steps
475
+
476
+ decoder_input = self.get_go_frame(memory)
477
+
478
+ self.initialize_decoder_states(memory, mask=None)
479
+
480
+ mel_outputs, gate_outputs, alignments = [], [], []
481
+ while True:
482
+ decoder_input = self.prenet(decoder_input)
483
+ mel_output, gate_output, alignment = self.decode(decoder_input)
484
+
485
+ mel_outputs += [mel_output.squeeze(1)]
486
+ gate_outputs += [gate_output]
487
+ alignments += [alignment]
488
+
489
+ if torch.sigmoid(gate_output.data) > self.gate_threshold:
490
+ break
491
+ elif len(mel_outputs) == max_decoder_steps:
492
+ raise Exception(
493
+ "Warning! Reached max decoder steps. Either the model is low quality or the given sentence is too short/long"
494
+ )
495
+
496
+ decoder_input = mel_output
497
+
498
+ mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)
499
+
500
+ return mel_outputs, gate_outputs, alignments
501
+
502
+
503
+ class Tacotron2(nn.Module):
504
+ def __init__(
505
+ self,
506
+ mask_padding=True,
507
+ fp16_run=False,
508
+ n_mel_channels=80,
509
+ n_symbols=148,
510
+ symbols_embedding_dim=512,
511
+ encoder_kernel_size=5,
512
+ encoder_n_convolutions=3,
513
+ encoder_embedding_dim=512,
514
+ attention_rnn_dim=1024,
515
+ attention_dim=128,
516
+ attention_location_n_filters=32,
517
+ attention_location_kernel_size=31,
518
+ decoder_rnn_dim=1024,
519
+ prenet_dim=256,
520
+ max_decoder_steps=1000,
521
+ gate_threshold=0.5,
522
+ p_attention_dropout=0.1,
523
+ p_decoder_dropout=0.1,
524
+ postnet_embedding_dim=512,
525
+ postnet_kernel_size=5,
526
+ postnet_n_convolutions=5,
527
+ ):
528
+ super(Tacotron2, self).__init__()
529
+ self.mask_padding = mask_padding
530
+ self.fp16_run = fp16_run
531
+ self.n_mel_channels = n_mel_channels
532
+ self.n_frames_per_step = 1
533
+ self.embedding = nn.Embedding(n_symbols, symbols_embedding_dim)
534
+ std = sqrt(2.0 / (n_symbols + symbols_embedding_dim))
535
+ val = sqrt(3.0) * std # uniform bounds for std
536
+ self.embedding.weight.data.uniform_(-val, val)
537
+ self.encoder = Encoder(encoder_kernel_size, encoder_n_convolutions, encoder_embedding_dim)
538
+ self.decoder = Decoder(
539
+ n_mel_channels,
540
+ self.n_frames_per_step,
541
+ encoder_embedding_dim,
542
+ attention_dim,
543
+ attention_rnn_dim,
544
+ attention_location_n_filters,
545
+ attention_location_kernel_size,
546
+ decoder_rnn_dim,
547
+ prenet_dim,
548
+ max_decoder_steps,
549
+ gate_threshold,
550
+ p_attention_dropout,
551
+ p_decoder_dropout,
552
+ )
553
+ self.postnet = Postnet(n_mel_channels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolutions)
554
+
555
+ def parse_batch(self, batch):
556
+ text_padded, input_lengths, mel_padded, gate_padded, output_lengths = batch
557
+ text_padded = to_gpu(text_padded).long()
558
+ input_lengths = to_gpu(input_lengths).long()
559
+ max_len = torch.max(input_lengths.data).item()
560
+ mel_padded = to_gpu(mel_padded).float()
561
+ gate_padded = to_gpu(gate_padded).float()
562
+ output_lengths = to_gpu(output_lengths).long()
563
+
564
+ return ((text_padded, input_lengths, mel_padded, max_len, output_lengths), (mel_padded, gate_padded))
565
+
566
+ def parse_output(self, outputs, output_lengths, mask_size, alignment_mask_size, device):
567
+ if self.mask_padding:
568
+ mask = ~get_mask_from_lengths(output_lengths, device, mask_size)
569
+ mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
570
+ mask = mask.permute(1, 0, 2)
571
+
572
+ outputs[0].data.masked_fill_(mask, 0.0)
573
+ outputs[1].data.masked_fill_(mask, 0.0)
574
+ outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
575
+ if outputs[3].size(2) != alignment_mask_size:
576
+ outputs[3] = nn.ConstantPad1d((0, alignment_mask_size - outputs[3].size(2)), 0)(outputs[3])
577
+
578
+ return outputs
579
+
580
+ def forward(self, inputs, mask_size, alignment_mask_size):
581
+ text_inputs, text_lengths, mels, output_lengths = get_x(inputs)
582
+ device = text_inputs.device
583
+
584
+ text_lengths, output_lengths = text_lengths.data, output_lengths.data
585
+ embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
586
+ encoder_outputs = self.encoder(embedded_inputs, text_lengths)
587
+ mel_outputs, gate_outputs, alignments = self.decoder(
588
+ encoder_outputs, mels, memory_lengths=text_lengths, device=device
589
+ )
590
+ mel_outputs_postnet = self.postnet(mel_outputs)
591
+ mel_outputs_postnet = mel_outputs + mel_outputs_postnet
592
+
593
+ return self.parse_output(
594
+ [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
595
+ output_lengths,
596
+ mask_size,
597
+ alignment_mask_size,
598
+ device,
599
+ )
600
+
601
+ def inference(self, inputs, max_decoder_steps=None):
602
+ embedded_inputs = self.embedding(inputs).transpose(1, 2)
603
+ encoder_outputs = self.encoder.inference(embedded_inputs)
604
+ mel_outputs, gate_outputs, alignments = self.decoder.inference(encoder_outputs, max_decoder_steps)
605
+
606
+ mel_outputs_postnet = self.postnet(mel_outputs)
607
+ mel_outputs_postnet = mel_outputs + mel_outputs_postnet
608
+
609
+ return [mel_outputs, mel_outputs_postnet, gate_outputs, alignments]
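
A sketch of single-sentence inference with the Tacotron2 module above. The text_to_ids helper and the checkpoint layout are assumptions: the real project maps text to symbol ids via its cleaning utilities, and an untrained model may never fire the stop gate, raising at max_decoder_steps instead.

import torch
from training.tacotron2_model.model import Tacotron2

def text_to_ids(text):
    # hypothetical stand-in for the project's text cleaning / symbol lookup
    return torch.LongTensor([[min(ord(c), 147) for c in text.lower()]])

model = Tacotron2()
# state = torch.load("checkpoint.pt", map_location="cpu")  # assumed checkpoint layout
# model.load_state_dict(state["state_dict"])
model.eval()

with torch.no_grad():
    ids = text_to_ids("hello world")
    mel, mel_postnet, gates, alignments = model.inference(ids, max_decoder_steps=1000)
print(mel_postnet.shape)  # (1, 80, T): the spectrogram handed to the vocoder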
training/tacotron2_model/stft.py ADDED
@@ -0,0 +1,187 @@
1
+ """
2
+ BSD 3-Clause License
3
+
4
+ Copyright (c) 2017, Prem Seetharaman
5
+ All rights reserved.
6
+
7
+ * Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice,
11
+ this list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice, this
14
+ list of conditions and the following disclaimer in the
15
+ documentation and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from this
19
+ software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ """
32
+ import torch
33
+ import numpy as np
34
+ import torch.nn.functional as F
35
+ from torch.autograd import Variable
36
+ from scipy.signal import get_window
37
+ from librosa.util import pad_center, tiny
38
+ from librosa.filters import mel as librosa_mel_fn
39
+ from training.tacotron2_model.audio_processing import (
40
+ window_sumsquare,
41
+ dynamic_range_compression,
42
+ dynamic_range_decompression,
43
+ )
44
+
45
+
46
+ class STFT(torch.nn.Module):
47
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
48
+
49
+ def __init__(self, filter_length=800, hop_length=200, win_length=800, window="hann"):
50
+ super(STFT, self).__init__()
51
+ self.filter_length = filter_length
52
+ self.hop_length = hop_length
53
+ self.win_length = win_length
54
+ self.window = window
55
+ self.forward_transform = None
56
+ scale = self.filter_length / self.hop_length
57
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
58
+
59
+ cutoff = int((self.filter_length / 2 + 1))
60
+ fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])])
61
+
62
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
63
+ inverse_basis = torch.FloatTensor(np.linalg.pinv(scale * fourier_basis).T[:, None, :])
64
+
65
+ if window is not None:
66
+ assert filter_length >= win_length
67
+ # get window and zero center pad it to filter_length
68
+ fft_window = get_window(window, win_length, fftbins=True)
69
+ fft_window = pad_center(fft_window, filter_length)
70
+ fft_window = torch.from_numpy(fft_window).float()
71
+
72
+ # window the bases
73
+ forward_basis *= fft_window
74
+ inverse_basis *= fft_window
75
+
76
+ self.register_buffer("forward_basis", forward_basis.float())
77
+ self.register_buffer("inverse_basis", inverse_basis.float())
78
+
79
+ def transform(self, input_data):
80
+ num_batches = input_data.size(0)
81
+ num_samples = input_data.size(1)
82
+
83
+ self.num_samples = num_samples
84
+
85
+ # similar to librosa, reflect-pad the input
86
+ input_data = input_data.view(num_batches, 1, num_samples)
87
+ input_data = F.pad(
88
+ input_data.unsqueeze(1), (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0), mode="reflect"
89
+ )
90
+ input_data = input_data.squeeze(1)
91
+
92
+ forward_transform = F.conv1d(
93
+ input_data, Variable(self.forward_basis, requires_grad=False), stride=self.hop_length, padding=0
94
+ )
95
+
96
+ cutoff = int((self.filter_length / 2) + 1)
97
+ real_part = forward_transform[:, :cutoff, :]
98
+ imag_part = forward_transform[:, cutoff:, :]
99
+
100
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
101
+ phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
102
+
103
+ return magnitude, phase
104
+
105
+ def inverse(self, magnitude, phase):
106
+ recombine_magnitude_phase = torch.cat([magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1)
107
+
108
+ inverse_transform = F.conv_transpose1d(
109
+ recombine_magnitude_phase,
110
+ Variable(self.inverse_basis, requires_grad=False),
111
+ stride=self.hop_length,
112
+ padding=0,
113
+ )
114
+
115
+ if self.window is not None:
116
+ window_sum = window_sumsquare(
117
+ self.window,
118
+ magnitude.size(-1),
119
+ hop_length=self.hop_length,
120
+ win_length=self.win_length,
121
+ n_fft=self.filter_length,
122
+ dtype=np.float32,
123
+ )
124
+ # remove modulation effects
125
+ approx_nonzero_indices = torch.from_numpy(np.where(window_sum > tiny(window_sum))[0])
126
+ window_sum = torch.autograd.Variable(torch.from_numpy(window_sum), requires_grad=False)
127
+ window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
128
+ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
129
+
130
+ # scale by hop ratio
131
+ inverse_transform *= float(self.filter_length) / self.hop_length
132
+
133
+ inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
134
+ inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
135
+
136
+ return inverse_transform
137
+
138
+ def forward(self, input_data):
139
+ self.magnitude, self.phase = self.transform(input_data)
140
+ reconstruction = self.inverse(self.magnitude, self.phase)
141
+ return reconstruction
142
+
143
+
144
+ class TacotronSTFT(torch.nn.Module):
145
+ def __init__(
146
+ self,
147
+ filter_length=1024,
148
+ hop_length=256,
149
+ win_length=1024,
150
+ n_mel_channels=80,
151
+ sampling_rate=22050,
152
+ mel_fmin=0.0,
153
+ mel_fmax=8000.0,
154
+ ):
155
+ super(TacotronSTFT, self).__init__()
156
+ self.n_mel_channels = n_mel_channels
157
+ self.sampling_rate = sampling_rate
158
+ self.stft_fn = STFT(filter_length, hop_length, win_length)
159
+ mel_basis = librosa_mel_fn(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
160
+ mel_basis = torch.from_numpy(mel_basis).float()
161
+ self.register_buffer("mel_basis", mel_basis)
162
+
163
+ def spectral_normalize(self, magnitudes):
164
+ output = dynamic_range_compression(magnitudes)
165
+ return output
166
+
167
+ def spectral_de_normalize(self, magnitudes):
168
+ output = dynamic_range_decompression(magnitudes)
169
+ return output
170
+
171
+ def mel_spectrogram(self, y):
172
+ """Computes mel-spectrograms from a batch of waves
173
+ PARAMS
174
+ ------
175
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
176
+ RETURNS
177
+ -------
178
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
179
+ """
180
+ assert torch.min(y.data) >= -1
181
+ assert torch.max(y.data) <= 1
182
+
183
+ magnitudes, phases = self.stft_fn.transform(y)
184
+ magnitudes = magnitudes.data
185
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
186
+ mel_output = self.spectral_normalize(mel_output)
187
+ return mel_output
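
A small round-trip sketch for the STFT class above on a synthetic tone, runnable on CPU; the signal length and frequency are arbitrary choices, and the inverse is only approximate at the padded edges.

import math
import torch
from training.tacotron2_model.stft import STFT

stft = STFT(filter_length=1024, hop_length=256, win_length=1024, window="hann")
t = torch.linspace(0, 1, 22050)
signal = torch.sin(2 * math.pi * 440.0 * t).unsqueeze(0)  # (1, 22050) in [-1, 1]

magnitude, phase = stft.transform(signal)        # (1, 513, frames) each
reconstruction = stft.inverse(magnitude, phase)  # (1, 1, samples)
print(magnitude.shape, reconstruction.shape)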
training/tacotron2_model/utils.py ADDED
@@ -0,0 +1,90 @@
1
+ """
2
+ BSD 3-Clause License
3
+
4
+ Copyright (c) 2018, NVIDIA Corporation
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice, this
11
+ list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice,
14
+ this list of conditions and the following disclaimer in the documentation
15
+ and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from
19
+ this software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ """
32
+ import numpy as np
33
+ from scipy.io.wavfile import read
34
+ import torch
35
+
36
+
37
+ def get_mask_from_lengths(lengths, device, max_len=None):
38
+ if not max_len:
39
+ max_len = torch.max(lengths).item()
40
+ ids = torch.arange(0, max_len, out=torch.cuda.LongTensor(max_len)).to(device)
41
+ mask = (ids < lengths.to(device).unsqueeze(1)).bool()
42
+ return mask
43
+
44
+
45
+ def load_wav_to_torch(full_path):
46
+ sampling_rate, data = read(full_path)
47
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
48
+
49
+
50
+ def load_filepaths_and_text(filename, split="|"):
51
+ with open(filename, encoding="utf-8") as f:
52
+ filepaths_and_text = [line.strip().split(split) for line in f]
53
+ return filepaths_and_text
54
+
55
+
56
+ def to_gpu(x):
57
+ x = x.contiguous().cuda()
58
+ return torch.autograd.Variable(x)
59
+
60
+
61
+ def get_sizes(data):
62
+ _, input_lengths, _, _, output_lengths = data
63
+ output_length_size = torch.max(output_lengths.data).item()
64
+ input_length_size = torch.max(input_lengths.data).item()
65
+ return input_length_size, output_length_size
66
+
67
+
68
+ def get_y(data):
69
+ _, _, mel_padded, gate_padded, _ = data
70
+ mel_padded = to_gpu(mel_padded).float()
71
+ gate_padded = to_gpu(gate_padded).float()
72
+ return mel_padded, gate_padded
73
+
74
+
75
+ def get_x(data):
76
+ text_padded, input_lengths, mel_padded, _, output_lengths = data
77
+ text_padded = to_gpu(text_padded).long()
78
+ input_lengths = to_gpu(input_lengths).long()
79
+ mel_padded = to_gpu(mel_padded).float()
80
+ output_lengths = to_gpu(output_lengths).long()
81
+
82
+ return text_padded, input_lengths, mel_padded, output_lengths
83
+
84
+
85
+ def process_batch(batch, model):
86
+ input_length_size, output_length_size = get_sizes(batch)
87
+ y = get_y(batch)
88
+ y_pred = model(batch, mask_size=output_length_size, alignment_mask_size=input_length_size)
89
+
90
+ return y, y_pred
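
The filelist helper above assumes pipe-separated metadata, one clip per line; the filename below is an assumption and the exact field layout depends on how the dataset was prepared.

from training.tacotron2_model.utils import load_filepaths_and_text

# assumed format: wavs/clip_0001.wav|Transcript of the clip
entries = load_filepaths_and_text("metadata.csv")
for entry in entries[:3]:
    print(entry)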
weights/custom_pctest/config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 0,
4
+ "batch_size": 16,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [8,8,2,2],
12
+ "upsample_kernel_sizes": [16,16,4,4],
13
+ "upsample_initial_channel": 512,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+
17
+ "segment_size": 8192,
18
+ "num_mels": 80,
19
+ "num_freq": 1025,
20
+ "n_fft": 1024,
21
+ "hop_size": 256,
22
+ "win_size": 1024,
23
+
24
+ "sampling_rate": 22050,
25
+
26
+ "fmin": 0,
27
+ "fmax": 8000,
28
+ "fmax_for_loss": null,
29
+
30
+ "num_workers": 4,
31
+
32
+ "dist_config": {
33
+ "dist_backend": "nccl",
34
+ "dist_url": "tcp://localhost:54321",
35
+ "world_size": 1
36
+ }
37
+ }
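
These vocoder settings have to line up with the acoustic model's mel front-end; a quick, hedged consistency check against the TacotronSTFT defaults used elsewhere in this commit:

import json

with open("weights/custom_pctest/config.json") as f:
    h = json.load(f)

# must match TacotronSTFT: filter_length=1024, hop_length=256, win_length=1024,
# n_mel_channels=80, sampling_rate=22050, fmin=0.0, fmax=8000.0
assert h["n_fft"] == 1024 and h["hop_size"] == 256 and h["win_size"] == 1024
assert h["num_mels"] == 80 and h["sampling_rate"] == 22050
print("vocoder config matches the mel settings of the acoustic model")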
weights/custom_pctest/model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a81953d408ea577ffdef9e4a6ba3d17feb3197db930032ae795ac0663d38fd7
3
+ size 55823149
weights/hifiganvocoderdemo/config.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "resblock": "1",
3
+ "num_gpus": 0,
4
+ "batch_size": 16,
5
+ "learning_rate": 0.0002,
6
+ "adam_b1": 0.8,
7
+ "adam_b2": 0.99,
8
+ "lr_decay": 0.999,
9
+ "seed": 1234,
10
+
11
+ "upsample_rates": [8,8,2,2],
12
+ "upsample_kernel_sizes": [16,16,4,4],
13
+ "upsample_initial_channel": 512,
14
+ "resblock_kernel_sizes": [3,7,11],
15
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
16
+
17
+ "segment_size": 8192,
18
+ "num_mels": 80,
19
+ "num_freq": 1025,
20
+ "n_fft": 1024,
21
+ "hop_size": 256,
22
+ "win_size": 1024,
23
+
24
+ "sampling_rate": 22050,
25
+
26
+ "fmin": 0,
27
+ "fmax": 8000,
28
+ "fmax_for_loss": null,
29
+
30
+ "num_workers": 4,
31
+
32
+ "dist_config": {
33
+ "dist_backend": "nccl",
34
+ "dist_url": "tcp://localhost:54321",
35
+ "world_size": 1
36
+ }
37
+ }
weights/hifiganvocoderdemo/model.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:771eaf4876485a35e25577563d390c262e23c2421e4a8c929eacfde34a5b7a60
3
+ size 55788858