StarsAi committed on
Commit 35a8858
1 Parent(s): 8582fd7

Initial commit

.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Output/base-TTS.wav ADDED
Binary file (170 kB)
 
README.md CHANGED
@@ -1,13 +1,13 @@
- ---
- title: Tacatron2 TTS
- emoji: 💻
- colorFrom: purple
- colorTo: yellow
- sdk: gradio
- sdk_version: 4.31.5
- app_file: app.py
- pinned: false
- license: cc-by-nc-4.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Tacatron2 TTS
+ emoji: 💻
+ colorFrom: purple
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 4.31.5
+ app_file: app.py
+ pinned: false
+ license: cc-by-nc-4.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
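The frontmatter above tells Spaces to serve this repository with Gradio 4.31.5 through `app.py`. That file is not shown in this commit view, so the following is only a hypothetical minimal sketch of an `app.py` consistent with this configuration; the `synthesize` function and its return value are illustrative placeholders, not the Space's actual code.

```python
import gradio as gr

def synthesize(text: str) -> str:
    # Placeholder: the real app would run the bundled Tacotron2 + HiFi-GAN
    # models on the input text and return the path of the generated WAV.
    return "Output/base-TTS.wav"

demo = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label="Text to synthesize"),
    outputs=gr.Audio(label="Generated speech", type="filepath"),
    title="Tacatron2 TTS",
)

if __name__ == "__main__":
    demo.launch()
```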
Voice Samples/natural_f1.wav ADDED
Binary file (445 kB)
 
Voice Samples/natural_m.wav ADDED
Binary file (490 kB)
 
Voice Samples/natural_m1.wav ADDED
Binary file (253 kB)
 
Voice Samples/natural_m2.wav ADDED
Binary file (237 kB)
 
__pycache__/TTS.cpython-310.pyc ADDED
Binary file (1.28 kB)
 
__pycache__/app.cpython-310.pyc ADDED
Binary file (1.31 kB)
 
pretrained_models/GraphemeToPhoneme-9b27d6eb840bf95c5aedf15ae8ed1172/ctc_lin.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c72639caba01630cf5ccc9b287b6eb7b79acc2276aa6f5cc23640640ac8f7ee
+ size 177319
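The checkpoint entries in this commit are Git LFS pointer files, not the binary weights themselves: each pointer records the spec version, a sha256 object id, and the payload size in bytes. A minimal sketch of reading those fields back; the `parse_lfs_pointer` helper below is illustrative, not part of this repository.

```python
from pathlib import Path

def parse_lfs_pointer(path: str) -> dict:
    """Split each 'key value' line of a Git LFS pointer file into a dict."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

ptr = parse_lfs_pointer(
    "pretrained_models/GraphemeToPhoneme-9b27d6eb840bf95c5aedf15ae8ed1172/ctc_lin.ckpt"
)
print(ptr["oid"], ptr["size"])  # sha256:7c72639c... 177319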
pretrained_models/GraphemeToPhoneme-9b27d6eb840bf95c5aedf15ae8ed1172/hyperparams.yaml ADDED
@@ -0,0 +1,440 @@
+ # Generated 2022-07-09 from:
+ # /notebooks/speechbrain/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml
+ # yamllint disable
+ # ################################
+ # Model: LSTM (encoder) + GRU (decoder) (tokenized)
+ # Authors:
+ #  Loren Lugosch & Mirco Ravanelli 2020
+ #  Artem Ploujnikov 2021
+ # ################################
+
+ # Seed needs to be set at top of yaml, before objects with parameters are made
+ seed: 1234
+ __set_seed: !apply:torch.manual_seed [!ref <seed>]
+
+
+ # Tokenizers
+ char_tokenize: False
+ char_token_type: unigram # ["unigram", "bpe", "char"]
+ char_token_output: 512
+ char_token_wordwise: True
+ phn_tokenize: False
+ phn_token_type: unigram # ["unigram", "bpe", "char"]
+ phn_token_output: 512 # index(blank/eos/bos/unk) = 0
+ phn_token_wordwise: True
+ character_coverage: 1.0
+
+
+ phonemes_count: 43
+ graphemes_count: 31
+ phonemes_enable_space: True
+
+ ctc_weight: 0.5
+ ctc_window_size: 0
+ homograph_loss_weight: 2.0
+
+ # Model parameters
+ output_neurons: !apply:speechbrain.utils.hparams.choice
+   value: !ref <phn_tokenize>
+   choices:
+     True: !ref <phn_token_output> + 1
+     False: !ref <phonemes_count>
+
+ enc_num_embeddings: !apply:speechbrain.utils.hparams.choice
+   value: !ref <char_tokenize>
+   choices:
+     True: !ref <char_token_output> + 1
+     False: !ref <graphemes_count>
+
+ enc_dropout: 0.5
+ enc_neurons: 512
+ enc_num_layers: 4
+ dec_dropout: 0.5
+ dec_neurons: 512
+ dec_att_neurons: 256
+ dec_num_layers: 4
+ embedding_dim: 512
+
+ # Determines whether to use BOS (beginning-of-sequence) or EOS (end-of-sequence) tokens
+ # Available modes:
+ # raw: no BOS/EOS tokens are added
+ # bos: a beginning-of-sequence token is added
+ # eos: an end-of-sequence token is added
+ grapheme_sequence_mode: bos
+ phoneme_sequence_mode: bos
+
+
+ # Special Token information
+ bos_index: 0
+ eos_index: 1
+ blank_index: 2
+ unk_index: 2
+ token_space_index: 512
+
+
+ # Language Model
+ lm_emb_dim: 256 # dimension of the embeddings
+ lm_rnn_size: 512 # dimension of hidden layers
+ lm_layers: 2 # number of hidden layers
+ lm_output_neurons: 43
+
+ # Beam Searcher
+ beam_search_min_decode_ratio: 0
+ beam_search_max_decode_ratio: 1.0
+ beam_search_beam_size: 16
+ beam_search_beam_size_valid: 16
+ beam_search_eos_threshold: 10.0
+ beam_search_using_max_attn_shift: false
+ beam_search_max_attn_shift: 10
+ beam_search_coverage_penalty: 5.0
+ beam_search_lm_weight: 0.5
+ beam_search_ctc_weight_decode: 0.4
+ beam_search_temperature: 1.25
+ beam_search_temperature_lm: 1.0
+
+ # Word embeddings
+ use_word_emb: true
+ word_emb_model: bert-base-uncased
+ word_emb_dim: 768
+ word_emb_enc_dim: 256
+ word_emb_norm_type: batch
+
+ graphemes:
+ - A
+ - B
+ - C
+ - D
+ - E
+ - F
+ - G
+ - H
+ - I
+ - J
+ - K
+ - L
+ - M
+ - N
+ - O
+ - P
+ - Q
+ - R
+ - S
+ - T
+ - U
+ - V
+ - W
+ - X
+ - Y
+ - Z
+ - "'"
+ - ' '
+
+ phonemes:
+ - AA
+ - AE
+ - AH
+ - AO
+ - AW
+ - AY
+ - B
+ - CH
+ - D
+ - DH
+ - EH
+ - ER
+ - EY
+ - F
+ - G
+ - HH
+ - IH
+ - IY
+ - JH
+ - K
+ - L
+ - M
+ - N
+ - NG
+ - OW
+ - OY
+ - P
+ - R
+ - S
+ - SH
+ - T
+ - TH
+ - UH
+ - UW
+ - V
+ - W
+ - Y
+ - Z
+ - ZH
+ - ' '
+
+ enc_input_dim: !apply:speechbrain.lobes.models.g2p.model.input_dim
+   use_word_emb: !ref <use_word_emb>
+   word_emb_enc_dim: !ref <word_emb_enc_dim>
+   embedding_dim: !ref <embedding_dim>
+
+ phn_char_map: !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
+   tokens: !ref <phonemes>
+
+ char_phn_map: !apply:speechbrain.lobes.models.g2p.dataio.flip_map
+   map_dict: !ref <phn_char_map>
+
+ enc: !new:speechbrain.nnet.RNN.LSTM
+   input_shape: [null, null, !ref <enc_input_dim>]
+   bidirectional: True
+   hidden_size: !ref <enc_neurons>
+   num_layers: !ref <enc_num_layers>
+   dropout: !ref <enc_dropout>
+
+ lin: !new:speechbrain.nnet.linear.Linear
+   input_size: !ref <dec_neurons>
+   n_neurons: !ref <output_neurons>
+   bias: false
+
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
+   input_size: !ref 2 * <enc_neurons>
+   n_neurons: !ref <output_neurons>
+
+ encoder_emb: !new:speechbrain.nnet.embedding.Embedding
+   num_embeddings: !ref <enc_num_embeddings>
+   embedding_dim: !ref <embedding_dim>
+
+ emb: !new:speechbrain.nnet.embedding.Embedding
+   num_embeddings: !ref <output_neurons>
+   embedding_dim: !ref <embedding_dim>
+
+ dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
+   enc_dim: !ref <enc_neurons> * 2
+   input_size: !ref <embedding_dim>
+   rnn_type: gru
+   attn_type: content
+   dropout: !ref <dec_dropout>
+   hidden_size: !ref <dec_neurons>
+   attn_dim: !ref <dec_att_neurons>
+   num_layers: !ref <dec_num_layers>
+
+ word_emb_enc: !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
+   word_emb_dim: !ref <word_emb_dim>
+   word_emb_enc_dim: !ref <word_emb_enc_dim>
+   norm_type: batch
+
+ word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
+   init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
+     model: bert-base-uncased
+
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
+   apply_log: true
+
+ model: !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
+   enc: !ref <enc>
+   encoder_emb: !ref <encoder_emb>
+   emb: !ref <emb>
+   dec: !ref <dec>
+   lin: !ref <lin>
+   out: !ref <log_softmax>
+   use_word_emb: !ref <use_word_emb>
+   word_emb_enc: !ref <word_emb_enc>
+
+ modules:
+   model: !ref <model>
+   enc: !ref <enc>
+   encoder_emb: !ref <encoder_emb>
+   emb: !ref <emb>
+   dec: !ref <dec>
+   lin: !ref <lin>
+   ctc_lin: !ref <ctc_lin>
+   out: !ref <log_softmax>
+   word_emb: !ref <word_emb>
+   word_emb_enc: !ref <word_emb_enc>
+
+ lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
+   embedding_dim: !ref <lm_emb_dim>
+   rnn_layers: !ref <lm_layers>
+   rnn_neurons: !ref <lm_rnn_size>
+   output_neurons: !ref <lm_output_neurons>
+   return_hidden: True
+
+ ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
+   eos_index: !ref <eos_index>
+   blank_index: !ref <blank_index>
+   ctc_fc: !ref <ctc_lin>
+   ctc_window_size: !ref <ctc_window_size>
+
+ coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
+   vocab_size: !ref <output_neurons>
+
+ scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
+   full_scorers: [!ref <coverage_scorer>, !ref <ctc_scorer>]
+   weights:
+     coverage: !ref <beam_search_coverage_penalty>
+     ctc: !ref <ctc_weight>
+
+ beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
+   embedding: !ref <emb>
+   decoder: !ref <dec>
+   linear: !ref <lin>
+   bos_index: !ref <bos_index>
+   eos_index: !ref <eos_index>
+   min_decode_ratio: !ref <beam_search_min_decode_ratio>
+   max_decode_ratio: !ref <beam_search_max_decode_ratio>
+   beam_size: !ref <beam_search_beam_size>
+   eos_threshold: !ref <beam_search_eos_threshold>
+   using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
+   max_attn_shift: !ref <beam_search_max_attn_shift>
+   temperature: !ref <beam_search_temperature>
+   scorer: !ref <scorer>
+
+ beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
+   embedding: !ref <emb>
+   decoder: !ref <dec>
+   linear: !ref <lin>
+   bos_index: !ref <bos_index>
+   eos_index: !ref <eos_index>
+   min_decode_ratio: !ref <beam_search_min_decode_ratio>
+   max_decode_ratio: !ref <beam_search_max_decode_ratio>
+   beam_size: !ref <beam_search_beam_size>
+   eos_threshold: !ref <beam_search_eos_threshold>
+   using_max_attn_shift: !ref <beam_search_using_max_attn_shift>
+   max_attn_shift: !ref <beam_search_max_attn_shift>
+   temperature: !ref <beam_search_temperature>
+   scorer: !ref <scorer>
+
+ homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
+
+ model_output_keys:
+ - p_seq
+ - char_lens
+ - encoder_out
+
+ grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
+ phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
+
+
+ grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
+   init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
+     model_dir: grapheme_tokenizer
+     bos_id: !ref <bos_index>
+     eos_id: !ref <eos_index>
+     unk_id: !ref <unk_index>
+     vocab_size: !ref <char_token_output>
+     annotation_train: null
+     annotation_read: char
+     model_type: !ref <char_token_type> # ["unigram", "bpe", "char"]
+     character_coverage: !ref <character_coverage>
+     annotation_format: json
+     text_file: grapheme_annotations.txt
+
+ phoneme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
+   init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
+     model_dir: phoneme_tokenizer
+     bos_id: !ref <bos_index>
+     eos_id: !ref <eos_index>
+     unk_id: !ref <unk_index>
+     vocab_size: !ref <phn_token_output>
+     annotation_train: null
+     annotation_read: phn
+     model_type: !ref <phn_token_type> # ["unigram", "bpe", "char"]
+     character_coverage: !ref <character_coverage>
+     annotation_format: json
+     text_file: null
+
+ out_phoneme_decoder_tok: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
+   tokenizer: !ref <phoneme_tokenizer>
+   char_map: !ref <char_phn_map>
+   token_space_index: !ref <token_space_index>
+   wordwise: !ref <phn_token_wordwise>
+
+ out_phoneme_decoder_raw: !name:speechbrain.lobes.models.g2p.dataio.text_decode
+   encoder: !ref <phoneme_encoder>
+
+ out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
+   value: false
+   choices:
+     True: !ref <out_phoneme_decoder_tok>
+     False: !ref <out_phoneme_decoder_raw>
+ encode_pipeline:
+   batch: false
+   use_padded_data: true
+   output_keys:
+   - grapheme_list
+   - grapheme_encoded_list
+   - grapheme_encoded
+   - word_emb
+   init:
+   - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
+       encoder: !ref <grapheme_encoder>
+       tokens: !ref <graphemes>
+       bos_index: !ref <bos_index>
+       eos_index: !ref <eos_index>
+   - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
+       encoder: !ref <phoneme_encoder>
+       tokens: !ref <phonemes>
+       bos_index: !ref <bos_index>
+       eos_index: !ref <eos_index>
+   steps:
+   - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
+       graphemes: !ref <graphemes>
+     takes: txt
+     provides: txt_cleaned
+   - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
+       grapheme_encoder: !ref <grapheme_encoder>
+     takes: txt_cleaned
+     provides:
+     - grapheme_list
+     - grapheme_encoded_list
+     - grapheme_encoded_raw
+
+   - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
+       encoder: !ref <grapheme_encoder>
+     takes: grapheme_encoded_list
+     provides:
+     - grapheme_encoded
+     - grapheme_len
+     - grapheme_encoded_eos
+     - grapheme_len_eos
+   - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
+       word_emb: !ref <word_emb>
+       grapheme_encoder: !ref <grapheme_encoder>
+       use_word_emb: !ref <use_word_emb>
+     takes:
+     - txt
+     - grapheme_encoded
+     - grapheme_len
+     provides: word_emb
+
+ decode_pipeline:
+   batch: true
+   output_keys:
+   - phonemes
+   steps:
+   - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
+       beam_searcher: !ref <beam_searcher>
+     takes:
+     - char_lens
+     - encoder_out
+     provides:
+     - hyps
+     - scores
+   - func: !apply:speechbrain.utils.hparams.choice
+       value: false
+       choices:
+         True: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
+           tokenizer: !ref <phoneme_tokenizer>
+           char_map: !ref <char_phn_map>
+           token_space_index: !ref <token_space_index>
+           wordwise: !ref <phn_token_wordwise>
+         False: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
+           phoneme_encoder: !ref <phoneme_encoder>
+     takes:
+     - hyps
+     provides:
+     - phonemes
+
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+   loadables:
+     model: !ref <model>
+     ctc_lin: !ref <ctc_lin>
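These hyperparameters are the standard SpeechBrain SoundChoice G2P configuration, and the Tacotron2 file later in this commit points its `g2p:` key at `speechbrain/soundchoice-g2p`, so the checkpoint presumably mirrors that public model. A sketch of the usual way to load and run it (the class path assumes SpeechBrain 1.x; on 0.5.x the import is `speechbrain.pretrained` instead):

```python
from speechbrain.inference.text import GraphemeToPhoneme

# Download (or reuse a cached copy of) the public SoundChoice G2P model.
g2p = GraphemeToPhoneme.from_hparams(
    source="speechbrain/soundchoice-g2p",
    savedir="pretrained_models/soundchoice-g2p",
)

# Calling the model converts raw text into a phoneme sequence.
phonemes = g2p("To be or not to be, that is the question")
print(phonemes)
```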
pretrained_models/GraphemeToPhoneme-9b27d6eb840bf95c5aedf15ae8ed1172/model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71bf7a7b290f88de5fdd7364fa4ab249bdd94a29e6cdc742ee6f69edeae64f61
+ size 128643257
pretrained_models/MelSpectrogramEncoder-834735cd05736696f9a5c70acdba6396/embedding_model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:48f8e8ec25cb26f7217e843a9535345e5a62da1aab71d7e99ce734fcfad4f421
+ size 83310835
pretrained_models/MelSpectrogramEncoder-834735cd05736696f9a5c70acdba6396/hyperparams.yaml ADDED
@@ -0,0 +1,37 @@
+ sample_rate: 16000
+ hop_length: 256
+ win_length: 1024
+ n_mel_channels: 80
+ n_fft: 1024
+ mel_fmin: 0.0
+ mel_fmax: 8000.0
+ mel_normalized: False
+ power: 1
+ norm: "slaney"
+ mel_scale: "slaney"
+ dynamic_range_compression: True
+
+ # Modules
+ embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
+   input_size: !ref <n_mel_channels>
+   channels: [1024, 1024, 1024, 1024, 3072]
+   kernel_sizes: [5, 3, 3, 3, 1]
+   dilations: [1, 2, 3, 4, 1]
+   groups: [1, 1, 1, 1, 1]
+   attention_channels: 128
+   lin_neurons: 192
+
+
+ normalizer: !new:speechbrain.processing.features.InputNormalization
+   norm_type: sentence
+   std_norm: False
+
+
+ modules:
+   normalizer: !ref <normalizer>
+   embedding_model: !ref <embedding_model>
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+   loadables:
+     normalizer: !ref <normalizer>
+     embedding_model: !ref <embedding_model>
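This encoder is an ECAPA-TDNN that maps 80-bin mel spectrograms of 16 kHz speech to 192-dimensional speaker embeddings (`lin_neurons: 192`), matching the `spk_emb_encoder: speechbrain/spkrec-ecapa-voxceleb-mel-spec` reference further down. A sketch of loading it, assuming the checkpoint mirrors that public model and that SpeechBrain 1.x's `MelSpectrogramEncoder` interface is available:

```python
import torchaudio
from speechbrain.inference.encoders import MelSpectrogramEncoder

spk_emb_encoder = MelSpectrogramEncoder.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb-mel-spec",
    savedir="pretrained_models/spkrec-ecapa-voxceleb-mel-spec",
)

# The encoder expects 16 kHz input (sample_rate: 16000 above).
signal, sr = torchaudio.load("Voice Samples/natural_m.wav")
signal = torchaudio.functional.resample(signal, sr, 16000)
embedding = spk_emb_encoder.encode_waveform(signal)  # 192-dim speaker embedding
```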
pretrained_models/MelSpectrogramEncoder-834735cd05736696f9a5c70acdba6396/normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01dd629a2a24b29ad133b15930494e168114b788c4d7579ac2862e3d406e00fb
+ size 1063
pretrained_models/tts-hifigan-libritts-22050Hz/generator.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db0d1249e2c957dca1021749c43334b9c3190664d7c7e386c5c16bef62fd1574
+ size 55828077
pretrained_models/tts-hifigan-libritts-22050Hz/hyperparams.yaml ADDED
@@ -0,0 +1,31 @@
+ in_channels: 80
+ out_channels: 1
+ resblock_type: "1"
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+ resblock_kernel_sizes: [3, 7, 11]
+ upsample_kernel_sizes: [16, 16, 4, 4]
+ upsample_initial_channel: 512
+ upsample_factors: [8, 8, 2, 2]
+ inference_padding: 5
+ cond_channels: 0
+ conv_post_bias: True
+
+ generator: !new:speechbrain.lobes.models.HifiGAN.HifiganGenerator
+   in_channels: !ref <in_channels>
+   out_channels: !ref <out_channels>
+   resblock_type: !ref <resblock_type>
+   resblock_dilation_sizes: !ref <resblock_dilation_sizes>
+   resblock_kernel_sizes: !ref <resblock_kernel_sizes>
+   upsample_kernel_sizes: !ref <upsample_kernel_sizes>
+   upsample_initial_channel: !ref <upsample_initial_channel>
+   upsample_factors: !ref <upsample_factors>
+   inference_padding: !ref <inference_padding>
+   cond_channels: !ref <cond_channels>
+   conv_post_bias: !ref <conv_post_bias>
+
+ modules:
+   generator: !ref <generator>
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+   loadables:
+     generator: !ref <generator>
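This is the stock SpeechBrain HiFi-GAN generator configuration: the upsample factors 8 × 8 × 2 × 2 = 256 match the mel features' `hop_length: 256`, so each mel frame becomes 256 audio samples at 22.05 kHz. A sketch of vocoding a batch of 80-channel mel spectrograms, assuming the checkpoint matches the public `speechbrain/tts-hifigan-libritts-22050Hz` model:

```python
import torch
from speechbrain.inference.vocoders import HIFIGAN

hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-libritts-22050Hz",
    savedir="pretrained_models/tts-hifigan-libritts-22050Hz",
)

# (batch, n_mel_channels, frames); random input just to illustrate the shapes.
mel_specs = torch.rand(2, 80, 298)
waveforms = hifi_gan.decode_batch(mel_specs)  # -> (batch, 1, frames * 256)
```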
pretrained_models/tts-mstacotron2-libritts/hyperparams.yaml ADDED
@@ -0,0 +1,122 @@
+ ################################
+ # Audio Parameters             #
+ ################################
+ sample_rate: 22050
+ hop_length: 256
+ win_length: 1024
+ n_mel_channels: 80
+ n_fft: 1024
+ mel_fmin: 0.0
+ mel_fmax: 8000.0
+ mel_normalized: False
+ power: 1
+ norm: "slaney"
+ mel_scale: "slaney"
+ dynamic_range_compression: True
+
+ ################################
+ # Speaker Embedding Parameters #
+ ################################
+
+ spk_emb_size: 192
+ spk_emb_sample_rate: 16000
+ custom_mel_spec_encoder: True
+ spk_emb_encoder: speechbrain/spkrec-ecapa-voxceleb-mel-spec
+ random_speaker_sampler: random_speaker_sampler.ckpt
+ random_speaker_sampler_source: speechbrain/tts-mstacotron2-libritts
+
+ ################################
+ # Optimization Hyperparameters #
+ ################################
+ mask_padding: True
+
+ ################################
+ # Model Parameters and model   #
+ ################################
+ n_symbols: 148 #fixed depending on symbols in textToSequence
+ symbols_embedding_dim: 1024
+
+ # Encoder parameters
+ encoder_kernel_size: 5
+ encoder_n_convolutions: 6
+ encoder_embedding_dim: 1024
+
+ # Decoder parameters
+ # The number of frames in the target per encoder step
+ n_frames_per_step: 1
+ decoder_rnn_dim: 2048
+ prenet_dim: 512
+ max_decoder_steps: 1500
+ gate_threshold: 0.5
+ p_attention_dropout: 0.1
+ p_decoder_dropout: 0.1
+ decoder_no_early_stopping: False
+
+ # Attention parameters
+ attention_rnn_dim: 2048
+ attention_dim: 256
+
+ # Location Layer parameters
+ attention_location_n_filters: 32
+ attention_location_kernel_size: 31
+
+ # Mel-post processing network parameters
+ postnet_embedding_dim: 1024
+ postnet_kernel_size: 5
+ postnet_n_convolutions: 10
+
+ mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
+   sample_rate: !ref <sample_rate>
+   hop_length: !ref <hop_length>
+   win_length: !ref <win_length>
+   n_fft: !ref <n_fft>
+   n_mels: !ref <n_mel_channels>
+   f_min: !ref <mel_fmin>
+   f_max: !ref <mel_fmax>
+   power: !ref <power>
+   normalized: !ref <mel_normalized>
+   norm: !ref <norm>
+   mel_scale: !ref <mel_scale>
+   compression: !ref <dynamic_range_compression>
+
+ #model
+ model: !new:speechbrain.lobes.models.MSTacotron2.Tacotron2
+   mask_padding: !ref <mask_padding>
+   n_mel_channels: !ref <n_mel_channels>
+   # symbols
+   n_symbols: !ref <n_symbols>
+   symbols_embedding_dim: !ref <symbols_embedding_dim>
+   # encoder
+   encoder_kernel_size: !ref <encoder_kernel_size>
+   encoder_n_convolutions: !ref <encoder_n_convolutions>
+   encoder_embedding_dim: !ref <encoder_embedding_dim>
+   # attention
+   attention_rnn_dim: !ref <attention_rnn_dim>
+   attention_dim: !ref <attention_dim>
+   # attention location
+   attention_location_n_filters: !ref <attention_location_n_filters>
+   attention_location_kernel_size: !ref <attention_location_kernel_size>
+   # decoder
+   n_frames_per_step: !ref <n_frames_per_step>
+   decoder_rnn_dim: !ref <decoder_rnn_dim>
+   prenet_dim: !ref <prenet_dim>
+   max_decoder_steps: !ref <max_decoder_steps>
+   gate_threshold: !ref <gate_threshold>
+   p_attention_dropout: !ref <p_attention_dropout>
+   p_decoder_dropout: !ref <p_decoder_dropout>
+   # postnet
+   postnet_embedding_dim: !ref <postnet_embedding_dim>
+   postnet_kernel_size: !ref <postnet_kernel_size>
+   postnet_n_convolutions: !ref <postnet_n_convolutions>
+   decoder_no_early_stopping: !ref <decoder_no_early_stopping>
+   # speaker embeddings
+   spk_emb_size: !ref <spk_emb_size>
+
+ modules:
+   model: !ref <model>
+
+ g2p: speechbrain/soundchoice-g2p
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+   loadables:
+     model: !ref <model>
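This is the zero-shot multi-speaker Tacotron2: the 192-dimensional embedding from the mel-spectrogram speaker encoder above conditions the decoder (`spk_emb_size: 192`), and `g2p: speechbrain/soundchoice-g2p` handles phonemization. A sketch of cloning a reference voice, following the usage documented for the public `speechbrain/tts-mstacotron2-libritts` model that this checkpoint appears to mirror:

```python
from speechbrain.inference.TTS import MSTacotron2
from speechbrain.inference.vocoders import HIFIGAN

ms_tacotron2 = MSTacotron2.from_hparams(
    source="speechbrain/tts-mstacotron2-libritts",
    savedir="pretrained_models/tts-mstacotron2-libritts",
)
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-libritts-22050Hz",
    savedir="pretrained_models/tts-hifigan-libritts-22050Hz",
)

# Speak new text in the voice of one of the bundled reference samples.
mel_outputs, mel_lengths, alignments = ms_tacotron2.clone_voice(
    "Mary had a little lamb.", "Voice Samples/natural_m.wav"
)
waveforms = hifi_gan.decode_batch(mel_outputs)
```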
pretrained_models/tts-mstacotron2-libritts/model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d66c6511e9cff9d7a9e956ebdd250257a3a61fe76691929c647c3c3ee6969464
+ size 619239275
tmpdir_tts/hyperparams.yaml ADDED
@@ -0,0 +1,70 @@
+ # ################################
+ # Model: Tacotron2 for TTS
+ # Authors: Artem Ploujnikov, Yingzhi Wang
+ # ################################
+
+ mask_padding: True
+ n_mel_channels: 80
+ n_symbols: 148
+ symbols_embedding_dim: 512
+ encoder_kernel_size: 5
+ encoder_n_convolutions: 3
+ encoder_embedding_dim: 512
+ attention_rnn_dim: 1024
+ attention_dim: 128
+ attention_location_n_filters: 32
+ attention_location_kernel_size: 31
+ n_frames_per_step: 1
+ decoder_rnn_dim: 1024
+ prenet_dim: 256
+ max_decoder_steps: 1000
+ gate_threshold: 0.5
+ p_attention_dropout: 0.1
+ p_decoder_dropout: 0.1
+ postnet_embedding_dim: 512
+ postnet_kernel_size: 5
+ postnet_n_convolutions: 5
+ decoder_no_early_stopping: False
+ sample_rate: 22050
+
+ # Model
+ model: !new:speechbrain.lobes.models.Tacotron2.Tacotron2
+   mask_padding: !ref <mask_padding>
+   n_mel_channels: !ref <n_mel_channels>
+   # symbols
+   n_symbols: !ref <n_symbols>
+   symbols_embedding_dim: !ref <symbols_embedding_dim>
+   # encoder
+   encoder_kernel_size: !ref <encoder_kernel_size>
+   encoder_n_convolutions: !ref <encoder_n_convolutions>
+   encoder_embedding_dim: !ref <encoder_embedding_dim>
+   # attention
+   attention_rnn_dim: !ref <attention_rnn_dim>
+   attention_dim: !ref <attention_dim>
+   # attention location
+   attention_location_n_filters: !ref <attention_location_n_filters>
+   attention_location_kernel_size: !ref <attention_location_kernel_size>
+   # decoder
+   n_frames_per_step: !ref <n_frames_per_step>
+   decoder_rnn_dim: !ref <decoder_rnn_dim>
+   prenet_dim: !ref <prenet_dim>
+   max_decoder_steps: !ref <max_decoder_steps>
+   gate_threshold: !ref <gate_threshold>
+   p_attention_dropout: !ref <p_attention_dropout>
+   p_decoder_dropout: !ref <p_decoder_dropout>
+   # postnet
+   postnet_embedding_dim: !ref <postnet_embedding_dim>
+   postnet_kernel_size: !ref <postnet_kernel_size>
+   postnet_n_convolutions: !ref <postnet_n_convolutions>
+   decoder_no_early_stopping: !ref <decoder_no_early_stopping>
+
+ # Function that converts the text into a sequence of valid characters.
+ text_to_sequence: !name:speechbrain.utils.text_to_sequence.text_to_sequence
+
+ modules:
+   model: !ref <model>
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+   loadables:
+     model: !ref <model>
+
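These are the default single-speaker Tacotron2 hyperparameters, identical to the public `speechbrain/tts-tacotron2-ljspeech` model, which is presumably what was downloaded into `tmpdir_tts`. A sketch of text-to-mel inference under that assumption:

```python
from speechbrain.inference.TTS import Tacotron2

tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="tmpdir_tts",
)

# encode_text returns the mel spectrogram, its length, and the attention alignment.
mel_output, mel_length, alignment = tacotron2.encode_text("Mary had a little lamb.")
```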
tmpdir_tts/model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:02d1aa3fc1cc8fb0981895c765e9d0017416bb78861b5450e458dda92e567856
+ size 112830206
tmpdir_vocoder/generator.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a389f24ffdc3769e0b9c0331f9d803ad0b78185e0da4f2d094e29eedbc11640
+ size 55828077
tmpdir_vocoder/hyperparams.yaml ADDED
@@ -0,0 +1,37 @@
+ # ################################
+ # Model: Tacotron2 for TTS
+ # Authors: Yingzhi Wang
+ # ################################
+
+ in_channels: 80
+ out_channels: 1
+ resblock_type: "1"
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+ resblock_kernel_sizes: [3, 7, 11]
+ upsample_kernel_sizes: [16, 16, 4, 4]
+ upsample_initial_channel: 512
+ upsample_factors: [8, 8, 2, 2]
+ inference_padding: 5
+ cond_channels: 0
+ conv_post_bias: True
+
+ generator: !new:speechbrain.lobes.models.HifiGAN.HifiganGenerator
+   in_channels: !ref <in_channels>
+   out_channels: !ref <out_channels>
+   resblock_type: !ref <resblock_type>
+   resblock_dilation_sizes: !ref <resblock_dilation_sizes>
+   resblock_kernel_sizes: !ref <resblock_kernel_sizes>
+   upsample_kernel_sizes: !ref <upsample_kernel_sizes>
+   upsample_initial_channel: !ref <upsample_initial_channel>
+   upsample_factors: !ref <upsample_factors>
+   inference_padding: !ref <inference_padding>
+   cond_channels: !ref <cond_channels>
+   conv_post_bias: !ref <conv_post_bias>
+
+ modules:
+   generator: !ref <generator>
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+   loadables:
+     generator: !ref <generator>
+
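Chaining the `tmpdir_tts` Tacotron2 with this vocoder gives the basic single-speaker pipeline that presumably produced `Output/base-TTS.wav`. A sketch under the same assumptions as above; the vocoder source is assumed to be the public `speechbrain/tts-hifigan-ljspeech` checkpoint, which shares this architecture:

```python
import torchaudio
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN

tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts"
)
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder"
)

# Text -> mel spectrogram -> waveform at 22.05 kHz.
mel_output, mel_length, alignment = tacotron2.encode_text(
    "This is a test run of the text to speech pipeline."
)
waveforms = hifi_gan.decode_batch(mel_output)  # (1, 1, samples)
torchaudio.save("Output/base-TTS.wav", waveforms.squeeze(1), 22050)
```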