saefro991 committed on
Commit
a80d6a6
1 Parent(s): f4e127f

Update model

Browse files
README.md ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: multilingual
7
+ datasets:
8
+ - masmultts
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `saefro991/tts_bytes_css10_7lang_textpretrain_residual_freeze`
15
+
16
+ This model was trained by Takaaki-Saeki using the masmultts recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 11a7d61312439111d4996d55935ede718d494262
26
+ pip install -e .
27
+ cd egs2/masmultts/tts_byte_css10_adap_residual_freeze
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model saefro991/tts_bytes_css10_7lang_textpretrain_residual_freeze
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/train.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_train_raw_byte
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 1
47
+ num_att_plot: 1
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: null
51
+ dist_rank: null
52
+ local_rank: 0
53
+ dist_master_addr: null
54
+ dist_master_port: null
55
+ dist_launcher: null
56
+ multiprocessing_distributed: false
57
+ unused_parameters: false
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 200
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss
76
+ - min
77
+ - - train
78
+ - loss
79
+ - min
80
+ keep_nbest_models: 3
81
+ nbest_averaging_interval: 0
82
+ grad_clip: 2.0
83
+ grad_clip_type: 2.0
84
+ grad_noise: false
85
+ accum_grad: 4
86
+ no_forward_run: false
87
+ resume: true
88
+ train_dtype: float32
89
+ use_amp: false
90
+ log_interval: null
91
+ use_matplotlib: true
92
+ use_tensorboard: true
93
+ create_graph_in_tensorboard: false
94
+ use_wandb: false
95
+ wandb_project: null
96
+ wandb_id: null
97
+ wandb_entity: null
98
+ wandb_name: null
99
+ wandb_model_log_interval: -1
100
+ detect_anomaly: false
101
+ pretrain_path: null
102
+ init_param:
103
+ - ../tts_pretrain_byte_residual/exp/tts_train_byte/2epoch.pth:tts_pretrain.encoder:tts.encoder
104
+ - ../tts_pretrain_byte_residual/exp/tts_train_byte/2epoch.pth:tts_pretrain.lid_emb:tts.lid_emb
105
+ ignore_init_mismatch: false
106
+ freeze_param:
107
+ - tts.encoder.adapter
108
+ - tts.encoder.embed
109
+ - tts.lid_emb
110
+ num_iters_per_epoch: null
111
+ batch_size: 20
112
+ valid_batch_size: null
113
+ batch_bins: 400000
114
+ valid_batch_bins: null
115
+ train_shape_file:
116
+ - exp/tts_stats_raw_byte/train/text_shape.byte
117
+ - exp/tts_stats_raw_byte/train/speech_shape
118
+ valid_shape_file:
119
+ - exp/tts_stats_raw_byte/valid/text_shape.byte
120
+ - exp/tts_stats_raw_byte/valid/speech_shape
121
+ batch_type: numel
122
+ valid_batch_type: null
123
+ fold_length:
124
+ - 150
125
+ - 204800
126
+ sort_in_batch: descending
127
+ sort_batch: descending
128
+ multiple_iterator: false
129
+ chunk_length: 500
130
+ chunk_shift_ratio: 0.5
131
+ num_cache_chunks: 1024
132
+ train_data_path_and_name_and_type:
133
+ - - /local/11399690.1.gpu/dump/raw/train/text
134
+ - text
135
+ - text
136
+ - - /local/11399690.1.gpu/dump/raw/train/wav.scp
137
+ - speech
138
+ - sound
139
+ - - /local/11399690.1.gpu/dump/xvector/train/xvector.scp
140
+ - spembs
141
+ - kaldi_ark
142
+ - - /local/11399690.1.gpu/dump/raw/train/utt2lid
143
+ - lids
144
+ - text_int
145
+ valid_data_path_and_name_and_type:
146
+ - - /local/11399690.1.gpu/dump/raw/dev/text
147
+ - text
148
+ - text
149
+ - - /local/11399690.1.gpu/dump/raw/dev/wav.scp
150
+ - speech
151
+ - sound
152
+ - - /local/11399690.1.gpu/dump/xvector/dev/xvector.scp
153
+ - spembs
154
+ - kaldi_ark
155
+ - - /local/11399690.1.gpu/dump/raw/dev/utt2lid
156
+ - lids
157
+ - text_int
158
+ allow_variable_data_keys: false
159
+ max_cache_size: 0.0
160
+ max_cache_fd: 32
161
+ valid_max_cache_size: null
162
+ optim: adam
163
+ optim_conf:
164
+ lr: 1.0
165
+ scheduler: noamlr
166
+ scheduler_conf:
167
+ model_size: 512
168
+ warmup_steps: 50000
169
+ token_list:
170
+ - <blank>
171
+ - <unk>
172
+ - '32'
173
+ - '101'
174
+ - '97'
175
+ - '105'
176
+ - '110'
177
+ - '116'
178
+ - '111'
179
+ - '115'
180
+ - '114'
181
+ - '108'
182
+ - '100'
183
+ - '117'
184
+ - '109'
185
+ - '99'
186
+ - '195'
187
+ - '112'
188
+ - '104'
189
+ - '118'
190
+ - '107'
191
+ - '103'
192
+ - '98'
193
+ - '122'
194
+ - '102'
195
+ - '106'
196
+ - '121'
197
+ - '119'
198
+ - '164'
199
+ - '169'
200
+ - '197'
201
+ - '196'
202
+ - '161'
203
+ - '113'
204
+ - '179'
205
+ - '173'
206
+ - '188'
207
+ - '182'
208
+ - '190'
209
+ - '208'
210
+ - '120'
211
+ - '141'
212
+ - '153'
213
+ - '160'
214
+ - '155'
215
+ - '189'
216
+ - '131'
217
+ - '186'
218
+ - '168'
219
+ - '133'
220
+ - '209'
221
+ - '130'
222
+ - '181'
223
+ - '159'
224
+ - '151'
225
+ - '175'
226
+ - '177'
227
+ - '145'
228
+ - '171'
229
+ - '174'
230
+ - '165'
231
+ - '135'
232
+ - '200'
233
+ - '180'
234
+ - '170'
235
+ - '178'
236
+ - '176'
237
+ - '163'
238
+ - '184'
239
+ - '185'
240
+ - '187'
241
+ - '129'
242
+ - '132'
243
+ - '128'
244
+ - '136'
245
+ - '143'
246
+ - '162'
247
+ - '191'
248
+ - '150'
249
+ - '206'
250
+ - '183'
251
+ - '140'
252
+ - '172'
253
+ - '167'
254
+ - '207'
255
+ - '139'
256
+ - '142'
257
+ - '147'
258
+ - '134'
259
+ - '137'
260
+ - '148'
261
+ - '194'
262
+ - '149'
263
+ - '166'
264
+ - '49'
265
+ - '50'
266
+ - '48'
267
+ - '51'
268
+ - '138'
269
+ - '56'
270
+ - '53'
271
+ - '55'
272
+ - '52'
273
+ - '54'
274
+ - '57'
275
+ - '199'
276
+ - '226'
277
+ - '210'
278
+ - '144'
279
+ - '203'
280
+ - '225'
281
+ - '202'
282
+ - '232'
283
+ - '201'
284
+ - '157'
285
+ - '231'
286
+ - '156'
287
+ - '220'
288
+ - <sos/eos>
289
+ odim: null
290
+ model_conf: {}
291
+ use_preprocessor: true
292
+ token_type: byte
293
+ bpemodel: null
294
+ non_linguistic_symbols: null
295
+ cleaner: null
296
+ g2p: byte
297
+ feats_extract: fbank
298
+ feats_extract_conf:
299
+ n_fft: 1024
300
+ hop_length: 256
301
+ win_length: null
302
+ fs: 16000
303
+ fmin: 80
304
+ fmax: 7600
305
+ n_mels: 80
306
+ normalize: global_mvn
307
+ normalize_conf:
308
+ stats_file: exp/tts_stats_raw_byte/train/feats_stats.npz
309
+ tts: transformer
310
+ tts_conf:
311
+ embed_dim: 0
312
+ eprenet_conv_layers: 0
313
+ eprenet_conv_filts: 0
314
+ eprenet_conv_chans: 0
315
+ dprenet_layers: 2
316
+ dprenet_units: 256
317
+ adim: 512
318
+ aheads: 8
319
+ elayers: 6
320
+ eunits: 1024
321
+ dlayers: 6
322
+ dunits: 1024
323
+ positionwise_layer_type: conv1d
324
+ positionwise_conv_kernel_size: 1
325
+ postnet_layers: 5
326
+ postnet_filts: 5
327
+ postnet_chans: 256
328
+ spk_embed_dim: 192
329
+ spk_embed_integration_type: add
330
+ use_gst: true
331
+ gst_heads: 4
332
+ gst_tokens: 16
333
+ use_masking: true
334
+ bce_pos_weight: 5.0
335
+ use_scaled_pos_enc: true
336
+ encoder_normalize_before: true
337
+ decoder_normalize_before: true
338
+ reduction_factor: 1
339
+ init_type: xavier_uniform
340
+ init_enc_alpha: 1.0
341
+ init_dec_alpha: 1.0
342
+ eprenet_dropout_rate: 0.0
343
+ dprenet_dropout_rate: 0.5
344
+ postnet_dropout_rate: 0.5
345
+ transformer_enc_dropout_rate: 0.1
346
+ transformer_enc_positional_dropout_rate: 0.1
347
+ transformer_enc_attn_dropout_rate: 0.1
348
+ transformer_dec_dropout_rate: 0.1
349
+ transformer_dec_positional_dropout_rate: 0.1
350
+ transformer_dec_attn_dropout_rate: 0.1
351
+ transformer_enc_dec_attn_dropout_rate: 0.1
352
+ use_guided_attn_loss: true
353
+ num_heads_applied_guided_attn: 2
354
+ num_layers_applied_guided_attn: 2
355
+ modules_applied_guided_attn:
356
+ - encoder-decoder
357
+ guided_attn_loss_sigma: 0.4
358
+ guided_attn_loss_lambda: 10.0
359
+ langs: 21
360
+ lang_family_encoding: false
361
+ num_lang_family: 7
362
+ use_adapter: true
363
+ adapter_type: residual
364
+ use_encoder_w_lid: true
365
+ pitch_extract: null
366
+ pitch_extract_conf: {}
367
+ pitch_normalize: null
368
+ pitch_normalize_conf: {}
369
+ energy_extract: null
370
+ energy_extract_conf: {}
371
+ energy_normalize: null
372
+ energy_normalize_conf: {}
373
+ required:
374
+ - output_dir
375
+ - token_list
376
+ version: '202209'
377
+ distributed: false
378
+ ```
379
+
380
+ </details>
381
+
382
+
383
+
384
+ ### Citing ESPnet
385
+
386
+ ```bibtex
387
+ @inproceedings{watanabe2018espnet,
388
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
389
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
390
+ year={2018},
391
+ booktitle={Proceedings of Interspeech},
392
+ pages={2207--2211},
393
+ doi={10.21437/Interspeech.2018-1456},
394
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
395
+ }
396
+
397
+
398
+
399
+
400
+ @inproceedings{hayashi2020espnet,
401
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
402
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
403
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
404
+ pages={7654--7658},
405
+ year={2020},
406
+ organization={IEEE}
407
+ }
408
+ ```
409
+
410
+ or arXiv:
411
+
412
+ ```bibtex
413
+ @misc{watanabe2018espnet,
414
+ title={ESPnet: End-to-End Speech Processing Toolkit},
415
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
416
+ year={2018},
417
+ eprint={1804.00015},
418
+ archivePrefix={arXiv},
419
+ primaryClass={cs.CL}
420
+ }
421
+ ```
dump/raw/org/train/lang2lid ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <unk> 0
2
+ cs_cz 1
3
+ de_de 2
4
+ el_gr 3
5
+ en_uk 4
6
+ en_us 5
7
+ es_419 6
8
+ et_ee 7
9
+ fi_fi 8
10
+ fr_fr 9
11
+ hr_hr 10
12
+ hu_hu 11
13
+ it_it 12
14
+ lt_lt 13
15
+ nl_nl 14
16
+ pl_pl 15
17
+ ro_ro 16
18
+ ru_ru 17
19
+ sk_sk 18
20
+ sl_si 19
21
+ uk_ua 20
dump/xvector/test/spk_xvector.ark ADDED
Binary file (5.54 kB). View file
 
dump/xvector/test/spk_xvector.scp ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ css10_de dump/xvector/test/spk_xvector.ark:9
2
+ css10_el dump/xvector/test/spk_xvector.ark:801
3
+ css10_fi dump/xvector/test/spk_xvector.ark:1593
4
+ css10_fr dump/xvector/test/spk_xvector.ark:2385
5
+ css10_hu dump/xvector/test/spk_xvector.ark:3177
6
+ css10_nl dump/xvector/test/spk_xvector.ark:3969
7
+ css10_ru dump/xvector/test/spk_xvector.ark:4761
dump/xvector/train/spk_xvector.ark ADDED
Binary file (5.54 kB). View file
 
dump/xvector/train/spk_xvector.scp ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ css10_de dump/xvector/train/spk_xvector.ark:9
2
+ css10_el dump/xvector/train/spk_xvector.ark:801
3
+ css10_fi dump/xvector/train/spk_xvector.ark:1593
4
+ css10_fr dump/xvector/train/spk_xvector.ark:2385
5
+ css10_hu dump/xvector/train/spk_xvector.ark:3177
6
+ css10_nl dump/xvector/train/spk_xvector.ark:3969
7
+ css10_ru dump/xvector/train/spk_xvector.ark:4761
exp/tts_stats_raw_byte/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed1b26fa9a899031a8727e1a4607db371c918145e6dc39d89cc0556ca2e65237
3
+ size 1402
exp/tts_train_raw_byte/config.yaml ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_raw_byte
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 1
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 200
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 3
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 2.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 4
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param:
66
+ - ../tts_pretrain_byte_residual/exp/tts_train_byte/2epoch.pth:tts_pretrain.encoder:tts.encoder
67
+ - ../tts_pretrain_byte_residual/exp/tts_train_byte/2epoch.pth:tts_pretrain.lid_emb:tts.lid_emb
68
+ ignore_init_mismatch: false
69
+ freeze_param:
70
+ - tts.encoder.adapter
71
+ - tts.encoder.embed
72
+ - tts.lid_emb
73
+ num_iters_per_epoch: null
74
+ batch_size: 20
75
+ valid_batch_size: null
76
+ batch_bins: 400000
77
+ valid_batch_bins: null
78
+ train_shape_file:
79
+ - exp/tts_stats_raw_byte/train/text_shape.byte
80
+ - exp/tts_stats_raw_byte/train/speech_shape
81
+ valid_shape_file:
82
+ - exp/tts_stats_raw_byte/valid/text_shape.byte
83
+ - exp/tts_stats_raw_byte/valid/speech_shape
84
+ batch_type: numel
85
+ valid_batch_type: null
86
+ fold_length:
87
+ - 150
88
+ - 204800
89
+ sort_in_batch: descending
90
+ sort_batch: descending
91
+ multiple_iterator: false
92
+ chunk_length: 500
93
+ chunk_shift_ratio: 0.5
94
+ num_cache_chunks: 1024
95
+ train_data_path_and_name_and_type:
96
+ - - /local/11399690.1.gpu/dump/raw/train/text
97
+ - text
98
+ - text
99
+ - - /local/11399690.1.gpu/dump/raw/train/wav.scp
100
+ - speech
101
+ - sound
102
+ - - /local/11399690.1.gpu/dump/xvector/train/xvector.scp
103
+ - spembs
104
+ - kaldi_ark
105
+ - - /local/11399690.1.gpu/dump/raw/train/utt2lid
106
+ - lids
107
+ - text_int
108
+ valid_data_path_and_name_and_type:
109
+ - - /local/11399690.1.gpu/dump/raw/dev/text
110
+ - text
111
+ - text
112
+ - - /local/11399690.1.gpu/dump/raw/dev/wav.scp
113
+ - speech
114
+ - sound
115
+ - - /local/11399690.1.gpu/dump/xvector/dev/xvector.scp
116
+ - spembs
117
+ - kaldi_ark
118
+ - - /local/11399690.1.gpu/dump/raw/dev/utt2lid
119
+ - lids
120
+ - text_int
121
+ allow_variable_data_keys: false
122
+ max_cache_size: 0.0
123
+ max_cache_fd: 32
124
+ valid_max_cache_size: null
125
+ optim: adam
126
+ optim_conf:
127
+ lr: 1.0
128
+ scheduler: noamlr
129
+ scheduler_conf:
130
+ model_size: 512
131
+ warmup_steps: 50000
132
+ token_list:
133
+ - <blank>
134
+ - <unk>
135
+ - '32'
136
+ - '101'
137
+ - '97'
138
+ - '105'
139
+ - '110'
140
+ - '116'
141
+ - '111'
142
+ - '115'
143
+ - '114'
144
+ - '108'
145
+ - '100'
146
+ - '117'
147
+ - '109'
148
+ - '99'
149
+ - '195'
150
+ - '112'
151
+ - '104'
152
+ - '118'
153
+ - '107'
154
+ - '103'
155
+ - '98'
156
+ - '122'
157
+ - '102'
158
+ - '106'
159
+ - '121'
160
+ - '119'
161
+ - '164'
162
+ - '169'
163
+ - '197'
164
+ - '196'
165
+ - '161'
166
+ - '113'
167
+ - '179'
168
+ - '173'
169
+ - '188'
170
+ - '182'
171
+ - '190'
172
+ - '208'
173
+ - '120'
174
+ - '141'
175
+ - '153'
176
+ - '160'
177
+ - '155'
178
+ - '189'
179
+ - '131'
180
+ - '186'
181
+ - '168'
182
+ - '133'
183
+ - '209'
184
+ - '130'
185
+ - '181'
186
+ - '159'
187
+ - '151'
188
+ - '175'
189
+ - '177'
190
+ - '145'
191
+ - '171'
192
+ - '174'
193
+ - '165'
194
+ - '135'
195
+ - '200'
196
+ - '180'
197
+ - '170'
198
+ - '178'
199
+ - '176'
200
+ - '163'
201
+ - '184'
202
+ - '185'
203
+ - '187'
204
+ - '129'
205
+ - '132'
206
+ - '128'
207
+ - '136'
208
+ - '143'
209
+ - '162'
210
+ - '191'
211
+ - '150'
212
+ - '206'
213
+ - '183'
214
+ - '140'
215
+ - '172'
216
+ - '167'
217
+ - '207'
218
+ - '139'
219
+ - '142'
220
+ - '147'
221
+ - '134'
222
+ - '137'
223
+ - '148'
224
+ - '194'
225
+ - '149'
226
+ - '166'
227
+ - '49'
228
+ - '50'
229
+ - '48'
230
+ - '51'
231
+ - '138'
232
+ - '56'
233
+ - '53'
234
+ - '55'
235
+ - '52'
236
+ - '54'
237
+ - '57'
238
+ - '199'
239
+ - '226'
240
+ - '210'
241
+ - '144'
242
+ - '203'
243
+ - '225'
244
+ - '202'
245
+ - '232'
246
+ - '201'
247
+ - '157'
248
+ - '231'
249
+ - '156'
250
+ - '220'
251
+ - <sos/eos>
252
+ odim: null
253
+ model_conf: {}
254
+ use_preprocessor: true
255
+ token_type: byte
256
+ bpemodel: null
257
+ non_linguistic_symbols: null
258
+ cleaner: null
259
+ g2p: byte
260
+ feats_extract: fbank
261
+ feats_extract_conf:
262
+ n_fft: 1024
263
+ hop_length: 256
264
+ win_length: null
265
+ fs: 16000
266
+ fmin: 80
267
+ fmax: 7600
268
+ n_mels: 80
269
+ normalize: global_mvn
270
+ normalize_conf:
271
+ stats_file: exp/tts_stats_raw_byte/train/feats_stats.npz
272
+ tts: transformer
273
+ tts_conf:
274
+ embed_dim: 0
275
+ eprenet_conv_layers: 0
276
+ eprenet_conv_filts: 0
277
+ eprenet_conv_chans: 0
278
+ dprenet_layers: 2
279
+ dprenet_units: 256
280
+ adim: 512
281
+ aheads: 8
282
+ elayers: 6
283
+ eunits: 1024
284
+ dlayers: 6
285
+ dunits: 1024
286
+ positionwise_layer_type: conv1d
287
+ positionwise_conv_kernel_size: 1
288
+ postnet_layers: 5
289
+ postnet_filts: 5
290
+ postnet_chans: 256
291
+ spk_embed_dim: 192
292
+ spk_embed_integration_type: add
293
+ use_gst: true
294
+ gst_heads: 4
295
+ gst_tokens: 16
296
+ use_masking: true
297
+ bce_pos_weight: 5.0
298
+ use_scaled_pos_enc: true
299
+ encoder_normalize_before: true
300
+ decoder_normalize_before: true
301
+ reduction_factor: 1
302
+ init_type: xavier_uniform
303
+ init_enc_alpha: 1.0
304
+ init_dec_alpha: 1.0
305
+ eprenet_dropout_rate: 0.0
306
+ dprenet_dropout_rate: 0.5
307
+ postnet_dropout_rate: 0.5
308
+ transformer_enc_dropout_rate: 0.1
309
+ transformer_enc_positional_dropout_rate: 0.1
310
+ transformer_enc_attn_dropout_rate: 0.1
311
+ transformer_dec_dropout_rate: 0.1
312
+ transformer_dec_positional_dropout_rate: 0.1
313
+ transformer_dec_attn_dropout_rate: 0.1
314
+ transformer_enc_dec_attn_dropout_rate: 0.1
315
+ use_guided_attn_loss: true
316
+ num_heads_applied_guided_attn: 2
317
+ num_layers_applied_guided_attn: 2
318
+ modules_applied_guided_attn:
319
+ - encoder-decoder
320
+ guided_attn_loss_sigma: 0.4
321
+ guided_attn_loss_lambda: 10.0
322
+ langs: 21
323
+ lang_family_encoding: false
324
+ num_lang_family: 7
325
+ use_adapter: true
326
+ adapter_type: residual
327
+ use_encoder_w_lid: true
328
+ pitch_extract: null
329
+ pitch_extract_conf: {}
330
+ pitch_normalize: null
331
+ pitch_normalize_conf: {}
332
+ energy_extract: null
333
+ energy_extract_conf: {}
334
+ energy_normalize: null
335
+ energy_normalize_conf: {}
336
+ required:
337
+ - output_dir
338
+ - token_list
339
+ version: '202209'
340
+ distributed: false
exp/tts_train_raw_byte/images/backward_time.png ADDED
exp/tts_train_raw_byte/images/bce_loss.png ADDED
exp/tts_train_raw_byte/images/decoder_alpha.png ADDED
exp/tts_train_raw_byte/images/enc_dec_attn_loss.png ADDED
exp/tts_train_raw_byte/images/encoder_alpha.png ADDED
exp/tts_train_raw_byte/images/forward_time.png ADDED
exp/tts_train_raw_byte/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_raw_byte/images/iter_time.png ADDED
exp/tts_train_raw_byte/images/l1_loss.png ADDED
exp/tts_train_raw_byte/images/l2_loss.png ADDED
exp/tts_train_raw_byte/images/lid_loss.png ADDED
exp/tts_train_raw_byte/images/lid_loss_mlm.png ADDED
exp/tts_train_raw_byte/images/loss.png ADDED
exp/tts_train_raw_byte/images/mlm_acc.png ADDED
exp/tts_train_raw_byte/images/mlm_loss.png ADDED
exp/tts_train_raw_byte/images/optim0_lr0.png ADDED
exp/tts_train_raw_byte/images/optim_step_time.png ADDED
exp/tts_train_raw_byte/images/train_time.png ADDED
exp/tts_train_raw_byte/latest.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:508e5fa333678345e16e5c1535a267a26c27a7c0de5dd591bd69bc47f0e8e575
3
+ size 137622133
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202209'
2
+ files:
3
+ model_file: exp/tts_train_raw_byte/latest.pth
4
+ python: "3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]"
5
+ timestamp: 1691377001.040015
6
+ torch: 1.11.0+cu113
7
+ yaml_files:
8
+ train_config: exp/tts_train_raw_byte/config.yaml