satoshi.2020 committed on
Commit
b059fd8
1 Parent(s): de4005f

Update model

Browse files
Files changed (35) hide show
  1. README.md +508 -0
  2. exp/tts_stats_raw_phn_null_g2pk/train/energy_stats.npz +0 -0
  3. exp/tts_stats_raw_phn_null_g2pk/train/feats_stats.npz +0 -0
  4. exp/tts_stats_raw_phn_null_g2pk/train/pitch_stats.npz +0 -0
  5. exp/tts_train_jets_raw_phn_null_g2pk/config.yaml +433 -0
  6. exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_backward_time.png +0 -0
  7. exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_fake_loss.png +0 -0
  8. exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_forward_time.png +0 -0
  9. exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_loss.png +0 -0
  10. exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_optim_step_time.png +0 -0
  11. exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_real_loss.png +0 -0
  12. exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_train_time.png +0 -0
  13. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_align_bin_loss.png +0 -0
  14. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_align_forwardsum_loss.png +0 -0
  15. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_align_loss.png +0 -0
  16. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_backward_time.png +0 -0
  17. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_forward_time.png +0 -0
  18. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_g_adv_loss.png +0 -0
  19. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_g_feat_match_loss.png +0 -0
  20. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_g_loss.png +0 -0
  21. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_g_mel_loss.png +0 -0
  22. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_loss.png +0 -0
  23. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_optim_step_time.png +0 -0
  24. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_train_time.png +0 -0
  25. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_var_dur_loss.png +0 -0
  26. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_var_energy_loss.png +0 -0
  27. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_var_loss.png +0 -0
  28. exp/tts_train_jets_raw_phn_null_g2pk/images/generator_var_pitch_loss.png +0 -0
  29. exp/tts_train_jets_raw_phn_null_g2pk/images/gpu_max_cached_mem_GB.png +0 -0
  30. exp/tts_train_jets_raw_phn_null_g2pk/images/iter_time.png +0 -0
  31. exp/tts_train_jets_raw_phn_null_g2pk/images/optim0_lr0.png +0 -0
  32. exp/tts_train_jets_raw_phn_null_g2pk/images/optim1_lr0.png +0 -0
  33. exp/tts_train_jets_raw_phn_null_g2pk/images/train_time.png +0 -0
  34. exp/tts_train_jets_raw_phn_null_g2pk/train.total_count.ave_5best.pth +3 -0
  35. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,511 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ko
7
+ datasets:
8
+ - kss
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `imdanboy/kss_tts_train_jets_raw_phn_null_g2pk_train.total_count.ave`
15
+
16
+ This model was trained by satoshi.2020 using the kss recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout 047d0c474c18a87c205e566948410be16787e477
23
+ pip install -e .
24
+ cd egs2/kss/tts1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model imdanboy/kss_tts_train_jets_raw_phn_null_g2pk_train.total_count.ave
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: conf/tuning/train_jets.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp/tts_train_jets_raw_phn_null_g2pk
41
+ ngpu: 1
42
+ seed: 777
43
+ num_workers: 4
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: 4
48
+ dist_rank: 0
49
+ local_rank: 0
50
+ dist_master_addr: localhost
51
+ dist_master_port: 52809
52
+ dist_launcher: null
53
+ multiprocessing_distributed: true
54
+ unused_parameters: true
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: false
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 1000
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - valid
72
+ - text2mel_loss
73
+ - min
74
+ - - train
75
+ - text2mel_loss
76
+ - min
77
+ - - train
78
+ - total_count
79
+ - max
80
+ keep_nbest_models: 5
81
+ nbest_averaging_interval: 0
82
+ grad_clip: -1
83
+ grad_clip_type: 2.0
84
+ grad_noise: false
85
+ accum_grad: 1
86
+ no_forward_run: false
87
+ resume: true
88
+ train_dtype: float32
89
+ use_amp: false
90
+ log_interval: 50
91
+ use_matplotlib: true
92
+ use_tensorboard: true
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ pretrain_path: null
101
+ init_param: []
102
+ ignore_init_mismatch: false
103
+ freeze_param: []
104
+ num_iters_per_epoch: 1000
105
+ batch_size: 20
106
+ valid_batch_size: null
107
+ batch_bins: 2000000
108
+ valid_batch_bins: null
109
+ train_shape_file:
110
+ - exp/tts_stats_raw_phn_null_g2pk/train/text_shape.phn
111
+ - exp/tts_stats_raw_phn_null_g2pk/train/speech_shape
112
+ valid_shape_file:
113
+ - exp/tts_stats_raw_phn_null_g2pk/valid/text_shape.phn
114
+ - exp/tts_stats_raw_phn_null_g2pk/valid/speech_shape
115
+ batch_type: numel
116
+ valid_batch_type: null
117
+ fold_length:
118
+ - 150
119
+ - 204800
120
+ sort_in_batch: descending
121
+ sort_batch: descending
122
+ multiple_iterator: false
123
+ chunk_length: 500
124
+ chunk_shift_ratio: 0.5
125
+ num_cache_chunks: 1024
126
+ train_data_path_and_name_and_type:
127
+ - - dump/raw/tr_no_dev/text
128
+ - text
129
+ - text
130
+ - - dump/raw/tr_no_dev/wav.scp
131
+ - speech
132
+ - sound
133
+ - - exp/tts_stats_raw_phn_null_g2pk/train/collect_feats/pitch.scp
134
+ - pitch
135
+ - npy
136
+ - - exp/tts_stats_raw_phn_null_g2pk/train/collect_feats/energy.scp
137
+ - energy
138
+ - npy
139
+ valid_data_path_and_name_and_type:
140
+ - - dump/raw/dev/text
141
+ - text
142
+ - text
143
+ - - dump/raw/dev/wav.scp
144
+ - speech
145
+ - sound
146
+ - - exp/tts_stats_raw_phn_null_g2pk/valid/collect_feats/pitch.scp
147
+ - pitch
148
+ - npy
149
+ - - exp/tts_stats_raw_phn_null_g2pk/valid/collect_feats/energy.scp
150
+ - energy
151
+ - npy
152
+ allow_variable_data_keys: false
153
+ max_cache_size: 0.0
154
+ max_cache_fd: 32
155
+ valid_max_cache_size: null
156
+ optim: adamw
157
+ optim_conf:
158
+ lr: 0.0002
159
+ betas:
160
+ - 0.8
161
+ - 0.99
162
+ eps: 1.0e-09
163
+ weight_decay: 0.0
164
+ scheduler: exponentiallr
165
+ scheduler_conf:
166
+ gamma: 0.999875
167
+ optim2: adamw
168
+ optim2_conf:
169
+ lr: 0.0002
170
+ betas:
171
+ - 0.8
172
+ - 0.99
173
+ eps: 1.0e-09
174
+ weight_decay: 0.0
175
+ scheduler2: exponentiallr
176
+ scheduler2_conf:
177
+ gamma: 0.999875
178
+ generator_first: true
179
+ token_list:
180
+ - <blank>
181
+ - <unk>
182
+ - ''
183
+ - ᅡ
184
+ - ᅵ
185
+ - ᄋ
186
+ - ᅳ
187
+ - ᄀ
188
+ - ᅥ
189
+ - ᄂ
190
+ - ᆫ
191
+ - ᄅ
192
+ - ᄌ
193
+ - ᄉ
194
+ - ᅩ
195
+ - ᆯ
196
+ - ᄆ
197
+ - .
198
+ - ᅮ
199
+ - ᄃ
200
+ - ᄒ
201
+ - ᅦ
202
+ - ᆼ
203
+ - ᅢ
204
+ - ᄇ
205
+ - ᅭ
206
+ - ᅧ
207
+ - ᄊ
208
+ - ᆷ
209
+ - ᄄ
210
+ - ᆮ
211
+ - ᄎ
212
+ - ᄁ
213
+ - ᆨ
214
+ - ᄑ
215
+ - ᄐ
216
+ - ᅪ
217
+ - ᄏ
218
+ - '?'
219
+ - ᄍ
220
+ - ᆸ
221
+ - ᅬ
222
+ - ᅣ
223
+ - ᅴ
224
+ - ᅯ
225
+ - ᅨ
226
+ - ᄈ
227
+ - ᅱ
228
+ - ᅲ
229
+ - ᅫ
230
+ - ','
231
+ - '!'
232
+ - ᅤ
233
+ - ':'
234
+ - ᅰ
235
+ - ''''
236
+ - '-'
237
+ - '"'
238
+ - /
239
+ - I
240
+ - M
241
+ - F
242
+ - E
243
+ - S
244
+ - C
245
+ - A
246
+ - B
247
+ - ㅇ
248
+ - <sos/eos>
249
+ odim: null
250
+ model_conf: {}
251
+ use_preprocessor: true
252
+ token_type: phn
253
+ bpemodel: null
254
+ non_linguistic_symbols: null
255
+ cleaner: null
256
+ g2p: g2pk
257
+ feats_extract: fbank
258
+ feats_extract_conf:
259
+ n_fft: 1024
260
+ hop_length: 256
261
+ win_length: null
262
+ fs: 24000
263
+ fmin: 0
264
+ fmax: null
265
+ n_mels: 80
266
+ normalize: global_mvn
267
+ normalize_conf:
268
+ stats_file: exp/tts_stats_raw_phn_null_g2pk/train/feats_stats.npz
269
+ tts: jets
270
+ tts_conf:
271
+ generator_type: jets_generator
272
+ generator_params:
273
+ adim: 256
274
+ aheads: 2
275
+ elayers: 4
276
+ eunits: 1024
277
+ dlayers: 4
278
+ dunits: 1024
279
+ positionwise_layer_type: conv1d
280
+ positionwise_conv_kernel_size: 3
281
+ duration_predictor_layers: 2
282
+ duration_predictor_chans: 256
283
+ duration_predictor_kernel_size: 3
284
+ use_masking: true
285
+ encoder_normalize_before: true
286
+ decoder_normalize_before: true
287
+ encoder_type: transformer
288
+ decoder_type: transformer
289
+ conformer_rel_pos_type: latest
290
+ conformer_pos_enc_layer_type: rel_pos
291
+ conformer_self_attn_layer_type: rel_selfattn
292
+ conformer_activation_type: swish
293
+ use_macaron_style_in_conformer: true
294
+ use_cnn_in_conformer: true
295
+ conformer_enc_kernel_size: 7
296
+ conformer_dec_kernel_size: 31
297
+ init_type: xavier_uniform
298
+ transformer_enc_dropout_rate: 0.2
299
+ transformer_enc_positional_dropout_rate: 0.2
300
+ transformer_enc_attn_dropout_rate: 0.2
301
+ transformer_dec_dropout_rate: 0.2
302
+ transformer_dec_positional_dropout_rate: 0.2
303
+ transformer_dec_attn_dropout_rate: 0.2
304
+ pitch_predictor_layers: 5
305
+ pitch_predictor_chans: 256
306
+ pitch_predictor_kernel_size: 5
307
+ pitch_predictor_dropout: 0.5
308
+ pitch_embed_kernel_size: 1
309
+ pitch_embed_dropout: 0.0
310
+ stop_gradient_from_pitch_predictor: true
311
+ energy_predictor_layers: 2
312
+ energy_predictor_chans: 256
313
+ energy_predictor_kernel_size: 3
314
+ energy_predictor_dropout: 0.5
315
+ energy_embed_kernel_size: 1
316
+ energy_embed_dropout: 0.0
317
+ stop_gradient_from_energy_predictor: false
318
+ generator_out_channels: 1
319
+ generator_channels: 512
320
+ generator_global_channels: -1
321
+ generator_kernel_size: 7
322
+ generator_upsample_scales:
323
+ - 8
324
+ - 8
325
+ - 2
326
+ - 2
327
+ generator_upsample_kernel_sizes:
328
+ - 16
329
+ - 16
330
+ - 4
331
+ - 4
332
+ generator_resblock_kernel_sizes:
333
+ - 3
334
+ - 7
335
+ - 11
336
+ generator_resblock_dilations:
337
+ - - 1
338
+ - 3
339
+ - 5
340
+ - - 1
341
+ - 3
342
+ - 5
343
+ - - 1
344
+ - 3
345
+ - 5
346
+ generator_use_additional_convs: true
347
+ generator_bias: true
348
+ generator_nonlinear_activation: LeakyReLU
349
+ generator_nonlinear_activation_params:
350
+ negative_slope: 0.1
351
+ generator_use_weight_norm: true
352
+ segment_size: 64
353
+ idim: 69
354
+ odim: 80
355
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
356
+ discriminator_params:
357
+ scales: 1
358
+ scale_downsample_pooling: AvgPool1d
359
+ scale_downsample_pooling_params:
360
+ kernel_size: 4
361
+ stride: 2
362
+ padding: 2
363
+ scale_discriminator_params:
364
+ in_channels: 1
365
+ out_channels: 1
366
+ kernel_sizes:
367
+ - 15
368
+ - 41
369
+ - 5
370
+ - 3
371
+ channels: 128
372
+ max_downsample_channels: 1024
373
+ max_groups: 16
374
+ bias: true
375
+ downsample_scales:
376
+ - 2
377
+ - 2
378
+ - 4
379
+ - 4
380
+ - 1
381
+ nonlinear_activation: LeakyReLU
382
+ nonlinear_activation_params:
383
+ negative_slope: 0.1
384
+ use_weight_norm: true
385
+ use_spectral_norm: false
386
+ follow_official_norm: false
387
+ periods:
388
+ - 2
389
+ - 3
390
+ - 5
391
+ - 7
392
+ - 11
393
+ period_discriminator_params:
394
+ in_channels: 1
395
+ out_channels: 1
396
+ kernel_sizes:
397
+ - 5
398
+ - 3
399
+ channels: 32
400
+ downsample_scales:
401
+ - 3
402
+ - 3
403
+ - 3
404
+ - 3
405
+ - 1
406
+ max_downsample_channels: 1024
407
+ bias: true
408
+ nonlinear_activation: LeakyReLU
409
+ nonlinear_activation_params:
410
+ negative_slope: 0.1
411
+ use_weight_norm: true
412
+ use_spectral_norm: false
413
+ generator_adv_loss_params:
414
+ average_by_discriminators: false
415
+ loss_type: mse
416
+ discriminator_adv_loss_params:
417
+ average_by_discriminators: false
418
+ loss_type: mse
419
+ feat_match_loss_params:
420
+ average_by_discriminators: false
421
+ average_by_layers: false
422
+ include_final_outputs: true
423
+ mel_loss_params:
424
+ fs: 24000
425
+ n_fft: 1024
426
+ hop_length: 256
427
+ win_length: null
428
+ window: hann
429
+ n_mels: 80
430
+ fmin: 0
431
+ fmax: null
432
+ log_base: null
433
+ lambda_adv: 1.0
434
+ lambda_mel: 45.0
435
+ lambda_feat_match: 2.0
436
+ lambda_var: 1.0
437
+ lambda_align: 2.0
438
+ sampling_rate: 24000
439
+ cache_generator_outputs: true
440
+ pitch_extract: dio
441
+ pitch_extract_conf:
442
+ reduction_factor: 1
443
+ use_token_averaged_f0: false
444
+ fs: 24000
445
+ n_fft: 1024
446
+ hop_length: 256
447
+ f0max: 400
448
+ f0min: 80
449
+ pitch_normalize: global_mvn
450
+ pitch_normalize_conf:
451
+ stats_file: exp/tts_stats_raw_phn_null_g2pk/train/pitch_stats.npz
452
+ energy_extract: energy
453
+ energy_extract_conf:
454
+ reduction_factor: 1
455
+ use_token_averaged_energy: false
456
+ fs: 24000
457
+ n_fft: 1024
458
+ hop_length: 256
459
+ win_length: null
460
+ energy_normalize: global_mvn
461
+ energy_normalize_conf:
462
+ stats_file: exp/tts_stats_raw_phn_null_g2pk/train/energy_stats.npz
463
+ required:
464
+ - output_dir
465
+ - token_list
466
+ version: '202204'
467
+ distributed: true
468
+ ```
469
+
470
+ </details>
471
+
472
+
473
+
474
+ ### Citing ESPnet
475
+
476
+ ```bibtex
477
+ @inproceedings{watanabe2018espnet,
478
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
479
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
480
+ year={2018},
481
+ booktitle={Proceedings of Interspeech},
482
+ pages={2207--2211},
483
+ doi={10.21437/Interspeech.2018-1456},
484
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
485
+ }
486
+
487
+
488
+
489
+
490
+ @inproceedings{hayashi2020espnet,
491
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
492
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
493
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
494
+ pages={7654--7658},
495
+ year={2020},
496
+ organization={IEEE}
497
+ }
498
+ ```
499
+
500
+ or arXiv:
501
+
502
+ ```bibtex
503
+ @misc{watanabe2018espnet,
504
+ title={ESPnet: End-to-End Speech Processing Toolkit},
505
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
506
+ year={2018},
507
+ eprint={1804.00015},
508
+ archivePrefix={arXiv},
509
+ primaryClass={cs.CL}
510
+ }
511
+ ```
exp/tts_stats_raw_phn_null_g2pk/train/energy_stats.npz ADDED
Binary file (770 Bytes). View file
 
exp/tts_stats_raw_phn_null_g2pk/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_stats_raw_phn_null_g2pk/train/pitch_stats.npz ADDED
Binary file (770 Bytes). View file
 
exp/tts_train_jets_raw_phn_null_g2pk/config.yaml ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_jets.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_jets_raw_phn_null_g2pk
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 52809
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - text2mel_loss
39
+ - min
40
+ - - train
41
+ - text2mel_loss
42
+ - min
43
+ - - train
44
+ - total_count
45
+ - max
46
+ keep_nbest_models: 5
47
+ nbest_averaging_interval: 0
48
+ grad_clip: -1
49
+ grad_clip_type: 2.0
50
+ grad_noise: false
51
+ accum_grad: 1
52
+ no_forward_run: false
53
+ resume: true
54
+ train_dtype: float32
55
+ use_amp: false
56
+ log_interval: 50
57
+ use_matplotlib: true
58
+ use_tensorboard: true
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: 1000
71
+ batch_size: 20
72
+ valid_batch_size: null
73
+ batch_bins: 2000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/tts_stats_raw_phn_null_g2pk/train/text_shape.phn
77
+ - exp/tts_stats_raw_phn_null_g2pk/train/speech_shape
78
+ valid_shape_file:
79
+ - exp/tts_stats_raw_phn_null_g2pk/valid/text_shape.phn
80
+ - exp/tts_stats_raw_phn_null_g2pk/valid/speech_shape
81
+ batch_type: numel
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 150
85
+ - 204800
86
+ sort_in_batch: descending
87
+ sort_batch: descending
88
+ multiple_iterator: false
89
+ chunk_length: 500
90
+ chunk_shift_ratio: 0.5
91
+ num_cache_chunks: 1024
92
+ train_data_path_and_name_and_type:
93
+ - - dump/raw/tr_no_dev/text
94
+ - text
95
+ - text
96
+ - - dump/raw/tr_no_dev/wav.scp
97
+ - speech
98
+ - sound
99
+ - - exp/tts_stats_raw_phn_null_g2pk/train/collect_feats/pitch.scp
100
+ - pitch
101
+ - npy
102
+ - - exp/tts_stats_raw_phn_null_g2pk/train/collect_feats/energy.scp
103
+ - energy
104
+ - npy
105
+ valid_data_path_and_name_and_type:
106
+ - - dump/raw/dev/text
107
+ - text
108
+ - text
109
+ - - dump/raw/dev/wav.scp
110
+ - speech
111
+ - sound
112
+ - - exp/tts_stats_raw_phn_null_g2pk/valid/collect_feats/pitch.scp
113
+ - pitch
114
+ - npy
115
+ - - exp/tts_stats_raw_phn_null_g2pk/valid/collect_feats/energy.scp
116
+ - energy
117
+ - npy
118
+ allow_variable_data_keys: false
119
+ max_cache_size: 0.0
120
+ max_cache_fd: 32
121
+ valid_max_cache_size: null
122
+ optim: adamw
123
+ optim_conf:
124
+ lr: 0.0002
125
+ betas:
126
+ - 0.8
127
+ - 0.99
128
+ eps: 1.0e-09
129
+ weight_decay: 0.0
130
+ scheduler: exponentiallr
131
+ scheduler_conf:
132
+ gamma: 0.999875
133
+ optim2: adamw
134
+ optim2_conf:
135
+ lr: 0.0002
136
+ betas:
137
+ - 0.8
138
+ - 0.99
139
+ eps: 1.0e-09
140
+ weight_decay: 0.0
141
+ scheduler2: exponentiallr
142
+ scheduler2_conf:
143
+ gamma: 0.999875
144
+ generator_first: true
145
+ token_list:
146
+ - <blank>
147
+ - <unk>
148
+ - ''
149
+ - ᅡ
150
+ - ᅵ
151
+ - ᄋ
152
+ - ᅳ
153
+ - ᄀ
154
+ - ᅥ
155
+ - ᄂ
156
+ - ᆫ
157
+ - ᄅ
158
+ - ᄌ
159
+ - ᄉ
160
+ - ᅩ
161
+ - ᆯ
162
+ - ᄆ
163
+ - .
164
+ - ᅮ
165
+ - ᄃ
166
+ - ᄒ
167
+ - ᅦ
168
+ - ᆼ
169
+ - ᅢ
170
+ - ᄇ
171
+ - ᅭ
172
+ - ᅧ
173
+ - ᄊ
174
+ - ᆷ
175
+ - ᄄ
176
+ - ᆮ
177
+ - ᄎ
178
+ - ᄁ
179
+ - ᆨ
180
+ - ᄑ
181
+ - ᄐ
182
+ - ᅪ
183
+ - ᄏ
184
+ - '?'
185
+ - ᄍ
186
+ - ᆸ
187
+ - ᅬ
188
+ - ᅣ
189
+ - ᅴ
190
+ - ᅯ
191
+ - ᅨ
192
+ - ᄈ
193
+ - ᅱ
194
+ - ᅲ
195
+ - ᅫ
196
+ - ','
197
+ - '!'
198
+ - ᅤ
199
+ - ':'
200
+ - ᅰ
201
+ - ''''
202
+ - '-'
203
+ - '"'
204
+ - /
205
+ - I
206
+ - M
207
+ - F
208
+ - E
209
+ - S
210
+ - C
211
+ - A
212
+ - B
213
+ - ㅇ
214
+ - <sos/eos>
215
+ odim: null
216
+ model_conf: {}
217
+ use_preprocessor: true
218
+ token_type: phn
219
+ bpemodel: null
220
+ non_linguistic_symbols: null
221
+ cleaner: null
222
+ g2p: g2pk
223
+ feats_extract: fbank
224
+ feats_extract_conf:
225
+ n_fft: 1024
226
+ hop_length: 256
227
+ win_length: null
228
+ fs: 24000
229
+ fmin: 0
230
+ fmax: null
231
+ n_mels: 80
232
+ normalize: global_mvn
233
+ normalize_conf:
234
+ stats_file: exp/tts_stats_raw_phn_null_g2pk/train/feats_stats.npz
235
+ tts: jets
236
+ tts_conf:
237
+ generator_type: jets_generator
238
+ generator_params:
239
+ adim: 256
240
+ aheads: 2
241
+ elayers: 4
242
+ eunits: 1024
243
+ dlayers: 4
244
+ dunits: 1024
245
+ positionwise_layer_type: conv1d
246
+ positionwise_conv_kernel_size: 3
247
+ duration_predictor_layers: 2
248
+ duration_predictor_chans: 256
249
+ duration_predictor_kernel_size: 3
250
+ use_masking: true
251
+ encoder_normalize_before: true
252
+ decoder_normalize_before: true
253
+ encoder_type: transformer
254
+ decoder_type: transformer
255
+ conformer_rel_pos_type: latest
256
+ conformer_pos_enc_layer_type: rel_pos
257
+ conformer_self_attn_layer_type: rel_selfattn
258
+ conformer_activation_type: swish
259
+ use_macaron_style_in_conformer: true
260
+ use_cnn_in_conformer: true
261
+ conformer_enc_kernel_size: 7
262
+ conformer_dec_kernel_size: 31
263
+ init_type: xavier_uniform
264
+ transformer_enc_dropout_rate: 0.2
265
+ transformer_enc_positional_dropout_rate: 0.2
266
+ transformer_enc_attn_dropout_rate: 0.2
267
+ transformer_dec_dropout_rate: 0.2
268
+ transformer_dec_positional_dropout_rate: 0.2
269
+ transformer_dec_attn_dropout_rate: 0.2
270
+ pitch_predictor_layers: 5
271
+ pitch_predictor_chans: 256
272
+ pitch_predictor_kernel_size: 5
273
+ pitch_predictor_dropout: 0.5
274
+ pitch_embed_kernel_size: 1
275
+ pitch_embed_dropout: 0.0
276
+ stop_gradient_from_pitch_predictor: true
277
+ energy_predictor_layers: 2
278
+ energy_predictor_chans: 256
279
+ energy_predictor_kernel_size: 3
280
+ energy_predictor_dropout: 0.5
281
+ energy_embed_kernel_size: 1
282
+ energy_embed_dropout: 0.0
283
+ stop_gradient_from_energy_predictor: false
284
+ generator_out_channels: 1
285
+ generator_channels: 512
286
+ generator_global_channels: -1
287
+ generator_kernel_size: 7
288
+ generator_upsample_scales:
289
+ - 8
290
+ - 8
291
+ - 2
292
+ - 2
293
+ generator_upsample_kernel_sizes:
294
+ - 16
295
+ - 16
296
+ - 4
297
+ - 4
298
+ generator_resblock_kernel_sizes:
299
+ - 3
300
+ - 7
301
+ - 11
302
+ generator_resblock_dilations:
303
+ - - 1
304
+ - 3
305
+ - 5
306
+ - - 1
307
+ - 3
308
+ - 5
309
+ - - 1
310
+ - 3
311
+ - 5
312
+ generator_use_additional_convs: true
313
+ generator_bias: true
314
+ generator_nonlinear_activation: LeakyReLU
315
+ generator_nonlinear_activation_params:
316
+ negative_slope: 0.1
317
+ generator_use_weight_norm: true
318
+ segment_size: 64
319
+ idim: 69
320
+ odim: 80
321
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
322
+ discriminator_params:
323
+ scales: 1
324
+ scale_downsample_pooling: AvgPool1d
325
+ scale_downsample_pooling_params:
326
+ kernel_size: 4
327
+ stride: 2
328
+ padding: 2
329
+ scale_discriminator_params:
330
+ in_channels: 1
331
+ out_channels: 1
332
+ kernel_sizes:
333
+ - 15
334
+ - 41
335
+ - 5
336
+ - 3
337
+ channels: 128
338
+ max_downsample_channels: 1024
339
+ max_groups: 16
340
+ bias: true
341
+ downsample_scales:
342
+ - 2
343
+ - 2
344
+ - 4
345
+ - 4
346
+ - 1
347
+ nonlinear_activation: LeakyReLU
348
+ nonlinear_activation_params:
349
+ negative_slope: 0.1
350
+ use_weight_norm: true
351
+ use_spectral_norm: false
352
+ follow_official_norm: false
353
+ periods:
354
+ - 2
355
+ - 3
356
+ - 5
357
+ - 7
358
+ - 11
359
+ period_discriminator_params:
360
+ in_channels: 1
361
+ out_channels: 1
362
+ kernel_sizes:
363
+ - 5
364
+ - 3
365
+ channels: 32
366
+ downsample_scales:
367
+ - 3
368
+ - 3
369
+ - 3
370
+ - 3
371
+ - 1
372
+ max_downsample_channels: 1024
373
+ bias: true
374
+ nonlinear_activation: LeakyReLU
375
+ nonlinear_activation_params:
376
+ negative_slope: 0.1
377
+ use_weight_norm: true
378
+ use_spectral_norm: false
379
+ generator_adv_loss_params:
380
+ average_by_discriminators: false
381
+ loss_type: mse
382
+ discriminator_adv_loss_params:
383
+ average_by_discriminators: false
384
+ loss_type: mse
385
+ feat_match_loss_params:
386
+ average_by_discriminators: false
387
+ average_by_layers: false
388
+ include_final_outputs: true
389
+ mel_loss_params:
390
+ fs: 24000
391
+ n_fft: 1024
392
+ hop_length: 256
393
+ win_length: null
394
+ window: hann
395
+ n_mels: 80
396
+ fmin: 0
397
+ fmax: null
398
+ log_base: null
399
+ lambda_adv: 1.0
400
+ lambda_mel: 45.0
401
+ lambda_feat_match: 2.0
402
+ lambda_var: 1.0
403
+ lambda_align: 2.0
404
+ sampling_rate: 24000
405
+ cache_generator_outputs: true
406
+ pitch_extract: dio
407
+ pitch_extract_conf:
408
+ reduction_factor: 1
409
+ use_token_averaged_f0: false
410
+ fs: 24000
411
+ n_fft: 1024
412
+ hop_length: 256
413
+ f0max: 400
414
+ f0min: 80
415
+ pitch_normalize: global_mvn
416
+ pitch_normalize_conf:
417
+ stats_file: exp/tts_stats_raw_phn_null_g2pk/train/pitch_stats.npz
418
+ energy_extract: energy
419
+ energy_extract_conf:
420
+ reduction_factor: 1
421
+ use_token_averaged_energy: false
422
+ fs: 24000
423
+ n_fft: 1024
424
+ hop_length: 256
425
+ win_length: null
426
+ energy_normalize: global_mvn
427
+ energy_normalize_conf:
428
+ stats_file: exp/tts_stats_raw_phn_null_g2pk/train/energy_stats.npz
429
+ required:
430
+ - output_dir
431
+ - token_list
432
+ version: '202204'
433
+ distributed: true
exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_backward_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_fake_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_forward_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_optim_step_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_real_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/discriminator_train_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_align_bin_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_align_forwardsum_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_align_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_backward_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_forward_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_g_adv_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_g_feat_match_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_g_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_g_mel_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_optim_step_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_train_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_var_dur_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_var_energy_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_var_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/generator_var_pitch_loss.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/iter_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/optim0_lr0.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/optim1_lr0.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/images/train_time.png ADDED
exp/tts_train_jets_raw_phn_null_g2pk/train.total_count.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5de866278f02f5993f647031c4a822363a64fd23a9847a417ad18da71f479b2d
3
+ size 333680139
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202204'
2
+ files:
3
+ model_file: exp/tts_train_jets_raw_phn_null_g2pk/train.total_count.ave_5best.pth
4
+ python: "3.7.11 (default, Jul 27 2021, 14:32:16) \n[GCC 7.5.0]"
5
+ timestamp: 1653840124.881467
6
+ torch: 1.10.1+cu113
7
+ yaml_files:
8
+ train_config: exp/tts_train_jets_raw_phn_null_g2pk/config.yaml