imdanboy committed on
Commit
a0db397
1 Parent(s): 6c7787f

Update model

Browse files
Files changed (35) hide show
  1. README.md +514 -0
  2. exp/tts_stats_raw_phn_g2pk_no_space/train/energy_stats.npz +3 -0
  3. exp/tts_stats_raw_phn_g2pk_no_space/train/feats_stats.npz +3 -0
  4. exp/tts_stats_raw_phn_g2pk_no_space/train/pitch_stats.npz +3 -0
  5. exp/tts_train_jets_raw_phn_g2pk_no_space/config.yaml +436 -0
  6. exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_backward_time.png +0 -0
  7. exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_fake_loss.png +0 -0
  8. exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_forward_time.png +0 -0
  9. exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_loss.png +0 -0
  10. exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_optim_step_time.png +0 -0
  11. exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_real_loss.png +0 -0
  12. exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_train_time.png +0 -0
  13. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_align_bin_loss.png +0 -0
  14. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_align_forwardsum_loss.png +0 -0
  15. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_align_loss.png +0 -0
  16. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_backward_time.png +0 -0
  17. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_forward_time.png +0 -0
  18. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_g_adv_loss.png +0 -0
  19. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_g_feat_match_loss.png +0 -0
  20. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_g_loss.png +0 -0
  21. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_g_mel_loss.png +0 -0
  22. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_loss.png +0 -0
  23. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_optim_step_time.png +0 -0
  24. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_train_time.png +0 -0
  25. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_var_dur_loss.png +0 -0
  26. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_var_energy_loss.png +0 -0
  27. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_var_loss.png +0 -0
  28. exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_var_pitch_loss.png +0 -0
  29. exp/tts_train_jets_raw_phn_g2pk_no_space/images/gpu_max_cached_mem_GB.png +0 -0
  30. exp/tts_train_jets_raw_phn_g2pk_no_space/images/iter_time.png +0 -0
  31. exp/tts_train_jets_raw_phn_g2pk_no_space/images/optim0_lr0.png +0 -0
  32. exp/tts_train_jets_raw_phn_g2pk_no_space/images/optim1_lr0.png +0 -0
  33. exp/tts_train_jets_raw_phn_g2pk_no_space/images/train_time.png +0 -0
  34. exp/tts_train_jets_raw_phn_g2pk_no_space/train.total_count.ave_5best.pth +3 -0
  35. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,517 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ko
7
+ datasets:
8
+ - kss
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `imdanboy/kss_jets`
15
+
16
+ This model was trained by imdanboy using the kss recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 967ddbed826a7c90b75be2a7129588442d5cb6af
26
+ pip install -e .
27
+ cd egs2/kss/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model imdanboy/kss_jets
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_jets.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_train_jets_raw_phn_g2pk_no_space
44
+ ngpu: 1
45
+ seed: 777
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 4
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 51627
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: false
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 1000
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - text2mel_loss
76
+ - min
77
+ - - train
78
+ - text2mel_loss
79
+ - min
80
+ - - train
81
+ - total_count
82
+ - max
83
+ keep_nbest_models: 5
84
+ nbest_averaging_interval: 0
85
+ grad_clip: -1
86
+ grad_clip_type: 2.0
87
+ grad_noise: false
88
+ accum_grad: 1
89
+ no_forward_run: false
90
+ resume: true
91
+ train_dtype: float32
92
+ use_amp: false
93
+ log_interval: 50
94
+ use_matplotlib: true
95
+ use_tensorboard: true
96
+ create_graph_in_tensorboard: false
97
+ use_wandb: false
98
+ wandb_project: null
99
+ wandb_id: null
100
+ wandb_entity: null
101
+ wandb_name: null
102
+ wandb_model_log_interval: -1
103
+ detect_anomaly: false
104
+ pretrain_path: null
105
+ init_param: []
106
+ ignore_init_mismatch: false
107
+ freeze_param: []
108
+ num_iters_per_epoch: 1000
109
+ batch_size: 20
110
+ valid_batch_size: null
111
+ batch_bins: 4500000
112
+ valid_batch_bins: null
113
+ train_shape_file:
114
+ - exp/tts_stats_raw_phn_g2pk_no_space/train/text_shape.phn
115
+ - exp/tts_stats_raw_phn_g2pk_no_space/train/speech_shape
116
+ valid_shape_file:
117
+ - exp/tts_stats_raw_phn_g2pk_no_space/valid/text_shape.phn
118
+ - exp/tts_stats_raw_phn_g2pk_no_space/valid/speech_shape
119
+ batch_type: numel
120
+ valid_batch_type: null
121
+ fold_length:
122
+ - 150
123
+ - 204800
124
+ sort_in_batch: descending
125
+ sort_batch: descending
126
+ multiple_iterator: false
127
+ chunk_length: 500
128
+ chunk_shift_ratio: 0.5
129
+ num_cache_chunks: 1024
130
+ chunk_excluded_key_prefixes: []
131
+ train_data_path_and_name_and_type:
132
+ - - dump/raw/tr_no_dev/text
133
+ - text
134
+ - text
135
+ - - dump/raw/tr_no_dev/wav.scp
136
+ - speech
137
+ - sound
138
+ - - exp/tts_stats_raw_phn_g2pk_no_space/train/collect_feats/pitch.scp
139
+ - pitch
140
+ - npy
141
+ - - exp/tts_stats_raw_phn_g2pk_no_space/train/collect_feats/energy.scp
142
+ - energy
143
+ - npy
144
+ valid_data_path_and_name_and_type:
145
+ - - dump/raw/dev/text
146
+ - text
147
+ - text
148
+ - - dump/raw/dev/wav.scp
149
+ - speech
150
+ - sound
151
+ - - exp/tts_stats_raw_phn_g2pk_no_space/valid/collect_feats/pitch.scp
152
+ - pitch
153
+ - npy
154
+ - - exp/tts_stats_raw_phn_g2pk_no_space/valid/collect_feats/energy.scp
155
+ - energy
156
+ - npy
157
+ allow_variable_data_keys: false
158
+ max_cache_size: 0.0
159
+ max_cache_fd: 32
160
+ valid_max_cache_size: null
161
+ exclude_weight_decay: false
162
+ exclude_weight_decay_conf: {}
163
+ optim: adamw
164
+ optim_conf:
165
+ lr: 0.0002
166
+ betas:
167
+ - 0.8
168
+ - 0.99
169
+ eps: 1.0e-09
170
+ weight_decay: 0.0
171
+ scheduler: exponentiallr
172
+ scheduler_conf:
173
+ gamma: 0.999875
174
+ optim2: adamw
175
+ optim2_conf:
176
+ lr: 0.0002
177
+ betas:
178
+ - 0.8
179
+ - 0.99
180
+ eps: 1.0e-09
181
+ weight_decay: 0.0
182
+ scheduler2: exponentiallr
183
+ scheduler2_conf:
184
+ gamma: 0.999875
185
+ generator_first: true
186
+ token_list:
187
+ - <blank>
188
+ - <unk>
189
+ - ᅡ
190
+ - ᅵ
191
+ - ᄋ
192
+ - ᅳ
193
+ - ᄀ
194
+ - ᅥ
195
+ - ᄂ
196
+ - ᆫ
197
+ - ᄅ
198
+ - ᄌ
199
+ - ᄉ
200
+ - ᅩ
201
+ - ᆯ
202
+ - ᄆ
203
+ - .
204
+ - ᅮ
205
+ - ᄃ
206
+ - ᄒ
207
+ - ᅦ
208
+ - ᆼ
209
+ - ᅢ
210
+ - ᄇ
211
+ - ᅭ
212
+ - ᅧ
213
+ - ᄊ
214
+ - ᆷ
215
+ - ᄄ
216
+ - ᆮ
217
+ - ᄎ
218
+ - ᄁ
219
+ - ᆨ
220
+ - ᄑ
221
+ - ᄐ
222
+ - ᅪ
223
+ - ᄏ
224
+ - '?'
225
+ - ᄍ
226
+ - ᆸ
227
+ - ᅬ
228
+ - ᅣ
229
+ - ᅴ
230
+ - ᅯ
231
+ - ᅨ
232
+ - ᄈ
233
+ - ᅱ
234
+ - ᅲ
235
+ - ᅫ
236
+ - ','
237
+ - '!'
238
+ - ᅤ
239
+ - ':'
240
+ - ᅰ
241
+ - ''''
242
+ - '-'
243
+ - '"'
244
+ - /
245
+ - I
246
+ - M
247
+ - F
248
+ - E
249
+ - S
250
+ - C
251
+ - A
252
+ - B
253
+ - ㅇ
254
+ - <sos/eos>
255
+ odim: null
256
+ model_conf: {}
257
+ use_preprocessor: true
258
+ token_type: phn
259
+ bpemodel: null
260
+ non_linguistic_symbols: null
261
+ cleaner: null
262
+ g2p: g2pk_no_space
263
+ feats_extract: fbank
264
+ feats_extract_conf:
265
+ n_fft: 1024
266
+ hop_length: 256
267
+ win_length: null
268
+ fs: 24000
269
+ fmin: 80
270
+ fmax: 7600
271
+ n_mels: 80
272
+ normalize: global_mvn
273
+ normalize_conf:
274
+ stats_file: exp/tts_stats_raw_phn_g2pk_no_space/train/feats_stats.npz
275
+ tts: jets
276
+ tts_conf:
277
+ generator_type: jets_generator
278
+ generator_params:
279
+ adim: 256
280
+ aheads: 2
281
+ elayers: 4
282
+ eunits: 1024
283
+ dlayers: 4
284
+ dunits: 1024
285
+ positionwise_layer_type: conv1d
286
+ positionwise_conv_kernel_size: 3
287
+ duration_predictor_layers: 2
288
+ duration_predictor_chans: 256
289
+ duration_predictor_kernel_size: 3
290
+ use_masking: true
291
+ encoder_normalize_before: true
292
+ decoder_normalize_before: true
293
+ encoder_type: transformer
294
+ decoder_type: transformer
295
+ conformer_rel_pos_type: latest
296
+ conformer_pos_enc_layer_type: rel_pos
297
+ conformer_self_attn_layer_type: rel_selfattn
298
+ conformer_activation_type: swish
299
+ use_macaron_style_in_conformer: true
300
+ use_cnn_in_conformer: true
301
+ conformer_enc_kernel_size: 7
302
+ conformer_dec_kernel_size: 31
303
+ init_type: xavier_uniform
304
+ transformer_enc_dropout_rate: 0.2
305
+ transformer_enc_positional_dropout_rate: 0.2
306
+ transformer_enc_attn_dropout_rate: 0.2
307
+ transformer_dec_dropout_rate: 0.2
308
+ transformer_dec_positional_dropout_rate: 0.2
309
+ transformer_dec_attn_dropout_rate: 0.2
310
+ pitch_predictor_layers: 5
311
+ pitch_predictor_chans: 256
312
+ pitch_predictor_kernel_size: 5
313
+ pitch_predictor_dropout: 0.5
314
+ pitch_embed_kernel_size: 1
315
+ pitch_embed_dropout: 0.0
316
+ stop_gradient_from_pitch_predictor: true
317
+ energy_predictor_layers: 2
318
+ energy_predictor_chans: 256
319
+ energy_predictor_kernel_size: 3
320
+ energy_predictor_dropout: 0.5
321
+ energy_embed_kernel_size: 1
322
+ energy_embed_dropout: 0.0
323
+ stop_gradient_from_energy_predictor: false
324
+ generator_out_channels: 1
325
+ generator_channels: 512
326
+ generator_global_channels: -1
327
+ generator_kernel_size: 7
328
+ generator_upsample_scales:
329
+ - 8
330
+ - 8
331
+ - 2
332
+ - 2
333
+ generator_upsample_kernel_sizes:
334
+ - 16
335
+ - 16
336
+ - 4
337
+ - 4
338
+ generator_resblock_kernel_sizes:
339
+ - 3
340
+ - 7
341
+ - 11
342
+ generator_resblock_dilations:
343
+ - - 1
344
+ - 3
345
+ - 5
346
+ - - 1
347
+ - 3
348
+ - 5
349
+ - - 1
350
+ - 3
351
+ - 5
352
+ generator_use_additional_convs: true
353
+ generator_bias: true
354
+ generator_nonlinear_activation: LeakyReLU
355
+ generator_nonlinear_activation_params:
356
+ negative_slope: 0.1
357
+ generator_use_weight_norm: true
358
+ segment_size: 32
359
+ idim: 68
360
+ odim: 80
361
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
362
+ discriminator_params:
363
+ scales: 1
364
+ scale_downsample_pooling: AvgPool1d
365
+ scale_downsample_pooling_params:
366
+ kernel_size: 4
367
+ stride: 2
368
+ padding: 2
369
+ scale_discriminator_params:
370
+ in_channels: 1
371
+ out_channels: 1
372
+ kernel_sizes:
373
+ - 15
374
+ - 41
375
+ - 5
376
+ - 3
377
+ channels: 128
378
+ max_downsample_channels: 1024
379
+ max_groups: 16
380
+ bias: true
381
+ downsample_scales:
382
+ - 2
383
+ - 2
384
+ - 4
385
+ - 4
386
+ - 1
387
+ nonlinear_activation: LeakyReLU
388
+ nonlinear_activation_params:
389
+ negative_slope: 0.1
390
+ use_weight_norm: true
391
+ use_spectral_norm: false
392
+ follow_official_norm: false
393
+ periods:
394
+ - 2
395
+ - 3
396
+ - 5
397
+ - 7
398
+ - 11
399
+ period_discriminator_params:
400
+ in_channels: 1
401
+ out_channels: 1
402
+ kernel_sizes:
403
+ - 5
404
+ - 3
405
+ channels: 32
406
+ downsample_scales:
407
+ - 3
408
+ - 3
409
+ - 3
410
+ - 3
411
+ - 1
412
+ max_downsample_channels: 1024
413
+ bias: true
414
+ nonlinear_activation: LeakyReLU
415
+ nonlinear_activation_params:
416
+ negative_slope: 0.1
417
+ use_weight_norm: true
418
+ use_spectral_norm: false
419
+ generator_adv_loss_params:
420
+ average_by_discriminators: false
421
+ loss_type: mse
422
+ discriminator_adv_loss_params:
423
+ average_by_discriminators: false
424
+ loss_type: mse
425
+ feat_match_loss_params:
426
+ average_by_discriminators: false
427
+ average_by_layers: false
428
+ include_final_outputs: true
429
+ mel_loss_params:
430
+ fs: 24000
431
+ n_fft: 1024
432
+ hop_length: 256
433
+ win_length: null
434
+ window: hann
435
+ n_mels: 80
436
+ fmin: 0
437
+ fmax: null
438
+ log_base: null
439
+ lambda_adv: 1.0
440
+ lambda_mel: 45.0
441
+ lambda_feat_match: 2.0
442
+ lambda_var: 1.0
443
+ lambda_align: 1.0
444
+ sampling_rate: 24000
445
+ cache_generator_outputs: true
446
+ pitch_extract: dio
447
+ pitch_extract_conf:
448
+ reduction_factor: 1
449
+ use_token_averaged_f0: false
450
+ fs: 24000
451
+ n_fft: 1024
452
+ hop_length: 256
453
+ f0max: 400
454
+ f0min: 80
455
+ pitch_normalize: global_mvn
456
+ pitch_normalize_conf:
457
+ stats_file: exp/tts_stats_raw_phn_g2pk_no_space/train/pitch_stats.npz
458
+ energy_extract: energy
459
+ energy_extract_conf:
460
+ reduction_factor: 1
461
+ use_token_averaged_energy: false
462
+ fs: 24000
463
+ n_fft: 1024
464
+ hop_length: 256
465
+ win_length: null
466
+ energy_normalize: global_mvn
467
+ energy_normalize_conf:
468
+ stats_file: exp/tts_stats_raw_phn_g2pk_no_space/train/energy_stats.npz
469
+ required:
470
+ - output_dir
471
+ - token_list
472
+ version: '202304'
473
+ distributed: true
474
+ ```
475
+
476
+ </details>
477
+
478
+
479
+
480
+ ### Citing ESPnet
481
+
482
+ ```bibtex
483
+ @inproceedings{watanabe2018espnet,
484
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
485
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
486
+ year={2018},
487
+ booktitle={Proceedings of Interspeech},
488
+ pages={2207--2211},
489
+ doi={10.21437/Interspeech.2018-1456},
490
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
491
+ }
492
+
493
+
494
+
495
+
496
+ @inproceedings{hayashi2020espnet,
497
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
498
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
499
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
500
+ pages={7654--7658},
501
+ year={2020},
502
+ organization={IEEE}
503
+ }
504
+ ```
505
+
506
+ or arXiv:
507
+
508
+ ```bibtex
509
+ @misc{watanabe2018espnet,
510
+ title={ESPnet: End-to-End Speech Processing Toolkit},
511
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
512
+ year={2018},
513
+ eprint={1804.00015},
514
+ archivePrefix={arXiv},
515
+ primaryClass={cs.CL}
516
+ }
517
+ ```
exp/tts_stats_raw_phn_g2pk_no_space/train/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:708d422342780a056733631c01afd9fce76c6f3e4cde144d1f9a31e099ffe9da
3
+ size 770
exp/tts_stats_raw_phn_g2pk_no_space/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77bfcce3d801e5ada2b35d87c1ac2060b9286eba2d46e1dbf1705c559a6288ff
3
+ size 1402
exp/tts_stats_raw_phn_g2pk_no_space/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:619bc35bef772fcd070648a51698a4475fe76e49d572b6052056f0f5b8158d45
3
+ size 770
exp/tts_train_jets_raw_phn_g2pk_no_space/config.yaml ADDED
@@ -0,0 +1,436 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_jets.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_jets_raw_phn_g2pk_no_space
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 51627
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - text2mel_loss
39
+ - min
40
+ - - train
41
+ - text2mel_loss
42
+ - min
43
+ - - train
44
+ - total_count
45
+ - max
46
+ keep_nbest_models: 5
47
+ nbest_averaging_interval: 0
48
+ grad_clip: -1
49
+ grad_clip_type: 2.0
50
+ grad_noise: false
51
+ accum_grad: 1
52
+ no_forward_run: false
53
+ resume: true
54
+ train_dtype: float32
55
+ use_amp: false
56
+ log_interval: 50
57
+ use_matplotlib: true
58
+ use_tensorboard: true
59
+ create_graph_in_tensorboard: false
60
+ use_wandb: false
61
+ wandb_project: null
62
+ wandb_id: null
63
+ wandb_entity: null
64
+ wandb_name: null
65
+ wandb_model_log_interval: -1
66
+ detect_anomaly: false
67
+ pretrain_path: null
68
+ init_param: []
69
+ ignore_init_mismatch: false
70
+ freeze_param: []
71
+ num_iters_per_epoch: 1000
72
+ batch_size: 20
73
+ valid_batch_size: null
74
+ batch_bins: 4500000
75
+ valid_batch_bins: null
76
+ train_shape_file:
77
+ - exp/tts_stats_raw_phn_g2pk_no_space/train/text_shape.phn
78
+ - exp/tts_stats_raw_phn_g2pk_no_space/train/speech_shape
79
+ valid_shape_file:
80
+ - exp/tts_stats_raw_phn_g2pk_no_space/valid/text_shape.phn
81
+ - exp/tts_stats_raw_phn_g2pk_no_space/valid/speech_shape
82
+ batch_type: numel
83
+ valid_batch_type: null
84
+ fold_length:
85
+ - 150
86
+ - 204800
87
+ sort_in_batch: descending
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 500
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ chunk_excluded_key_prefixes: []
94
+ train_data_path_and_name_and_type:
95
+ - - dump/raw/tr_no_dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/tr_no_dev/wav.scp
99
+ - speech
100
+ - sound
101
+ - - exp/tts_stats_raw_phn_g2pk_no_space/train/collect_feats/pitch.scp
102
+ - pitch
103
+ - npy
104
+ - - exp/tts_stats_raw_phn_g2pk_no_space/train/collect_feats/energy.scp
105
+ - energy
106
+ - npy
107
+ valid_data_path_and_name_and_type:
108
+ - - dump/raw/dev/text
109
+ - text
110
+ - text
111
+ - - dump/raw/dev/wav.scp
112
+ - speech
113
+ - sound
114
+ - - exp/tts_stats_raw_phn_g2pk_no_space/valid/collect_feats/pitch.scp
115
+ - pitch
116
+ - npy
117
+ - - exp/tts_stats_raw_phn_g2pk_no_space/valid/collect_feats/energy.scp
118
+ - energy
119
+ - npy
120
+ allow_variable_data_keys: false
121
+ max_cache_size: 0.0
122
+ max_cache_fd: 32
123
+ valid_max_cache_size: null
124
+ exclude_weight_decay: false
125
+ exclude_weight_decay_conf: {}
126
+ optim: adamw
127
+ optim_conf:
128
+ lr: 0.0002
129
+ betas:
130
+ - 0.8
131
+ - 0.99
132
+ eps: 1.0e-09
133
+ weight_decay: 0.0
134
+ scheduler: exponentiallr
135
+ scheduler_conf:
136
+ gamma: 0.999875
137
+ optim2: adamw
138
+ optim2_conf:
139
+ lr: 0.0002
140
+ betas:
141
+ - 0.8
142
+ - 0.99
143
+ eps: 1.0e-09
144
+ weight_decay: 0.0
145
+ scheduler2: exponentiallr
146
+ scheduler2_conf:
147
+ gamma: 0.999875
148
+ generator_first: true
149
+ token_list:
150
+ - <blank>
151
+ - <unk>
152
+ - ᅡ
153
+ - ᅵ
154
+ - ᄋ
155
+ - ᅳ
156
+ - ᄀ
157
+ - ᅥ
158
+ - ᄂ
159
+ - ᆫ
160
+ - ᄅ
161
+ - ᄌ
162
+ - ᄉ
163
+ - ᅩ
164
+ - ᆯ
165
+ - ᄆ
166
+ - .
167
+ - ᅮ
168
+ - ᄃ
169
+ - ᄒ
170
+ - ᅦ
171
+ - ᆼ
172
+ - ᅢ
173
+ - ᄇ
174
+ - ᅭ
175
+ - ᅧ
176
+ - ᄊ
177
+ - ᆷ
178
+ - ᄄ
179
+ - ᆮ
180
+ - ᄎ
181
+ - ᄁ
182
+ - ᆨ
183
+ - ᄑ
184
+ - ᄐ
185
+ - ᅪ
186
+ - ᄏ
187
+ - '?'
188
+ - ᄍ
189
+ - ᆸ
190
+ - ᅬ
191
+ - ᅣ
192
+ - ᅴ
193
+ - ᅯ
194
+ - ᅨ
195
+ - ᄈ
196
+ - ᅱ
197
+ - ᅲ
198
+ - ᅫ
199
+ - ','
200
+ - '!'
201
+ - ᅤ
202
+ - ':'
203
+ - ᅰ
204
+ - ''''
205
+ - '-'
206
+ - '"'
207
+ - /
208
+ - I
209
+ - M
210
+ - F
211
+ - E
212
+ - S
213
+ - C
214
+ - A
215
+ - B
216
+ - ㅇ
217
+ - <sos/eos>
218
+ odim: null
219
+ model_conf: {}
220
+ use_preprocessor: true
221
+ token_type: phn
222
+ bpemodel: null
223
+ non_linguistic_symbols: null
224
+ cleaner: null
225
+ g2p: g2pk_no_space
226
+ feats_extract: fbank
227
+ feats_extract_conf:
228
+ n_fft: 1024
229
+ hop_length: 256
230
+ win_length: null
231
+ fs: 24000
232
+ fmin: 80
233
+ fmax: 7600
234
+ n_mels: 80
235
+ normalize: global_mvn
236
+ normalize_conf:
237
+ stats_file: exp/tts_stats_raw_phn_g2pk_no_space/train/feats_stats.npz
238
+ tts: jets
239
+ tts_conf:
240
+ generator_type: jets_generator
241
+ generator_params:
242
+ adim: 256
243
+ aheads: 2
244
+ elayers: 4
245
+ eunits: 1024
246
+ dlayers: 4
247
+ dunits: 1024
248
+ positionwise_layer_type: conv1d
249
+ positionwise_conv_kernel_size: 3
250
+ duration_predictor_layers: 2
251
+ duration_predictor_chans: 256
252
+ duration_predictor_kernel_size: 3
253
+ use_masking: true
254
+ encoder_normalize_before: true
255
+ decoder_normalize_before: true
256
+ encoder_type: transformer
257
+ decoder_type: transformer
258
+ conformer_rel_pos_type: latest
259
+ conformer_pos_enc_layer_type: rel_pos
260
+ conformer_self_attn_layer_type: rel_selfattn
261
+ conformer_activation_type: swish
262
+ use_macaron_style_in_conformer: true
263
+ use_cnn_in_conformer: true
264
+ conformer_enc_kernel_size: 7
265
+ conformer_dec_kernel_size: 31
266
+ init_type: xavier_uniform
267
+ transformer_enc_dropout_rate: 0.2
268
+ transformer_enc_positional_dropout_rate: 0.2
269
+ transformer_enc_attn_dropout_rate: 0.2
270
+ transformer_dec_dropout_rate: 0.2
271
+ transformer_dec_positional_dropout_rate: 0.2
272
+ transformer_dec_attn_dropout_rate: 0.2
273
+ pitch_predictor_layers: 5
274
+ pitch_predictor_chans: 256
275
+ pitch_predictor_kernel_size: 5
276
+ pitch_predictor_dropout: 0.5
277
+ pitch_embed_kernel_size: 1
278
+ pitch_embed_dropout: 0.0
279
+ stop_gradient_from_pitch_predictor: true
280
+ energy_predictor_layers: 2
281
+ energy_predictor_chans: 256
282
+ energy_predictor_kernel_size: 3
283
+ energy_predictor_dropout: 0.5
284
+ energy_embed_kernel_size: 1
285
+ energy_embed_dropout: 0.0
286
+ stop_gradient_from_energy_predictor: false
287
+ generator_out_channels: 1
288
+ generator_channels: 512
289
+ generator_global_channels: -1
290
+ generator_kernel_size: 7
291
+ generator_upsample_scales:
292
+ - 8
293
+ - 8
294
+ - 2
295
+ - 2
296
+ generator_upsample_kernel_sizes:
297
+ - 16
298
+ - 16
299
+ - 4
300
+ - 4
301
+ generator_resblock_kernel_sizes:
302
+ - 3
303
+ - 7
304
+ - 11
305
+ generator_resblock_dilations:
306
+ - - 1
307
+ - 3
308
+ - 5
309
+ - - 1
310
+ - 3
311
+ - 5
312
+ - - 1
313
+ - 3
314
+ - 5
315
+ generator_use_additional_convs: true
316
+ generator_bias: true
317
+ generator_nonlinear_activation: LeakyReLU
318
+ generator_nonlinear_activation_params:
319
+ negative_slope: 0.1
320
+ generator_use_weight_norm: true
321
+ segment_size: 32
322
+ idim: 68
323
+ odim: 80
324
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
325
+ discriminator_params:
326
+ scales: 1
327
+ scale_downsample_pooling: AvgPool1d
328
+ scale_downsample_pooling_params:
329
+ kernel_size: 4
330
+ stride: 2
331
+ padding: 2
332
+ scale_discriminator_params:
333
+ in_channels: 1
334
+ out_channels: 1
335
+ kernel_sizes:
336
+ - 15
337
+ - 41
338
+ - 5
339
+ - 3
340
+ channels: 128
341
+ max_downsample_channels: 1024
342
+ max_groups: 16
343
+ bias: true
344
+ downsample_scales:
345
+ - 2
346
+ - 2
347
+ - 4
348
+ - 4
349
+ - 1
350
+ nonlinear_activation: LeakyReLU
351
+ nonlinear_activation_params:
352
+ negative_slope: 0.1
353
+ use_weight_norm: true
354
+ use_spectral_norm: false
355
+ follow_official_norm: false
356
+ periods:
357
+ - 2
358
+ - 3
359
+ - 5
360
+ - 7
361
+ - 11
362
+ period_discriminator_params:
363
+ in_channels: 1
364
+ out_channels: 1
365
+ kernel_sizes:
366
+ - 5
367
+ - 3
368
+ channels: 32
369
+ downsample_scales:
370
+ - 3
371
+ - 3
372
+ - 3
373
+ - 3
374
+ - 1
375
+ max_downsample_channels: 1024
376
+ bias: true
377
+ nonlinear_activation: LeakyReLU
378
+ nonlinear_activation_params:
379
+ negative_slope: 0.1
380
+ use_weight_norm: true
381
+ use_spectral_norm: false
382
+ generator_adv_loss_params:
383
+ average_by_discriminators: false
384
+ loss_type: mse
385
+ discriminator_adv_loss_params:
386
+ average_by_discriminators: false
387
+ loss_type: mse
388
+ feat_match_loss_params:
389
+ average_by_discriminators: false
390
+ average_by_layers: false
391
+ include_final_outputs: true
392
+ mel_loss_params:
393
+ fs: 24000
394
+ n_fft: 1024
395
+ hop_length: 256
396
+ win_length: null
397
+ window: hann
398
+ n_mels: 80
399
+ fmin: 0
400
+ fmax: null
401
+ log_base: null
402
+ lambda_adv: 1.0
403
+ lambda_mel: 45.0
404
+ lambda_feat_match: 2.0
405
+ lambda_var: 1.0
406
+ lambda_align: 1.0
407
+ sampling_rate: 24000
408
+ cache_generator_outputs: true
409
+ pitch_extract: dio
410
+ pitch_extract_conf:
411
+ reduction_factor: 1
412
+ use_token_averaged_f0: false
413
+ fs: 24000
414
+ n_fft: 1024
415
+ hop_length: 256
416
+ f0max: 400
417
+ f0min: 80
418
+ pitch_normalize: global_mvn
419
+ pitch_normalize_conf:
420
+ stats_file: exp/tts_stats_raw_phn_g2pk_no_space/train/pitch_stats.npz
421
+ energy_extract: energy
422
+ energy_extract_conf:
423
+ reduction_factor: 1
424
+ use_token_averaged_energy: false
425
+ fs: 24000
426
+ n_fft: 1024
427
+ hop_length: 256
428
+ win_length: null
429
+ energy_normalize: global_mvn
430
+ energy_normalize_conf:
431
+ stats_file: exp/tts_stats_raw_phn_g2pk_no_space/train/energy_stats.npz
432
+ required:
433
+ - output_dir
434
+ - token_list
435
+ version: '202304'
436
+ distributed: true
exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_backward_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_fake_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_forward_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_optim_step_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_real_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/discriminator_train_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_align_bin_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_align_forwardsum_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_align_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_backward_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_forward_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_g_adv_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_g_feat_match_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_g_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_g_mel_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_optim_step_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_train_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_var_dur_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_var_energy_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_var_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/generator_var_pitch_loss.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/iter_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/optim0_lr0.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/optim1_lr0.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/images/train_time.png ADDED
exp/tts_train_jets_raw_phn_g2pk_no_space/train.total_count.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4de8e1cdd06031c72acd2c354c10f6bb122087425390e3264fc38f24adffc796
3
+ size 333698171
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202304'
2
+ files:
3
+ model_file: exp/tts_train_jets_raw_phn_g2pk_no_space/train.total_count.ave_5best.pth
4
+ python: "3.8.16 (default, Jun 12 2023, 18:09:05) \n[GCC 11.2.0]"
5
+ timestamp: 1688808586.506846
6
+ torch: 1.11.0
7
+ yaml_files:
8
+ train_config: exp/tts_train_jets_raw_phn_g2pk_no_space/config.yaml