Text-to-Speech
ESPnet
English
audio
imdanboy committed on
Commit
1db95c2
1 Parent(s): f93e534

Update model

Browse files
Files changed (35) hide show
  1. README.md +517 -0
  2. exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz +0 -0
  3. exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz +0 -0
  4. exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz +0 -0
  5. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/config.yaml +442 -0
  6. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png +0 -0
  7. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png +0 -0
  8. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png +0 -0
  9. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png +0 -0
  10. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png +0 -0
  11. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png +0 -0
  12. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png +0 -0
  13. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_bin_loss.png +0 -0
  14. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_forwardsum_loss.png +0 -0
  15. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_loss.png +0 -0
  16. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png +0 -0
  17. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png +0 -0
  18. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_adv_loss.png +0 -0
  19. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_feat_match_loss.png +0 -0
  20. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_loss.png +0 -0
  21. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_mel_loss.png +0 -0
  22. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png +0 -0
  23. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png +0 -0
  24. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png +0 -0
  25. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_dur_loss.png +0 -0
  26. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_energy_loss.png +0 -0
  27. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_loss.png +0 -0
  28. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_pitch_loss.png +0 -0
  29. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png +0 -0
  30. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png +0 -0
  31. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png +0 -0
  32. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png +0 -0
  33. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/train_time.png +0 -0
  34. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_5best.pth +3 -0
  35. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,520 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - ljspeech
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `imdanboy/jets`
15
+
16
+ This model was trained by imdanboy using the ljspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout c173c30930631731e6836c274a591ad571749741
23
+ pip install -e .
24
+ cd egs2/ljspeech/tts1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model imdanboy/jets
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: conf/tuning/train_jets.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space
41
+ ngpu: 1
42
+ seed: 777
43
+ num_workers: 4
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: 4
48
+ dist_rank: 0
49
+ local_rank: 0
50
+ dist_master_addr: localhost
51
+ dist_master_port: 39471
52
+ dist_launcher: null
53
+ multiprocessing_distributed: true
54
+ unused_parameters: true
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: false
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 1000
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - valid
72
+ - text2mel_loss
73
+ - min
74
+ - - train
75
+ - text2mel_loss
76
+ - min
77
+ - - train
78
+ - total_count
79
+ - max
80
+ keep_nbest_models: 5
81
+ nbest_averaging_interval: 0
82
+ grad_clip: -1
83
+ grad_clip_type: 2.0
84
+ grad_noise: false
85
+ accum_grad: 1
86
+ no_forward_run: false
87
+ resume: true
88
+ train_dtype: float32
89
+ use_amp: false
90
+ log_interval: 50
91
+ use_matplotlib: true
92
+ use_tensorboard: true
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ pretrain_path: null
101
+ init_param: []
102
+ ignore_init_mismatch: false
103
+ freeze_param: []
104
+ num_iters_per_epoch: 1000
105
+ batch_size: 20
106
+ valid_batch_size: null
107
+ batch_bins: 3000000
108
+ valid_batch_bins: null
109
+ train_shape_file:
110
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
111
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
112
+ valid_shape_file:
113
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
114
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
115
+ batch_type: numel
116
+ valid_batch_type: null
117
+ fold_length:
118
+ - 150
119
+ - 204800
120
+ sort_in_batch: descending
121
+ sort_batch: descending
122
+ multiple_iterator: false
123
+ chunk_length: 500
124
+ chunk_shift_ratio: 0.5
125
+ num_cache_chunks: 1024
126
+ train_data_path_and_name_and_type:
127
+ - - dump/raw/tr_no_dev/text
128
+ - text
129
+ - text
130
+ - - dump/raw/tr_no_dev/wav.scp
131
+ - speech
132
+ - sound
133
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/pitch.scp
134
+ - pitch
135
+ - npy
136
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/energy.scp
137
+ - energy
138
+ - npy
139
+ valid_data_path_and_name_and_type:
140
+ - - dump/raw/dev/text
141
+ - text
142
+ - text
143
+ - - dump/raw/dev/wav.scp
144
+ - speech
145
+ - sound
146
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/pitch.scp
147
+ - pitch
148
+ - npy
149
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/energy.scp
150
+ - energy
151
+ - npy
152
+ allow_variable_data_keys: false
153
+ max_cache_size: 0.0
154
+ max_cache_fd: 32
155
+ valid_max_cache_size: null
156
+ optim: adamw
157
+ optim_conf:
158
+ lr: 0.0002
159
+ betas:
160
+ - 0.8
161
+ - 0.99
162
+ eps: 1.0e-09
163
+ weight_decay: 0.0
164
+ scheduler: exponentiallr
165
+ scheduler_conf:
166
+ gamma: 0.999875
167
+ optim2: adamw
168
+ optim2_conf:
169
+ lr: 0.0002
170
+ betas:
171
+ - 0.8
172
+ - 0.99
173
+ eps: 1.0e-09
174
+ weight_decay: 0.0
175
+ scheduler2: exponentiallr
176
+ scheduler2_conf:
177
+ gamma: 0.999875
178
+ generator_first: true
179
+ token_list:
180
+ - <blank>
181
+ - <unk>
182
+ - AH0
183
+ - N
184
+ - T
185
+ - D
186
+ - S
187
+ - R
188
+ - L
189
+ - DH
190
+ - K
191
+ - Z
192
+ - IH1
193
+ - IH0
194
+ - M
195
+ - EH1
196
+ - W
197
+ - P
198
+ - AE1
199
+ - AH1
200
+ - V
201
+ - ER0
202
+ - F
203
+ - ','
204
+ - AA1
205
+ - B
206
+ - HH
207
+ - IY1
208
+ - UW1
209
+ - IY0
210
+ - AO1
211
+ - EY1
212
+ - AY1
213
+ - .
214
+ - OW1
215
+ - SH
216
+ - NG
217
+ - G
218
+ - ER1
219
+ - CH
220
+ - JH
221
+ - Y
222
+ - AW1
223
+ - TH
224
+ - UH1
225
+ - EH2
226
+ - OW0
227
+ - EY2
228
+ - AO0
229
+ - IH2
230
+ - AE2
231
+ - AY2
232
+ - AA2
233
+ - UW0
234
+ - EH0
235
+ - OY1
236
+ - EY0
237
+ - AO2
238
+ - ZH
239
+ - OW2
240
+ - AE0
241
+ - UW2
242
+ - AH2
243
+ - AY0
244
+ - IY2
245
+ - AW2
246
+ - AA0
247
+ - ''''
248
+ - ER2
249
+ - UH2
250
+ - '?'
251
+ - OY2
252
+ - '!'
253
+ - AW0
254
+ - UH0
255
+ - OY0
256
+ - ..
257
+ - <sos/eos>
258
+ odim: null
259
+ model_conf: {}
260
+ use_preprocessor: true
261
+ token_type: phn
262
+ bpemodel: null
263
+ non_linguistic_symbols: null
264
+ cleaner: tacotron
265
+ g2p: g2p_en_no_space
266
+ feats_extract: fbank
267
+ feats_extract_conf:
268
+ n_fft: 1024
269
+ hop_length: 256
270
+ win_length: null
271
+ fs: 22050
272
+ fmin: 80
273
+ fmax: 7600
274
+ n_mels: 80
275
+ normalize: global_mvn
276
+ normalize_conf:
277
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
278
+ tts: jets
279
+ tts_conf:
280
+ generator_type: jets_generator
281
+ generator_params:
282
+ adim: 256
283
+ aheads: 2
284
+ elayers: 4
285
+ eunits: 1024
286
+ dlayers: 4
287
+ dunits: 1024
288
+ positionwise_layer_type: conv1d
289
+ positionwise_conv_kernel_size: 3
290
+ duration_predictor_layers: 2
291
+ duration_predictor_chans: 256
292
+ duration_predictor_kernel_size: 3
293
+ use_masking: true
294
+ encoder_normalize_before: true
295
+ decoder_normalize_before: true
296
+ encoder_type: transformer
297
+ decoder_type: transformer
298
+ conformer_rel_pos_type: latest
299
+ conformer_pos_enc_layer_type: rel_pos
300
+ conformer_self_attn_layer_type: rel_selfattn
301
+ conformer_activation_type: swish
302
+ use_macaron_style_in_conformer: true
303
+ use_cnn_in_conformer: true
304
+ conformer_enc_kernel_size: 7
305
+ conformer_dec_kernel_size: 31
306
+ init_type: xavier_uniform
307
+ transformer_enc_dropout_rate: 0.2
308
+ transformer_enc_positional_dropout_rate: 0.2
309
+ transformer_enc_attn_dropout_rate: 0.2
310
+ transformer_dec_dropout_rate: 0.2
311
+ transformer_dec_positional_dropout_rate: 0.2
312
+ transformer_dec_attn_dropout_rate: 0.2
313
+ pitch_predictor_layers: 5
314
+ pitch_predictor_chans: 256
315
+ pitch_predictor_kernel_size: 5
316
+ pitch_predictor_dropout: 0.5
317
+ pitch_embed_kernel_size: 1
318
+ pitch_embed_dropout: 0.0
319
+ stop_gradient_from_pitch_predictor: true
320
+ energy_predictor_layers: 2
321
+ energy_predictor_chans: 256
322
+ energy_predictor_kernel_size: 3
323
+ energy_predictor_dropout: 0.5
324
+ energy_embed_kernel_size: 1
325
+ energy_embed_dropout: 0.0
326
+ stop_gradient_from_energy_predictor: false
327
+ generator_out_channels: 1
328
+ generator_channels: 512
329
+ generator_global_channels: -1
330
+ generator_kernel_size: 7
331
+ generator_upsample_scales:
332
+ - 8
333
+ - 8
334
+ - 2
335
+ - 2
336
+ generator_upsample_kernel_sizes:
337
+ - 16
338
+ - 16
339
+ - 4
340
+ - 4
341
+ generator_resblock_kernel_sizes:
342
+ - 3
343
+ - 7
344
+ - 11
345
+ generator_resblock_dilations:
346
+ - - 1
347
+ - 3
348
+ - 5
349
+ - - 1
350
+ - 3
351
+ - 5
352
+ - - 1
353
+ - 3
354
+ - 5
355
+ generator_use_additional_convs: true
356
+ generator_bias: true
357
+ generator_nonlinear_activation: LeakyReLU
358
+ generator_nonlinear_activation_params:
359
+ negative_slope: 0.1
360
+ generator_use_weight_norm: true
361
+ segment_size: 64
362
+ idim: 78
363
+ odim: 80
364
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
365
+ discriminator_params:
366
+ scales: 1
367
+ scale_downsample_pooling: AvgPool1d
368
+ scale_downsample_pooling_params:
369
+ kernel_size: 4
370
+ stride: 2
371
+ padding: 2
372
+ scale_discriminator_params:
373
+ in_channels: 1
374
+ out_channels: 1
375
+ kernel_sizes:
376
+ - 15
377
+ - 41
378
+ - 5
379
+ - 3
380
+ channels: 128
381
+ max_downsample_channels: 1024
382
+ max_groups: 16
383
+ bias: true
384
+ downsample_scales:
385
+ - 2
386
+ - 2
387
+ - 4
388
+ - 4
389
+ - 1
390
+ nonlinear_activation: LeakyReLU
391
+ nonlinear_activation_params:
392
+ negative_slope: 0.1
393
+ use_weight_norm: true
394
+ use_spectral_norm: false
395
+ follow_official_norm: false
396
+ periods:
397
+ - 2
398
+ - 3
399
+ - 5
400
+ - 7
401
+ - 11
402
+ period_discriminator_params:
403
+ in_channels: 1
404
+ out_channels: 1
405
+ kernel_sizes:
406
+ - 5
407
+ - 3
408
+ channels: 32
409
+ downsample_scales:
410
+ - 3
411
+ - 3
412
+ - 3
413
+ - 3
414
+ - 1
415
+ max_downsample_channels: 1024
416
+ bias: true
417
+ nonlinear_activation: LeakyReLU
418
+ nonlinear_activation_params:
419
+ negative_slope: 0.1
420
+ use_weight_norm: true
421
+ use_spectral_norm: false
422
+ generator_adv_loss_params:
423
+ average_by_discriminators: false
424
+ loss_type: mse
425
+ discriminator_adv_loss_params:
426
+ average_by_discriminators: false
427
+ loss_type: mse
428
+ feat_match_loss_params:
429
+ average_by_discriminators: false
430
+ average_by_layers: false
431
+ include_final_outputs: true
432
+ mel_loss_params:
433
+ fs: 22050
434
+ n_fft: 1024
435
+ hop_length: 256
436
+ win_length: null
437
+ window: hann
438
+ n_mels: 80
439
+ fmin: 0
440
+ fmax: null
441
+ log_base: null
442
+ lambda_adv: 1.0
443
+ lambda_mel: 45.0
444
+ lambda_feat_match: 2.0
445
+ lambda_var: 1.0
446
+ lambda_align: 2.0
447
+ sampling_rate: 22050
448
+ cache_generator_outputs: true
449
+ pitch_extract: dio
450
+ pitch_extract_conf:
451
+ reduction_factor: 1
452
+ use_token_averaged_f0: false
453
+ fs: 22050
454
+ n_fft: 1024
455
+ hop_length: 256
456
+ f0max: 400
457
+ f0min: 80
458
+ pitch_normalize: global_mvn
459
+ pitch_normalize_conf:
460
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz
461
+ energy_extract: energy
462
+ energy_extract_conf:
463
+ reduction_factor: 1
464
+ use_token_averaged_energy: false
465
+ fs: 22050
466
+ n_fft: 1024
467
+ hop_length: 256
468
+ win_length: null
469
+ energy_normalize: global_mvn
470
+ energy_normalize_conf:
471
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz
472
+ required:
473
+ - output_dir
474
+ - token_list
475
+ version: '202204'
476
+ distributed: true
477
+ ```
478
+
479
+ </details>
480
+
481
+
482
+
483
+ ### Citing ESPnet
484
+
485
+ ```BibTex
486
+ @inproceedings{watanabe2018espnet,
487
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
488
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
489
+ year={2018},
490
+ booktitle={Proceedings of Interspeech},
491
+ pages={2207--2211},
492
+ doi={10.21437/Interspeech.2018-1456},
493
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
494
+ }
495
+
496
+
497
+
498
+
499
+ @inproceedings{hayashi2020espnet,
500
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
501
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
502
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
503
+ pages={7654--7658},
504
+ year={2020},
505
+ organization={IEEE}
506
+ }
507
+ ```
508
+
509
+ or arXiv:
510
+
511
+ ```bibtex
512
+ @misc{watanabe2018espnet,
513
+ title={ESPnet: End-to-End Speech Processing Toolkit},
514
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
515
+ year={2018},
516
+ eprint={1804.00015},
517
+ archivePrefix={arXiv},
518
+ primaryClass={cs.CL}
519
+ }
520
+ ```
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz ADDED
Binary file (770 Bytes). View file
 
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz ADDED
Binary file (770 Bytes). View file
 
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_jets.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 39471
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - text2mel_loss
39
+ - min
40
+ - - train
41
+ - text2mel_loss
42
+ - min
43
+ - - train
44
+ - total_count
45
+ - max
46
+ keep_nbest_models: 5
47
+ nbest_averaging_interval: 0
48
+ grad_clip: -1
49
+ grad_clip_type: 2.0
50
+ grad_noise: false
51
+ accum_grad: 1
52
+ no_forward_run: false
53
+ resume: true
54
+ train_dtype: float32
55
+ use_amp: false
56
+ log_interval: 50
57
+ use_matplotlib: true
58
+ use_tensorboard: true
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: 1000
71
+ batch_size: 20
72
+ valid_batch_size: null
73
+ batch_bins: 3000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
77
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
78
+ valid_shape_file:
79
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
80
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
81
+ batch_type: numel
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 150
85
+ - 204800
86
+ sort_in_batch: descending
87
+ sort_batch: descending
88
+ multiple_iterator: false
89
+ chunk_length: 500
90
+ chunk_shift_ratio: 0.5
91
+ num_cache_chunks: 1024
92
+ train_data_path_and_name_and_type:
93
+ - - dump/raw/tr_no_dev/text
94
+ - text
95
+ - text
96
+ - - dump/raw/tr_no_dev/wav.scp
97
+ - speech
98
+ - sound
99
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/pitch.scp
100
+ - pitch
101
+ - npy
102
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/energy.scp
103
+ - energy
104
+ - npy
105
+ valid_data_path_and_name_and_type:
106
+ - - dump/raw/dev/text
107
+ - text
108
+ - text
109
+ - - dump/raw/dev/wav.scp
110
+ - speech
111
+ - sound
112
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/pitch.scp
113
+ - pitch
114
+ - npy
115
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/energy.scp
116
+ - energy
117
+ - npy
118
+ allow_variable_data_keys: false
119
+ max_cache_size: 0.0
120
+ max_cache_fd: 32
121
+ valid_max_cache_size: null
122
+ optim: adamw
123
+ optim_conf:
124
+ lr: 0.0002
125
+ betas:
126
+ - 0.8
127
+ - 0.99
128
+ eps: 1.0e-09
129
+ weight_decay: 0.0
130
+ scheduler: exponentiallr
131
+ scheduler_conf:
132
+ gamma: 0.999875
133
+ optim2: adamw
134
+ optim2_conf:
135
+ lr: 0.0002
136
+ betas:
137
+ - 0.8
138
+ - 0.99
139
+ eps: 1.0e-09
140
+ weight_decay: 0.0
141
+ scheduler2: exponentiallr
142
+ scheduler2_conf:
143
+ gamma: 0.999875
144
+ generator_first: true
145
+ token_list:
146
+ - <blank>
147
+ - <unk>
148
+ - AH0
149
+ - N
150
+ - T
151
+ - D
152
+ - S
153
+ - R
154
+ - L
155
+ - DH
156
+ - K
157
+ - Z
158
+ - IH1
159
+ - IH0
160
+ - M
161
+ - EH1
162
+ - W
163
+ - P
164
+ - AE1
165
+ - AH1
166
+ - V
167
+ - ER0
168
+ - F
169
+ - ','
170
+ - AA1
171
+ - B
172
+ - HH
173
+ - IY1
174
+ - UW1
175
+ - IY0
176
+ - AO1
177
+ - EY1
178
+ - AY1
179
+ - .
180
+ - OW1
181
+ - SH
182
+ - NG
183
+ - G
184
+ - ER1
185
+ - CH
186
+ - JH
187
+ - Y
188
+ - AW1
189
+ - TH
190
+ - UH1
191
+ - EH2
192
+ - OW0
193
+ - EY2
194
+ - AO0
195
+ - IH2
196
+ - AE2
197
+ - AY2
198
+ - AA2
199
+ - UW0
200
+ - EH0
201
+ - OY1
202
+ - EY0
203
+ - AO2
204
+ - ZH
205
+ - OW2
206
+ - AE0
207
+ - UW2
208
+ - AH2
209
+ - AY0
210
+ - IY2
211
+ - AW2
212
+ - AA0
213
+ - ''''
214
+ - ER2
215
+ - UH2
216
+ - '?'
217
+ - OY2
218
+ - '!'
219
+ - AW0
220
+ - UH0
221
+ - OY0
222
+ - ..
223
+ - <sos/eos>
224
+ odim: null
225
+ model_conf: {}
226
+ use_preprocessor: true
227
+ token_type: phn
228
+ bpemodel: null
229
+ non_linguistic_symbols: null
230
+ cleaner: tacotron
231
+ g2p: g2p_en_no_space
232
+ feats_extract: fbank
233
+ feats_extract_conf:
234
+ n_fft: 1024
235
+ hop_length: 256
236
+ win_length: null
237
+ fs: 22050
238
+ fmin: 80
239
+ fmax: 7600
240
+ n_mels: 80
241
+ normalize: global_mvn
242
+ normalize_conf:
243
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
244
+ tts: jets
245
+ tts_conf:
246
+ generator_type: jets_generator
247
+ generator_params:
248
+ adim: 256
249
+ aheads: 2
250
+ elayers: 4
251
+ eunits: 1024
252
+ dlayers: 4
253
+ dunits: 1024
254
+ positionwise_layer_type: conv1d
255
+ positionwise_conv_kernel_size: 3
256
+ duration_predictor_layers: 2
257
+ duration_predictor_chans: 256
258
+ duration_predictor_kernel_size: 3
259
+ use_masking: true
260
+ encoder_normalize_before: true
261
+ decoder_normalize_before: true
262
+ encoder_type: transformer
263
+ decoder_type: transformer
264
+ conformer_rel_pos_type: latest
265
+ conformer_pos_enc_layer_type: rel_pos
266
+ conformer_self_attn_layer_type: rel_selfattn
267
+ conformer_activation_type: swish
268
+ use_macaron_style_in_conformer: true
269
+ use_cnn_in_conformer: true
270
+ conformer_enc_kernel_size: 7
271
+ conformer_dec_kernel_size: 31
272
+ init_type: xavier_uniform
273
+ transformer_enc_dropout_rate: 0.2
274
+ transformer_enc_positional_dropout_rate: 0.2
275
+ transformer_enc_attn_dropout_rate: 0.2
276
+ transformer_dec_dropout_rate: 0.2
277
+ transformer_dec_positional_dropout_rate: 0.2
278
+ transformer_dec_attn_dropout_rate: 0.2
279
+ pitch_predictor_layers: 5
280
+ pitch_predictor_chans: 256
281
+ pitch_predictor_kernel_size: 5
282
+ pitch_predictor_dropout: 0.5
283
+ pitch_embed_kernel_size: 1
284
+ pitch_embed_dropout: 0.0
285
+ stop_gradient_from_pitch_predictor: true
286
+ energy_predictor_layers: 2
287
+ energy_predictor_chans: 256
288
+ energy_predictor_kernel_size: 3
289
+ energy_predictor_dropout: 0.5
290
+ energy_embed_kernel_size: 1
291
+ energy_embed_dropout: 0.0
292
+ stop_gradient_from_energy_predictor: false
293
+ generator_out_channels: 1
294
+ generator_channels: 512
295
+ generator_global_channels: -1
296
+ generator_kernel_size: 7
297
+ generator_upsample_scales:
298
+ - 8
299
+ - 8
300
+ - 2
301
+ - 2
302
+ generator_upsample_kernel_sizes:
303
+ - 16
304
+ - 16
305
+ - 4
306
+ - 4
307
+ generator_resblock_kernel_sizes:
308
+ - 3
309
+ - 7
310
+ - 11
311
+ generator_resblock_dilations:
312
+ - - 1
313
+ - 3
314
+ - 5
315
+ - - 1
316
+ - 3
317
+ - 5
318
+ - - 1
319
+ - 3
320
+ - 5
321
+ generator_use_additional_convs: true
322
+ generator_bias: true
323
+ generator_nonlinear_activation: LeakyReLU
324
+ generator_nonlinear_activation_params:
325
+ negative_slope: 0.1
326
+ generator_use_weight_norm: true
327
+ segment_size: 64
328
+ idim: 78
329
+ odim: 80
330
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
331
+ discriminator_params:
332
+ scales: 1
333
+ scale_downsample_pooling: AvgPool1d
334
+ scale_downsample_pooling_params:
335
+ kernel_size: 4
336
+ stride: 2
337
+ padding: 2
338
+ scale_discriminator_params:
339
+ in_channels: 1
340
+ out_channels: 1
341
+ kernel_sizes:
342
+ - 15
343
+ - 41
344
+ - 5
345
+ - 3
346
+ channels: 128
347
+ max_downsample_channels: 1024
348
+ max_groups: 16
349
+ bias: true
350
+ downsample_scales:
351
+ - 2
352
+ - 2
353
+ - 4
354
+ - 4
355
+ - 1
356
+ nonlinear_activation: LeakyReLU
357
+ nonlinear_activation_params:
358
+ negative_slope: 0.1
359
+ use_weight_norm: true
360
+ use_spectral_norm: false
361
+ follow_official_norm: false
362
+ periods:
363
+ - 2
364
+ - 3
365
+ - 5
366
+ - 7
367
+ - 11
368
+ period_discriminator_params:
369
+ in_channels: 1
370
+ out_channels: 1
371
+ kernel_sizes:
372
+ - 5
373
+ - 3
374
+ channels: 32
375
+ downsample_scales:
376
+ - 3
377
+ - 3
378
+ - 3
379
+ - 3
380
+ - 1
381
+ max_downsample_channels: 1024
382
+ bias: true
383
+ nonlinear_activation: LeakyReLU
384
+ nonlinear_activation_params:
385
+ negative_slope: 0.1
386
+ use_weight_norm: true
387
+ use_spectral_norm: false
388
+ generator_adv_loss_params:
389
+ average_by_discriminators: false
390
+ loss_type: mse
391
+ discriminator_adv_loss_params:
392
+ average_by_discriminators: false
393
+ loss_type: mse
394
+ feat_match_loss_params:
395
+ average_by_discriminators: false
396
+ average_by_layers: false
397
+ include_final_outputs: true
398
+ mel_loss_params:
399
+ fs: 22050
400
+ n_fft: 1024
401
+ hop_length: 256
402
+ win_length: null
403
+ window: hann
404
+ n_mels: 80
405
+ fmin: 0
406
+ fmax: null
407
+ log_base: null
408
+ lambda_adv: 1.0
409
+ lambda_mel: 45.0
410
+ lambda_feat_match: 2.0
411
+ lambda_var: 1.0
412
+ lambda_align: 2.0
413
+ sampling_rate: 22050
414
+ cache_generator_outputs: true
415
+ pitch_extract: dio
416
+ pitch_extract_conf:
417
+ reduction_factor: 1
418
+ use_token_averaged_f0: false
419
+ fs: 22050
420
+ n_fft: 1024
421
+ hop_length: 256
422
+ f0max: 400
423
+ f0min: 80
424
+ pitch_normalize: global_mvn
425
+ pitch_normalize_conf:
426
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz
427
+ energy_extract: energy
428
+ energy_extract_conf:
429
+ reduction_factor: 1
430
+ use_token_averaged_energy: false
431
+ fs: 22050
432
+ n_fft: 1024
433
+ hop_length: 256
434
+ win_length: null
435
+ energy_normalize: global_mvn
436
+ energy_normalize_conf:
437
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz
438
+ required:
439
+ - output_dir
440
+ - token_list
441
+ version: '202204'
442
+ distributed: true
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_bin_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_forwardsum_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_adv_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_feat_match_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_mel_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_dur_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_energy_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_pitch_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/train_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5703509bc14fc32a4a1b1326c6579589a07cbeb02a99ff3683e9be88722546c6
3
+ size 333689355
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202204'
2
+ files:
3
+ model_file: exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_5best.pth
4
+ python: "3.8.1 (default, Jan 26 2022, 13:46:13) \n[GCC 7.5.0]"
5
+ timestamp: 1653755192.099963
6
+ torch: 1.10.1+cu111
7
+ yaml_files:
8
+ train_config: exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/config.yaml