imdanboy committed on
Commit
91ac199
1 Parent(s): ebaa37e

Update model

Browse files
Files changed (35) hide show
  1. README.md +524 -0
  2. exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz +3 -0
  3. exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz +3 -0
  4. exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz +3 -0
  5. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/config.yaml +446 -0
  6. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png +0 -0
  7. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png +0 -0
  8. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png +0 -0
  9. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png +0 -0
  10. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png +0 -0
  11. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png +0 -0
  12. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png +0 -0
  13. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_bin_loss.png +0 -0
  14. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_forwardsum_loss.png +0 -0
  15. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_loss.png +0 -0
  16. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png +0 -0
  17. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png +0 -0
  18. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_adv_loss.png +0 -0
  19. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_feat_match_loss.png +0 -0
  20. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_loss.png +0 -0
  21. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_mel_loss.png +0 -0
  22. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png +0 -0
  23. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png +0 -0
  24. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png +0 -0
  25. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_dur_loss.png +0 -0
  26. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_energy_loss.png +0 -0
  27. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_loss.png +0 -0
  28. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_pitch_loss.png +0 -0
  29. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png +0 -0
  30. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png +0 -0
  31. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png +0 -0
  32. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png +0 -0
  33. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/train_time.png +0 -0
  34. exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_5best.pth +3 -0
  35. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,527 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - ljspeech
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `imdanboy/ljspeech_jets`
15
+
16
+ This model was trained by imdanboy using the ljspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 6f93936c439a84f6758b1ed12b8714d5e543fb8d
26
+ pip install -e .
27
+ cd egs2/ljspeech/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model imdanboy/ljspeech_jets
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_jets.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space
44
+ ngpu: 1
45
+ seed: 777
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 4
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 47369
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: false
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 1000
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - text2mel_loss
76
+ - min
77
+ - - train
78
+ - text2mel_loss
79
+ - min
80
+ - - train
81
+ - total_count
82
+ - max
83
+ keep_nbest_models: 5
84
+ nbest_averaging_interval: 0
85
+ grad_clip: -1
86
+ grad_clip_type: 2.0
87
+ grad_noise: false
88
+ accum_grad: 1
89
+ no_forward_run: false
90
+ resume: true
91
+ train_dtype: float32
92
+ use_amp: false
93
+ log_interval: 50
94
+ use_matplotlib: true
95
+ use_tensorboard: true
96
+ create_graph_in_tensorboard: false
97
+ use_wandb: false
98
+ wandb_project: null
99
+ wandb_id: null
100
+ wandb_entity: null
101
+ wandb_name: null
102
+ wandb_model_log_interval: -1
103
+ detect_anomaly: false
104
+ pretrain_path: null
105
+ init_param: []
106
+ ignore_init_mismatch: false
107
+ freeze_param: []
108
+ num_iters_per_epoch: 1000
109
+ batch_size: 20
110
+ valid_batch_size: null
111
+ batch_bins: 5000000
112
+ valid_batch_bins: null
113
+ train_shape_file:
114
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
115
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
116
+ valid_shape_file:
117
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
118
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
119
+ batch_type: numel
120
+ valid_batch_type: null
121
+ fold_length:
122
+ - 150
123
+ - 204800
124
+ sort_in_batch: descending
125
+ sort_batch: descending
126
+ multiple_iterator: false
127
+ chunk_length: 500
128
+ chunk_shift_ratio: 0.5
129
+ num_cache_chunks: 1024
130
+ chunk_excluded_key_prefixes: []
131
+ train_data_path_and_name_and_type:
132
+ - - dump/raw/tr_no_dev/text
133
+ - text
134
+ - text
135
+ - - dump/raw/tr_no_dev/wav.scp
136
+ - speech
137
+ - sound
138
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/pitch.scp
139
+ - pitch
140
+ - npy
141
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/energy.scp
142
+ - energy
143
+ - npy
144
+ valid_data_path_and_name_and_type:
145
+ - - dump/raw/dev/text
146
+ - text
147
+ - text
148
+ - - dump/raw/dev/wav.scp
149
+ - speech
150
+ - sound
151
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/pitch.scp
152
+ - pitch
153
+ - npy
154
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/energy.scp
155
+ - energy
156
+ - npy
157
+ allow_variable_data_keys: false
158
+ max_cache_size: 0.0
159
+ max_cache_fd: 32
160
+ valid_max_cache_size: null
161
+ exclude_weight_decay: false
162
+ exclude_weight_decay_conf: {}
163
+ optim: adamw
164
+ optim_conf:
165
+ lr: 0.0002
166
+ betas:
167
+ - 0.8
168
+ - 0.99
169
+ eps: 1.0e-09
170
+ weight_decay: 0.0
171
+ scheduler: exponentiallr
172
+ scheduler_conf:
173
+ gamma: 0.999875
174
+ optim2: adamw
175
+ optim2_conf:
176
+ lr: 0.0002
177
+ betas:
178
+ - 0.8
179
+ - 0.99
180
+ eps: 1.0e-09
181
+ weight_decay: 0.0
182
+ scheduler2: exponentiallr
183
+ scheduler2_conf:
184
+ gamma: 0.999875
185
+ generator_first: true
186
+ token_list:
187
+ - <blank>
188
+ - <unk>
189
+ - AH0
190
+ - N
191
+ - T
192
+ - D
193
+ - S
194
+ - R
195
+ - L
196
+ - DH
197
+ - K
198
+ - Z
199
+ - IH1
200
+ - IH0
201
+ - M
202
+ - EH1
203
+ - W
204
+ - P
205
+ - AE1
206
+ - AH1
207
+ - V
208
+ - ER0
209
+ - F
210
+ - ','
211
+ - AA1
212
+ - B
213
+ - HH
214
+ - IY1
215
+ - UW1
216
+ - IY0
217
+ - AO1
218
+ - EY1
219
+ - AY1
220
+ - .
221
+ - OW1
222
+ - SH
223
+ - NG
224
+ - G
225
+ - ER1
226
+ - CH
227
+ - JH
228
+ - Y
229
+ - AW1
230
+ - TH
231
+ - UH1
232
+ - EH2
233
+ - OW0
234
+ - EY2
235
+ - AO0
236
+ - IH2
237
+ - AE2
238
+ - AY2
239
+ - AA2
240
+ - UW0
241
+ - EH0
242
+ - OY1
243
+ - EY0
244
+ - AO2
245
+ - ZH
246
+ - OW2
247
+ - AE0
248
+ - UW2
249
+ - AH2
250
+ - AY0
251
+ - IY2
252
+ - AW2
253
+ - AA0
254
+ - ''''
255
+ - ER2
256
+ - UH2
257
+ - '?'
258
+ - OY2
259
+ - '!'
260
+ - AW0
261
+ - UH0
262
+ - OY0
263
+ - ..
264
+ - <sos/eos>
265
+ odim: null
266
+ model_conf: {}
267
+ use_preprocessor: true
268
+ token_type: phn
269
+ bpemodel: null
270
+ non_linguistic_symbols: null
271
+ cleaner: tacotron
272
+ g2p: g2p_en_no_space
273
+ feats_extract: fbank
274
+ feats_extract_conf:
275
+ n_fft: 1024
276
+ hop_length: 256
277
+ win_length: null
278
+ fs: 22050
279
+ fmin: 80
280
+ fmax: 7600
281
+ n_mels: 80
282
+ normalize: global_mvn
283
+ normalize_conf:
284
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
285
+ tts: jets
286
+ tts_conf:
287
+ generator_type: jets_generator
288
+ generator_params:
289
+ adim: 256
290
+ aheads: 2
291
+ elayers: 4
292
+ eunits: 1024
293
+ dlayers: 4
294
+ dunits: 1024
295
+ positionwise_layer_type: conv1d
296
+ positionwise_conv_kernel_size: 3
297
+ duration_predictor_layers: 2
298
+ duration_predictor_chans: 256
299
+ duration_predictor_kernel_size: 3
300
+ use_masking: true
301
+ encoder_normalize_before: true
302
+ decoder_normalize_before: true
303
+ encoder_type: transformer
304
+ decoder_type: transformer
305
+ conformer_rel_pos_type: latest
306
+ conformer_pos_enc_layer_type: rel_pos
307
+ conformer_self_attn_layer_type: rel_selfattn
308
+ conformer_activation_type: swish
309
+ use_macaron_style_in_conformer: true
310
+ use_cnn_in_conformer: true
311
+ conformer_enc_kernel_size: 7
312
+ conformer_dec_kernel_size: 31
313
+ init_type: xavier_uniform
314
+ transformer_enc_dropout_rate: 0.2
315
+ transformer_enc_positional_dropout_rate: 0.2
316
+ transformer_enc_attn_dropout_rate: 0.2
317
+ transformer_dec_dropout_rate: 0.2
318
+ transformer_dec_positional_dropout_rate: 0.2
319
+ transformer_dec_attn_dropout_rate: 0.2
320
+ pitch_predictor_layers: 5
321
+ pitch_predictor_chans: 256
322
+ pitch_predictor_kernel_size: 5
323
+ pitch_predictor_dropout: 0.5
324
+ pitch_embed_kernel_size: 1
325
+ pitch_embed_dropout: 0.0
326
+ stop_gradient_from_pitch_predictor: true
327
+ energy_predictor_layers: 2
328
+ energy_predictor_chans: 256
329
+ energy_predictor_kernel_size: 3
330
+ energy_predictor_dropout: 0.5
331
+ energy_embed_kernel_size: 1
332
+ energy_embed_dropout: 0.0
333
+ stop_gradient_from_energy_predictor: false
334
+ generator_out_channels: 1
335
+ generator_channels: 512
336
+ generator_global_channels: -1
337
+ generator_kernel_size: 7
338
+ generator_upsample_scales:
339
+ - 8
340
+ - 8
341
+ - 2
342
+ - 2
343
+ generator_upsample_kernel_sizes:
344
+ - 16
345
+ - 16
346
+ - 4
347
+ - 4
348
+ generator_resblock_kernel_sizes:
349
+ - 3
350
+ - 7
351
+ - 11
352
+ generator_resblock_dilations:
353
+ - - 1
354
+ - 3
355
+ - 5
356
+ - - 1
357
+ - 3
358
+ - 5
359
+ - - 1
360
+ - 3
361
+ - 5
362
+ generator_use_additional_convs: true
363
+ generator_bias: true
364
+ generator_nonlinear_activation: LeakyReLU
365
+ generator_nonlinear_activation_params:
366
+ negative_slope: 0.1
367
+ generator_use_weight_norm: true
368
+ segment_size: 32
369
+ idim: 78
370
+ odim: 80
371
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
372
+ discriminator_params:
373
+ scales: 1
374
+ scale_downsample_pooling: AvgPool1d
375
+ scale_downsample_pooling_params:
376
+ kernel_size: 4
377
+ stride: 2
378
+ padding: 2
379
+ scale_discriminator_params:
380
+ in_channels: 1
381
+ out_channels: 1
382
+ kernel_sizes:
383
+ - 15
384
+ - 41
385
+ - 5
386
+ - 3
387
+ channels: 128
388
+ max_downsample_channels: 1024
389
+ max_groups: 16
390
+ bias: true
391
+ downsample_scales:
392
+ - 2
393
+ - 2
394
+ - 4
395
+ - 4
396
+ - 1
397
+ nonlinear_activation: LeakyReLU
398
+ nonlinear_activation_params:
399
+ negative_slope: 0.1
400
+ use_weight_norm: true
401
+ use_spectral_norm: false
402
+ follow_official_norm: false
403
+ periods:
404
+ - 2
405
+ - 3
406
+ - 5
407
+ - 7
408
+ - 11
409
+ period_discriminator_params:
410
+ in_channels: 1
411
+ out_channels: 1
412
+ kernel_sizes:
413
+ - 5
414
+ - 3
415
+ channels: 32
416
+ downsample_scales:
417
+ - 3
418
+ - 3
419
+ - 3
420
+ - 3
421
+ - 1
422
+ max_downsample_channels: 1024
423
+ bias: true
424
+ nonlinear_activation: LeakyReLU
425
+ nonlinear_activation_params:
426
+ negative_slope: 0.1
427
+ use_weight_norm: true
428
+ use_spectral_norm: false
429
+ generator_adv_loss_params:
430
+ average_by_discriminators: false
431
+ loss_type: mse
432
+ discriminator_adv_loss_params:
433
+ average_by_discriminators: false
434
+ loss_type: mse
435
+ feat_match_loss_params:
436
+ average_by_discriminators: false
437
+ average_by_layers: false
438
+ include_final_outputs: true
439
+ mel_loss_params:
440
+ fs: 22050
441
+ n_fft: 1024
442
+ hop_length: 256
443
+ win_length: null
444
+ window: hann
445
+ n_mels: 80
446
+ fmin: 0
447
+ fmax: null
448
+ log_base: null
449
+ lambda_adv: 1.0
450
+ lambda_mel: 45.0
451
+ lambda_feat_match: 2.0
452
+ lambda_var: 1.0
453
+ lambda_align: 1.0
454
+ sampling_rate: 22050
455
+ cache_generator_outputs: true
456
+ pitch_extract: dio
457
+ pitch_extract_conf:
458
+ reduction_factor: 1
459
+ use_token_averaged_f0: false
460
+ fs: 22050
461
+ n_fft: 1024
462
+ hop_length: 256
463
+ f0max: 400
464
+ f0min: 80
465
+ pitch_normalize: global_mvn
466
+ pitch_normalize_conf:
467
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz
468
+ energy_extract: energy
469
+ energy_extract_conf:
470
+ reduction_factor: 1
471
+ use_token_averaged_energy: false
472
+ fs: 22050
473
+ n_fft: 1024
474
+ hop_length: 256
475
+ win_length: null
476
+ energy_normalize: global_mvn
477
+ energy_normalize_conf:
478
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz
479
+ required:
480
+ - output_dir
481
+ - token_list
482
+ version: '202304'
483
+ distributed: true
484
+ ```
485
+
486
+ </details>
487
+
488
+
489
+
490
+ ### Citing ESPnet
491
+
492
+ ```bibtex
493
+ @inproceedings{watanabe2018espnet,
494
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
495
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
496
+ year={2018},
497
+ booktitle={Proceedings of Interspeech},
498
+ pages={2207--2211},
499
+ doi={10.21437/Interspeech.2018-1456},
500
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
501
+ }
502
+
503
+
504
+
505
+
506
+ @inproceedings{hayashi2020espnet,
507
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
508
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
509
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
510
+ pages={7654--7658},
511
+ year={2020},
512
+ organization={IEEE}
513
+ }
514
+ ```
515
+
516
+ or arXiv:
517
+
518
+ ```bibtex
519
+ @misc{watanabe2018espnet,
520
+ title={ESPnet: End-to-End Speech Processing Toolkit},
521
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
522
+ year={2018},
523
+ eprint={1804.00015},
524
+ archivePrefix={arXiv},
525
+ primaryClass={cs.CL}
526
+ }
527
+ ```
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0685f0325f4b857068f350d3a30aa2ec94f56a7968f0fb3c86da67bf52438ab7
3
+ size 770
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bd9b064688bd8764295a74f4206805636762619d674c8313fef16e2d3fd6244
3
+ size 1402
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61758f1c1576d4fd279e6f68508aca55d82d300283f7202613a8171ac3ee8824
3
+ size 770
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,446 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_jets.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 47369
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 1000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - text2mel_loss
39
+ - min
40
+ - - train
41
+ - text2mel_loss
42
+ - min
43
+ - - train
44
+ - total_count
45
+ - max
46
+ keep_nbest_models: 5
47
+ nbest_averaging_interval: 0
48
+ grad_clip: -1
49
+ grad_clip_type: 2.0
50
+ grad_noise: false
51
+ accum_grad: 1
52
+ no_forward_run: false
53
+ resume: true
54
+ train_dtype: float32
55
+ use_amp: false
56
+ log_interval: 50
57
+ use_matplotlib: true
58
+ use_tensorboard: true
59
+ create_graph_in_tensorboard: false
60
+ use_wandb: false
61
+ wandb_project: null
62
+ wandb_id: null
63
+ wandb_entity: null
64
+ wandb_name: null
65
+ wandb_model_log_interval: -1
66
+ detect_anomaly: false
67
+ pretrain_path: null
68
+ init_param: []
69
+ ignore_init_mismatch: false
70
+ freeze_param: []
71
+ num_iters_per_epoch: 1000
72
+ batch_size: 20
73
+ valid_batch_size: null
74
+ batch_bins: 5000000
75
+ valid_batch_bins: null
76
+ train_shape_file:
77
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
78
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
79
+ valid_shape_file:
80
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
81
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
82
+ batch_type: numel
83
+ valid_batch_type: null
84
+ fold_length:
85
+ - 150
86
+ - 204800
87
+ sort_in_batch: descending
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 500
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ chunk_excluded_key_prefixes: []
94
+ train_data_path_and_name_and_type:
95
+ - - dump/raw/tr_no_dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/tr_no_dev/wav.scp
99
+ - speech
100
+ - sound
101
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/pitch.scp
102
+ - pitch
103
+ - npy
104
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/energy.scp
105
+ - energy
106
+ - npy
107
+ valid_data_path_and_name_and_type:
108
+ - - dump/raw/dev/text
109
+ - text
110
+ - text
111
+ - - dump/raw/dev/wav.scp
112
+ - speech
113
+ - sound
114
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/pitch.scp
115
+ - pitch
116
+ - npy
117
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/energy.scp
118
+ - energy
119
+ - npy
120
+ allow_variable_data_keys: false
121
+ max_cache_size: 0.0
122
+ max_cache_fd: 32
123
+ valid_max_cache_size: null
124
+ exclude_weight_decay: false
125
+ exclude_weight_decay_conf: {}
126
+ optim: adamw
127
+ optim_conf:
128
+ lr: 0.0002
129
+ betas:
130
+ - 0.8
131
+ - 0.99
132
+ eps: 1.0e-09
133
+ weight_decay: 0.0
134
+ scheduler: exponentiallr
135
+ scheduler_conf:
136
+ gamma: 0.999875
137
+ optim2: adamw
138
+ optim2_conf:
139
+ lr: 0.0002
140
+ betas:
141
+ - 0.8
142
+ - 0.99
143
+ eps: 1.0e-09
144
+ weight_decay: 0.0
145
+ scheduler2: exponentiallr
146
+ scheduler2_conf:
147
+ gamma: 0.999875
148
+ generator_first: true
149
+ token_list:
150
+ - <blank>
151
+ - <unk>
152
+ - AH0
153
+ - N
154
+ - T
155
+ - D
156
+ - S
157
+ - R
158
+ - L
159
+ - DH
160
+ - K
161
+ - Z
162
+ - IH1
163
+ - IH0
164
+ - M
165
+ - EH1
166
+ - W
167
+ - P
168
+ - AE1
169
+ - AH1
170
+ - V
171
+ - ER0
172
+ - F
173
+ - ','
174
+ - AA1
175
+ - B
176
+ - HH
177
+ - IY1
178
+ - UW1
179
+ - IY0
180
+ - AO1
181
+ - EY1
182
+ - AY1
183
+ - .
184
+ - OW1
185
+ - SH
186
+ - NG
187
+ - G
188
+ - ER1
189
+ - CH
190
+ - JH
191
+ - Y
192
+ - AW1
193
+ - TH
194
+ - UH1
195
+ - EH2
196
+ - OW0
197
+ - EY2
198
+ - AO0
199
+ - IH2
200
+ - AE2
201
+ - AY2
202
+ - AA2
203
+ - UW0
204
+ - EH0
205
+ - OY1
206
+ - EY0
207
+ - AO2
208
+ - ZH
209
+ - OW2
210
+ - AE0
211
+ - UW2
212
+ - AH2
213
+ - AY0
214
+ - IY2
215
+ - AW2
216
+ - AA0
217
+ - ''''
218
+ - ER2
219
+ - UH2
220
+ - '?'
221
+ - OY2
222
+ - '!'
223
+ - AW0
224
+ - UH0
225
+ - OY0
226
+ - ..
227
+ - <sos/eos>
228
+ odim: null
229
+ model_conf: {}
230
+ use_preprocessor: true
231
+ token_type: phn
232
+ bpemodel: null
233
+ non_linguistic_symbols: null
234
+ cleaner: tacotron
235
+ g2p: g2p_en_no_space
236
+ feats_extract: fbank
237
+ feats_extract_conf:
238
+ n_fft: 1024
239
+ hop_length: 256
240
+ win_length: null
241
+ fs: 22050
242
+ fmin: 80
243
+ fmax: 7600
244
+ n_mels: 80
245
+ normalize: global_mvn
246
+ normalize_conf:
247
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
248
+ tts: jets
249
+ tts_conf:
250
+ generator_type: jets_generator
251
+ generator_params:
252
+ adim: 256
253
+ aheads: 2
254
+ elayers: 4
255
+ eunits: 1024
256
+ dlayers: 4
257
+ dunits: 1024
258
+ positionwise_layer_type: conv1d
259
+ positionwise_conv_kernel_size: 3
260
+ duration_predictor_layers: 2
261
+ duration_predictor_chans: 256
262
+ duration_predictor_kernel_size: 3
263
+ use_masking: true
264
+ encoder_normalize_before: true
265
+ decoder_normalize_before: true
266
+ encoder_type: transformer
267
+ decoder_type: transformer
268
+ conformer_rel_pos_type: latest
269
+ conformer_pos_enc_layer_type: rel_pos
270
+ conformer_self_attn_layer_type: rel_selfattn
271
+ conformer_activation_type: swish
272
+ use_macaron_style_in_conformer: true
273
+ use_cnn_in_conformer: true
274
+ conformer_enc_kernel_size: 7
275
+ conformer_dec_kernel_size: 31
276
+ init_type: xavier_uniform
277
+ transformer_enc_dropout_rate: 0.2
278
+ transformer_enc_positional_dropout_rate: 0.2
279
+ transformer_enc_attn_dropout_rate: 0.2
280
+ transformer_dec_dropout_rate: 0.2
281
+ transformer_dec_positional_dropout_rate: 0.2
282
+ transformer_dec_attn_dropout_rate: 0.2
283
+ pitch_predictor_layers: 5
284
+ pitch_predictor_chans: 256
285
+ pitch_predictor_kernel_size: 5
286
+ pitch_predictor_dropout: 0.5
287
+ pitch_embed_kernel_size: 1
288
+ pitch_embed_dropout: 0.0
289
+ stop_gradient_from_pitch_predictor: true
290
+ energy_predictor_layers: 2
291
+ energy_predictor_chans: 256
292
+ energy_predictor_kernel_size: 3
293
+ energy_predictor_dropout: 0.5
294
+ energy_embed_kernel_size: 1
295
+ energy_embed_dropout: 0.0
296
+ stop_gradient_from_energy_predictor: false
297
+ generator_out_channels: 1
298
+ generator_channels: 512
299
+ generator_global_channels: -1
300
+ generator_kernel_size: 7
301
+ generator_upsample_scales:
302
+ - 8
303
+ - 8
304
+ - 2
305
+ - 2
306
+ generator_upsample_kernel_sizes:
307
+ - 16
308
+ - 16
309
+ - 4
310
+ - 4
311
+ generator_resblock_kernel_sizes:
312
+ - 3
313
+ - 7
314
+ - 11
315
+ generator_resblock_dilations:
316
+ - - 1
317
+ - 3
318
+ - 5
319
+ - - 1
320
+ - 3
321
+ - 5
322
+ - - 1
323
+ - 3
324
+ - 5
325
+ generator_use_additional_convs: true
326
+ generator_bias: true
327
+ generator_nonlinear_activation: LeakyReLU
328
+ generator_nonlinear_activation_params:
329
+ negative_slope: 0.1
330
+ generator_use_weight_norm: true
331
+ segment_size: 32
332
+ idim: 78
333
+ odim: 80
334
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
335
+ discriminator_params:
336
+ scales: 1
337
+ scale_downsample_pooling: AvgPool1d
338
+ scale_downsample_pooling_params:
339
+ kernel_size: 4
340
+ stride: 2
341
+ padding: 2
342
+ scale_discriminator_params:
343
+ in_channels: 1
344
+ out_channels: 1
345
+ kernel_sizes:
346
+ - 15
347
+ - 41
348
+ - 5
349
+ - 3
350
+ channels: 128
351
+ max_downsample_channels: 1024
352
+ max_groups: 16
353
+ bias: true
354
+ downsample_scales:
355
+ - 2
356
+ - 2
357
+ - 4
358
+ - 4
359
+ - 1
360
+ nonlinear_activation: LeakyReLU
361
+ nonlinear_activation_params:
362
+ negative_slope: 0.1
363
+ use_weight_norm: true
364
+ use_spectral_norm: false
365
+ follow_official_norm: false
366
+ periods:
367
+ - 2
368
+ - 3
369
+ - 5
370
+ - 7
371
+ - 11
372
+ period_discriminator_params:
373
+ in_channels: 1
374
+ out_channels: 1
375
+ kernel_sizes:
376
+ - 5
377
+ - 3
378
+ channels: 32
379
+ downsample_scales:
380
+ - 3
381
+ - 3
382
+ - 3
383
+ - 3
384
+ - 1
385
+ max_downsample_channels: 1024
386
+ bias: true
387
+ nonlinear_activation: LeakyReLU
388
+ nonlinear_activation_params:
389
+ negative_slope: 0.1
390
+ use_weight_norm: true
391
+ use_spectral_norm: false
392
+ generator_adv_loss_params:
393
+ average_by_discriminators: false
394
+ loss_type: mse
395
+ discriminator_adv_loss_params:
396
+ average_by_discriminators: false
397
+ loss_type: mse
398
+ feat_match_loss_params:
399
+ average_by_discriminators: false
400
+ average_by_layers: false
401
+ include_final_outputs: true
402
+ mel_loss_params:
403
+ fs: 22050
404
+ n_fft: 1024
405
+ hop_length: 256
406
+ win_length: null
407
+ window: hann
408
+ n_mels: 80
409
+ fmin: 0
410
+ fmax: null
411
+ log_base: null
412
+ lambda_adv: 1.0
413
+ lambda_mel: 45.0
414
+ lambda_feat_match: 2.0
415
+ lambda_var: 1.0
416
+ lambda_align: 1.0
417
+ sampling_rate: 22050
418
+ cache_generator_outputs: true
419
+ pitch_extract: dio
420
+ pitch_extract_conf:
421
+ reduction_factor: 1
422
+ use_token_averaged_f0: false
423
+ fs: 22050
424
+ n_fft: 1024
425
+ hop_length: 256
426
+ f0max: 400
427
+ f0min: 80
428
+ pitch_normalize: global_mvn
429
+ pitch_normalize_conf:
430
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz
431
+ energy_extract: energy
432
+ energy_extract_conf:
433
+ reduction_factor: 1
434
+ use_token_averaged_energy: false
435
+ fs: 22050
436
+ n_fft: 1024
437
+ hop_length: 256
438
+ win_length: null
439
+ energy_normalize: global_mvn
440
+ energy_normalize_conf:
441
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz
442
+ required:
443
+ - output_dir
444
+ - token_list
445
+ version: '202304'
446
+ distributed: true
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_bin_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_forwardsum_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_align_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_adv_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_feat_match_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_g_mel_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_dur_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_energy_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/generator_var_pitch_loss.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/images/train_time.png ADDED
exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f7a022291d366cff3ddc0d603c58d5702b6af559406387adedff64d4b209b29
3
+ size 333708411
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202304'
2
+ files:
3
+ model_file: exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_5best.pth
4
+ python: "3.8.16 (default, Jun 12 2023, 18:09:05) \n[GCC 11.2.0]"
5
+ timestamp: 1688808213.794778
6
+ torch: 1.11.0
7
+ yaml_files:
8
+ train_config: exp/tts_train_jets_raw_phn_tacotron_g2p_en_no_space/config.yaml