viks66 committed on
Commit
c7839a8
1 Parent(s): 1a8baad

Update model

Browse files
Files changed (27) hide show
  1. README.md +486 -0
  2. dump/22k/raw/org/tr_no_dev/spk2sid +11 -0
  3. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml +403 -0
  4. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png +0 -0
  5. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png +0 -0
  6. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png +0 -0
  7. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png +0 -0
  8. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png +0 -0
  9. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png +0 -0
  10. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png +0 -0
  11. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_adv_loss.png +0 -0
  12. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png +0 -0
  13. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_dur_loss.png +0 -0
  14. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_feat_match_loss.png +0 -0
  15. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png +0 -0
  16. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_kl_loss.png +0 -0
  17. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png +0 -0
  18. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_mel_loss.png +0 -0
  19. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png +0 -0
  20. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png +0 -0
  21. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png +0 -0
  22. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png +0 -0
  23. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png +0 -0
  24. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png +0 -0
  25. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/train_time.png +0 -0
  26. exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/train.loss.ave.pth +3 -0
  27. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - hifitts
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `jes3275/hifitts_vits_multispeaker_22.05k`
15
+
16
+ This model was trained by bloodraven66 using the hifitts recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout d0047402e830a3c53e8b590064af4bf70415fb3b
26
+ pip install -e .
27
+ cd egs2/hifitts/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model jes3275/hifitts_vits_multispeaker_22.05k
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: ./conf/tuning/train_multi_spk_vits.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 4
49
+ num_att_plot: 3
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: 2
53
+ dist_rank: 0
54
+ local_rank: 0
55
+ dist_master_addr: localhost
56
+ dist_master_port: 43797
57
+ dist_launcher: null
58
+ multiprocessing_distributed: true
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 3000
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - train
77
+ - total_count
78
+ - max
79
+ keep_nbest_models: 10
80
+ nbest_averaging_interval: 0
81
+ grad_clip: -1
82
+ grad_clip_type: 2.0
83
+ grad_noise: false
84
+ accum_grad: 1
85
+ no_forward_run: false
86
+ resume: true
87
+ train_dtype: float32
88
+ use_amp: false
89
+ log_interval: 50
90
+ use_matplotlib: true
91
+ use_tensorboard: true
92
+ create_graph_in_tensorboard: false
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ use_adapter: false
101
+ adapter: lora
102
+ save_strategy: all
103
+ adapter_conf: {}
104
+ pretrain_path: null
105
+ init_param: []
106
+ ignore_init_mismatch: false
107
+ freeze_param: []
108
+ num_iters_per_epoch: 1000
109
+ batch_size: 20
110
+ valid_batch_size: null
111
+ batch_bins: 1000000
112
+ valid_batch_bins: null
113
+ train_shape_file:
114
+ - exp/22k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/text_shape.phn
115
+ - exp/22k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/speech_shape
116
+ valid_shape_file:
117
+ - exp/22k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
118
+ - exp/22k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/speech_shape
119
+ batch_type: numel
120
+ valid_batch_type: null
121
+ fold_length:
122
+ - 150
123
+ - 204800
124
+ sort_in_batch: descending
125
+ shuffle_within_batch: false
126
+ sort_batch: descending
127
+ multiple_iterator: false
128
+ chunk_length: 500
129
+ chunk_shift_ratio: 0.5
130
+ num_cache_chunks: 1024
131
+ chunk_excluded_key_prefixes: []
132
+ chunk_default_fs: null
133
+ train_data_path_and_name_and_type:
134
+ - - dump/22k/raw/tr_no_dev/text
135
+ - text
136
+ - text
137
+ - - dump/22k/raw/tr_no_dev/wav.scp
138
+ - speech
139
+ - sound
140
+ - - dump/22k/raw/tr_no_dev/utt2sid
141
+ - sids
142
+ - text_int
143
+ valid_data_path_and_name_and_type:
144
+ - - dump/22k/raw/dev/text
145
+ - text
146
+ - text
147
+ - - dump/22k/raw/dev/wav.scp
148
+ - speech
149
+ - sound
150
+ - - dump/22k/raw/dev/utt2sid
151
+ - sids
152
+ - text_int
153
+ allow_variable_data_keys: false
154
+ max_cache_size: 0.0
155
+ max_cache_fd: 32
156
+ allow_multi_rates: false
157
+ valid_max_cache_size: null
158
+ exclude_weight_decay: false
159
+ exclude_weight_decay_conf: {}
160
+ optim: adamw
161
+ optim_conf:
162
+ lr: 0.0002
163
+ betas:
164
+ - 0.8
165
+ - 0.99
166
+ eps: 1.0e-09
167
+ weight_decay: 0.0
168
+ scheduler: exponentiallr
169
+ scheduler_conf:
170
+ gamma: 0.999875
171
+ optim2: adamw
172
+ optim2_conf:
173
+ lr: 0.0002
174
+ betas:
175
+ - 0.8
176
+ - 0.99
177
+ eps: 1.0e-09
178
+ weight_decay: 0.0
179
+ scheduler2: exponentiallr
180
+ scheduler2_conf:
181
+ gamma: 0.999875
182
+ generator_first: false
183
+ token_list:
184
+ - <blank>
185
+ - <unk>
186
+ - AH0
187
+ - N
188
+ - T
189
+ - D
190
+ - S
191
+ - R
192
+ - L
193
+ - IH1
194
+ - DH
195
+ - M
196
+ - EH1
197
+ - Z
198
+ - AE1
199
+ - K
200
+ - IH0
201
+ - AH1
202
+ - HH
203
+ - W
204
+ - ER0
205
+ - V
206
+ - IY1
207
+ - F
208
+ - UW1
209
+ - P
210
+ - AY1
211
+ - B
212
+ - AA1
213
+ - AO1
214
+ - EY1
215
+ - IY0
216
+ - OW1
217
+ - NG
218
+ - SH
219
+ - G
220
+ - Y
221
+ - AW1
222
+ - CH
223
+ - ER1
224
+ - UH1
225
+ - TH
226
+ - JH
227
+ - OW0
228
+ - EH2
229
+ - IH2
230
+ - OY1
231
+ - EY2
232
+ - AY2
233
+ - EH0
234
+ - UW0
235
+ - AA2
236
+ - AE2
237
+ - OW2
238
+ - AA0
239
+ - AH2
240
+ - ZH
241
+ - AE0
242
+ - AO2
243
+ - UW2
244
+ - AO0
245
+ - AY0
246
+ - IY2
247
+ - AW2
248
+ - UH2
249
+ - EY0
250
+ - ER2
251
+ - AW0
252
+ - UH0
253
+ - OY2
254
+ - OY0
255
+ - ''''
256
+ - <sos/eos>
257
+ odim: null
258
+ model_conf: {}
259
+ use_preprocessor: true
260
+ token_type: phn
261
+ bpemodel: null
262
+ non_linguistic_symbols: null
263
+ cleaner: tacotron
264
+ g2p: g2p_en_no_space
265
+ feats_extract: linear_spectrogram
266
+ feats_extract_conf:
267
+ n_fft: 1024
268
+ hop_length: 256
269
+ win_length: null
270
+ normalize: null
271
+ normalize_conf: {}
272
+ tts: vits
273
+ tts_conf:
274
+ generator_type: vits_generator
275
+ generator_params:
276
+ hidden_channels: 192
277
+ spks: 128
278
+ global_channels: 256
279
+ segment_size: 32
280
+ text_encoder_attention_heads: 2
281
+ text_encoder_ffn_expand: 4
282
+ text_encoder_blocks: 6
283
+ text_encoder_positionwise_layer_type: conv1d
284
+ text_encoder_positionwise_conv_kernel_size: 3
285
+ text_encoder_positional_encoding_layer_type: rel_pos
286
+ text_encoder_self_attention_layer_type: rel_selfattn
287
+ text_encoder_activation_type: swish
288
+ text_encoder_normalize_before: true
289
+ text_encoder_dropout_rate: 0.1
290
+ text_encoder_positional_dropout_rate: 0.0
291
+ text_encoder_attention_dropout_rate: 0.1
292
+ use_macaron_style_in_text_encoder: true
293
+ use_conformer_conv_in_text_encoder: false
294
+ text_encoder_conformer_kernel_size: -1
295
+ decoder_kernel_size: 7
296
+ decoder_channels: 512
297
+ decoder_upsample_scales:
298
+ - 8
299
+ - 8
300
+ - 2
301
+ - 2
302
+ decoder_upsample_kernel_sizes:
303
+ - 16
304
+ - 16
305
+ - 4
306
+ - 4
307
+ decoder_resblock_kernel_sizes:
308
+ - 3
309
+ - 7
310
+ - 11
311
+ decoder_resblock_dilations:
312
+ - - 1
313
+ - 3
314
+ - 5
315
+ - - 1
316
+ - 3
317
+ - 5
318
+ - - 1
319
+ - 3
320
+ - 5
321
+ use_weight_norm_in_decoder: true
322
+ posterior_encoder_kernel_size: 5
323
+ posterior_encoder_layers: 16
324
+ posterior_encoder_stacks: 1
325
+ posterior_encoder_base_dilation: 1
326
+ posterior_encoder_dropout_rate: 0.0
327
+ use_weight_norm_in_posterior_encoder: true
328
+ flow_flows: 4
329
+ flow_kernel_size: 5
330
+ flow_base_dilation: 1
331
+ flow_layers: 4
332
+ flow_dropout_rate: 0.0
333
+ use_weight_norm_in_flow: true
334
+ use_only_mean_in_flow: true
335
+ stochastic_duration_predictor_kernel_size: 3
336
+ stochastic_duration_predictor_dropout_rate: 0.5
337
+ stochastic_duration_predictor_flows: 4
338
+ stochastic_duration_predictor_dds_conv_layers: 3
339
+ vocabs: 73
340
+ aux_channels: 513
341
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
342
+ discriminator_params:
343
+ scales: 1
344
+ scale_downsample_pooling: AvgPool1d
345
+ scale_downsample_pooling_params:
346
+ kernel_size: 4
347
+ stride: 2
348
+ padding: 2
349
+ scale_discriminator_params:
350
+ in_channels: 1
351
+ out_channels: 1
352
+ kernel_sizes:
353
+ - 15
354
+ - 41
355
+ - 5
356
+ - 3
357
+ channels: 128
358
+ max_downsample_channels: 1024
359
+ max_groups: 16
360
+ bias: true
361
+ downsample_scales:
362
+ - 2
363
+ - 2
364
+ - 4
365
+ - 4
366
+ - 1
367
+ nonlinear_activation: LeakyReLU
368
+ nonlinear_activation_params:
369
+ negative_slope: 0.1
370
+ use_weight_norm: true
371
+ use_spectral_norm: false
372
+ follow_official_norm: false
373
+ periods:
374
+ - 2
375
+ - 3
376
+ - 5
377
+ - 7
378
+ - 11
379
+ period_discriminator_params:
380
+ in_channels: 1
381
+ out_channels: 1
382
+ kernel_sizes:
383
+ - 5
384
+ - 3
385
+ channels: 32
386
+ downsample_scales:
387
+ - 3
388
+ - 3
389
+ - 3
390
+ - 3
391
+ - 1
392
+ max_downsample_channels: 1024
393
+ bias: true
394
+ nonlinear_activation: LeakyReLU
395
+ nonlinear_activation_params:
396
+ negative_slope: 0.1
397
+ use_weight_norm: true
398
+ use_spectral_norm: false
399
+ generator_adv_loss_params:
400
+ average_by_discriminators: false
401
+ loss_type: mse
402
+ discriminator_adv_loss_params:
403
+ average_by_discriminators: false
404
+ loss_type: mse
405
+ feat_match_loss_params:
406
+ average_by_discriminators: false
407
+ average_by_layers: false
408
+ include_final_outputs: true
409
+ mel_loss_params:
410
+ fs: 22050
411
+ n_fft: 1024
412
+ hop_length: 256
413
+ win_length: null
414
+ window: hann
415
+ n_mels: 80
416
+ fmin: 0
417
+ fmax: null
418
+ log_base: null
419
+ lambda_adv: 1.0
420
+ lambda_mel: 45.0
421
+ lambda_feat_match: 2.0
422
+ lambda_dur: 1.0
423
+ lambda_kl: 1.0
424
+ sampling_rate: 22050
425
+ cache_generator_outputs: true
426
+ plot_pred_mos: false
427
+ mos_pred_tool: utmos
428
+ pitch_extract: null
429
+ pitch_extract_conf: {}
430
+ pitch_normalize: null
431
+ pitch_normalize_conf: {}
432
+ energy_extract: null
433
+ energy_extract_conf: {}
434
+ energy_normalize: null
435
+ energy_normalize_conf: {}
436
+ required:
437
+ - output_dir
438
+ - token_list
439
+ version: '202402'
440
+ distributed: true
441
+ ```
442
+
443
+ </details>
444
+
445
+
446
+
447
+ ### Citing ESPnet
448
+
449
+ ```bibtex
450
+ @inproceedings{watanabe2018espnet,
451
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
452
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
453
+ year={2018},
454
+ booktitle={Proceedings of Interspeech},
455
+ pages={2207--2211},
456
+ doi={10.21437/Interspeech.2018-1456},
457
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
458
+ }
459
+
460
+
461
+
462
+
463
+ @inproceedings{hayashi2020espnet,
464
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
465
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
466
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
467
+ pages={7654--7658},
468
+ year={2020},
469
+ organization={IEEE}
470
+ }
471
+
472
+
473
+ ```
474
+
475
+ or arXiv:
476
+
477
+ ```bibtex
478
+ @misc{watanabe2018espnet,
479
+ title={ESPnet: End-to-End Speech Processing Toolkit},
480
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
481
+ year={2018},
482
+ eprint={1804.00015},
483
+ archivePrefix={arXiv},
484
+ primaryClass={cs.CL}
485
+ }
486
+ ```
dump/22k/raw/org/tr_no_dev/spk2sid ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <unk> 0
2
+ 11614 1
3
+ 11697 2
4
+ 12787 3
5
+ 6097 4
6
+ 6670 5
7
+ 6671 6
8
+ 8051 7
9
+ 9017 8
10
+ 9136 9
11
+ 92 10
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_multi_spk_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 4
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 2
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 43797
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 3000
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_adapter: false
64
+ adapter: lora
65
+ save_strategy: all
66
+ adapter_conf: {}
67
+ pretrain_path: null
68
+ init_param: []
69
+ ignore_init_mismatch: false
70
+ freeze_param: []
71
+ num_iters_per_epoch: 1000
72
+ batch_size: 20
73
+ valid_batch_size: null
74
+ batch_bins: 1000000
75
+ valid_batch_bins: null
76
+ train_shape_file:
77
+ - exp/22k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/text_shape.phn
78
+ - exp/22k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/train/speech_shape
79
+ valid_shape_file:
80
+ - exp/22k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
81
+ - exp/22k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en_no_space/valid/speech_shape
82
+ batch_type: numel
83
+ valid_batch_type: null
84
+ fold_length:
85
+ - 150
86
+ - 204800
87
+ sort_in_batch: descending
88
+ shuffle_within_batch: false
89
+ sort_batch: descending
90
+ multiple_iterator: false
91
+ chunk_length: 500
92
+ chunk_shift_ratio: 0.5
93
+ num_cache_chunks: 1024
94
+ chunk_excluded_key_prefixes: []
95
+ chunk_default_fs: null
96
+ train_data_path_and_name_and_type:
97
+ - - dump/22k/raw/tr_no_dev/text
98
+ - text
99
+ - text
100
+ - - dump/22k/raw/tr_no_dev/wav.scp
101
+ - speech
102
+ - sound
103
+ - - dump/22k/raw/tr_no_dev/utt2sid
104
+ - sids
105
+ - text_int
106
+ valid_data_path_and_name_and_type:
107
+ - - dump/22k/raw/dev/text
108
+ - text
109
+ - text
110
+ - - dump/22k/raw/dev/wav.scp
111
+ - speech
112
+ - sound
113
+ - - dump/22k/raw/dev/utt2sid
114
+ - sids
115
+ - text_int
116
+ allow_variable_data_keys: false
117
+ max_cache_size: 0.0
118
+ max_cache_fd: 32
119
+ allow_multi_rates: false
120
+ valid_max_cache_size: null
121
+ exclude_weight_decay: false
122
+ exclude_weight_decay_conf: {}
123
+ optim: adamw
124
+ optim_conf:
125
+ lr: 0.0002
126
+ betas:
127
+ - 0.8
128
+ - 0.99
129
+ eps: 1.0e-09
130
+ weight_decay: 0.0
131
+ scheduler: exponentiallr
132
+ scheduler_conf:
133
+ gamma: 0.999875
134
+ optim2: adamw
135
+ optim2_conf:
136
+ lr: 0.0002
137
+ betas:
138
+ - 0.8
139
+ - 0.99
140
+ eps: 1.0e-09
141
+ weight_decay: 0.0
142
+ scheduler2: exponentiallr
143
+ scheduler2_conf:
144
+ gamma: 0.999875
145
+ generator_first: false
146
+ token_list:
147
+ - <blank>
148
+ - <unk>
149
+ - AH0
150
+ - N
151
+ - T
152
+ - D
153
+ - S
154
+ - R
155
+ - L
156
+ - IH1
157
+ - DH
158
+ - M
159
+ - EH1
160
+ - Z
161
+ - AE1
162
+ - K
163
+ - IH0
164
+ - AH1
165
+ - HH
166
+ - W
167
+ - ER0
168
+ - V
169
+ - IY1
170
+ - F
171
+ - UW1
172
+ - P
173
+ - AY1
174
+ - B
175
+ - AA1
176
+ - AO1
177
+ - EY1
178
+ - IY0
179
+ - OW1
180
+ - NG
181
+ - SH
182
+ - G
183
+ - Y
184
+ - AW1
185
+ - CH
186
+ - ER1
187
+ - UH1
188
+ - TH
189
+ - JH
190
+ - OW0
191
+ - EH2
192
+ - IH2
193
+ - OY1
194
+ - EY2
195
+ - AY2
196
+ - EH0
197
+ - UW0
198
+ - AA2
199
+ - AE2
200
+ - OW2
201
+ - AA0
202
+ - AH2
203
+ - ZH
204
+ - AE0
205
+ - AO2
206
+ - UW2
207
+ - AO0
208
+ - AY0
209
+ - IY2
210
+ - AW2
211
+ - UH2
212
+ - EY0
213
+ - ER2
214
+ - AW0
215
+ - UH0
216
+ - OY2
217
+ - OY0
218
+ - ''''
219
+ - <sos/eos>
220
+ odim: null
221
+ model_conf: {}
222
+ use_preprocessor: true
223
+ token_type: phn
224
+ bpemodel: null
225
+ non_linguistic_symbols: null
226
+ cleaner: tacotron
227
+ g2p: g2p_en_no_space
228
+ feats_extract: linear_spectrogram
229
+ feats_extract_conf:
230
+ n_fft: 1024
231
+ hop_length: 256
232
+ win_length: null
233
+ normalize: null
234
+ normalize_conf: {}
235
+ tts: vits
236
+ tts_conf:
237
+ generator_type: vits_generator
238
+ generator_params:
239
+ hidden_channels: 192
240
+ spks: 128
241
+ global_channels: 256
242
+ segment_size: 32
243
+ text_encoder_attention_heads: 2
244
+ text_encoder_ffn_expand: 4
245
+ text_encoder_blocks: 6
246
+ text_encoder_positionwise_layer_type: conv1d
247
+ text_encoder_positionwise_conv_kernel_size: 3
248
+ text_encoder_positional_encoding_layer_type: rel_pos
249
+ text_encoder_self_attention_layer_type: rel_selfattn
250
+ text_encoder_activation_type: swish
251
+ text_encoder_normalize_before: true
252
+ text_encoder_dropout_rate: 0.1
253
+ text_encoder_positional_dropout_rate: 0.0
254
+ text_encoder_attention_dropout_rate: 0.1
255
+ use_macaron_style_in_text_encoder: true
256
+ use_conformer_conv_in_text_encoder: false
257
+ text_encoder_conformer_kernel_size: -1
258
+ decoder_kernel_size: 7
259
+ decoder_channels: 512
260
+ decoder_upsample_scales:
261
+ - 8
262
+ - 8
263
+ - 2
264
+ - 2
265
+ decoder_upsample_kernel_sizes:
266
+ - 16
267
+ - 16
268
+ - 4
269
+ - 4
270
+ decoder_resblock_kernel_sizes:
271
+ - 3
272
+ - 7
273
+ - 11
274
+ decoder_resblock_dilations:
275
+ - - 1
276
+ - 3
277
+ - 5
278
+ - - 1
279
+ - 3
280
+ - 5
281
+ - - 1
282
+ - 3
283
+ - 5
284
+ use_weight_norm_in_decoder: true
285
+ posterior_encoder_kernel_size: 5
286
+ posterior_encoder_layers: 16
287
+ posterior_encoder_stacks: 1
288
+ posterior_encoder_base_dilation: 1
289
+ posterior_encoder_dropout_rate: 0.0
290
+ use_weight_norm_in_posterior_encoder: true
291
+ flow_flows: 4
292
+ flow_kernel_size: 5
293
+ flow_base_dilation: 1
294
+ flow_layers: 4
295
+ flow_dropout_rate: 0.0
296
+ use_weight_norm_in_flow: true
297
+ use_only_mean_in_flow: true
298
+ stochastic_duration_predictor_kernel_size: 3
299
+ stochastic_duration_predictor_dropout_rate: 0.5
300
+ stochastic_duration_predictor_flows: 4
301
+ stochastic_duration_predictor_dds_conv_layers: 3
302
+ vocabs: 73
303
+ aux_channels: 513
304
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
305
+ discriminator_params:
306
+ scales: 1
307
+ scale_downsample_pooling: AvgPool1d
308
+ scale_downsample_pooling_params:
309
+ kernel_size: 4
310
+ stride: 2
311
+ padding: 2
312
+ scale_discriminator_params:
313
+ in_channels: 1
314
+ out_channels: 1
315
+ kernel_sizes:
316
+ - 15
317
+ - 41
318
+ - 5
319
+ - 3
320
+ channels: 128
321
+ max_downsample_channels: 1024
322
+ max_groups: 16
323
+ bias: true
324
+ downsample_scales:
325
+ - 2
326
+ - 2
327
+ - 4
328
+ - 4
329
+ - 1
330
+ nonlinear_activation: LeakyReLU
331
+ nonlinear_activation_params:
332
+ negative_slope: 0.1
333
+ use_weight_norm: true
334
+ use_spectral_norm: false
335
+ follow_official_norm: false
336
+ periods:
337
+ - 2
338
+ - 3
339
+ - 5
340
+ - 7
341
+ - 11
342
+ period_discriminator_params:
343
+ in_channels: 1
344
+ out_channels: 1
345
+ kernel_sizes:
346
+ - 5
347
+ - 3
348
+ channels: 32
349
+ downsample_scales:
350
+ - 3
351
+ - 3
352
+ - 3
353
+ - 3
354
+ - 1
355
+ max_downsample_channels: 1024
356
+ bias: true
357
+ nonlinear_activation: LeakyReLU
358
+ nonlinear_activation_params:
359
+ negative_slope: 0.1
360
+ use_weight_norm: true
361
+ use_spectral_norm: false
362
+ generator_adv_loss_params:
363
+ average_by_discriminators: false
364
+ loss_type: mse
365
+ discriminator_adv_loss_params:
366
+ average_by_discriminators: false
367
+ loss_type: mse
368
+ feat_match_loss_params:
369
+ average_by_discriminators: false
370
+ average_by_layers: false
371
+ include_final_outputs: true
372
+ mel_loss_params:
373
+ fs: 22050
374
+ n_fft: 1024
375
+ hop_length: 256
376
+ win_length: null
377
+ window: hann
378
+ n_mels: 80
379
+ fmin: 0
380
+ fmax: null
381
+ log_base: null
382
+ lambda_adv: 1.0
383
+ lambda_mel: 45.0
384
+ lambda_feat_match: 2.0
385
+ lambda_dur: 1.0
386
+ lambda_kl: 1.0
387
+ sampling_rate: 22050
388
+ cache_generator_outputs: true
389
+ plot_pred_mos: false
390
+ mos_pred_tool: utmos
391
+ pitch_extract: null
392
+ pitch_extract_conf: {}
393
+ pitch_normalize: null
394
+ pitch_normalize_conf: {}
395
+ energy_extract: null
396
+ energy_extract_conf: {}
397
+ energy_normalize: null
398
+ energy_normalize_conf: {}
399
+ required:
400
+ - output_dir
401
+ - token_list
402
+ version: '202402'
403
+ distributed: true
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_adv_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_dur_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_feat_match_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_kl_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_mel_loss.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/train_time.png ADDED
exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/train.loss.ave.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb8f668af21bd4e48c92f8cc8ccbf12526742b6c885b136ecb6c7f9f6ca3a8c0
3
+ size 386086080
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202402'
2
+ files:
3
+ model_file: exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/train.loss.ave.pth
4
+ python: "3.9.18 (main, Aug 25 2023, 13:20:04) \n[GCC 9.4.0]"
5
+ timestamp: 1716368677.384559
6
+ torch: 2.2.1+cu118
7
+ yaml_files:
8
+ train_config: exp/22k/tts_train_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml