fujie committed on
Commit
72960a9
1 Parent(s): ffb5551

Update model

Browse files
Files changed (26) hide show
  1. README.md +446 -0
  2. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/config.yaml +368 -0
  3. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_backward_time.png +0 -0
  4. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_fake_loss.png +0 -0
  5. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_forward_time.png +0 -0
  6. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_loss.png +0 -0
  7. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_optim_step_time.png +0 -0
  8. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_real_loss.png +0 -0
  9. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_train_time.png +0 -0
  10. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_adv_loss.png +0 -0
  11. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_backward_time.png +0 -0
  12. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_dur_loss.png +0 -0
  13. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_feat_match_loss.png +0 -0
  14. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_forward_time.png +0 -0
  15. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_kl_loss.png +0 -0
  16. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_loss.png +0 -0
  17. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_mel_loss.png +0 -0
  18. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_optim_step_time.png +0 -0
  19. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_train_time.png +0 -0
  20. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/gpu_max_cached_mem_GB.png +0 -0
  21. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/iter_time.png +0 -0
  22. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/optim0_lr0.png +0 -0
  23. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/optim1_lr0.png +0 -0
  24. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/train_time.png +0 -0
  25. exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth +3 -0
  26. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,449 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - jvs_ms
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `fujie/fujie_jvs_ms_tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody`
15
+
16
+ This model was trained by Shinya Fujie using the jvs_ms recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 1a13976db1013601dc5e70dddd47b622aaac614f
26
+ pip install -e .
27
+ cd egs2/jvs_ms/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model fujie/fujie_jvs_ms_tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: ./conf/tuning/finetune_vits.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody
44
+ ngpu: 1
45
+ seed: 777
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 2
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 45333
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: false
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 100
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - train
75
+ - total_count
76
+ - max
77
+ keep_nbest_models: 10
78
+ nbest_averaging_interval: 0
79
+ grad_clip: -1
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: 50
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ create_graph_in_tensorboard: false
91
+ use_wandb: false
92
+ wandb_project: null
93
+ wandb_id: null
94
+ wandb_entity: null
95
+ wandb_name: null
96
+ wandb_model_log_interval: -1
97
+ detect_anomaly: false
98
+ pretrain_path: null
99
+ init_param:
100
+ - downloads/snapshots/3a859bfd2c9710846fa6244598000f0578a2d3e4/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth
101
+ ignore_init_mismatch: false
102
+ freeze_param: []
103
+ num_iters_per_epoch: 1000
104
+ batch_size: 20
105
+ valid_batch_size: null
106
+ batch_bins: 1000000
107
+ valid_batch_bins: null
108
+ train_shape_file:
109
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/train/text_shape.phn
110
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/train/speech_shape
111
+ valid_shape_file:
112
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/valid/text_shape.phn
113
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/valid/speech_shape
114
+ batch_type: numel
115
+ valid_batch_type: null
116
+ fold_length:
117
+ - 150
118
+ - 204800
119
+ sort_in_batch: descending
120
+ sort_batch: descending
121
+ multiple_iterator: false
122
+ chunk_length: 500
123
+ chunk_shift_ratio: 0.5
124
+ num_cache_chunks: 1024
125
+ chunk_excluded_key_prefixes: []
126
+ train_data_path_and_name_and_type:
127
+ - - dump/22k/raw/tr_no_dev/text
128
+ - text
129
+ - text
130
+ - - dump/22k/raw/tr_no_dev/wav.scp
131
+ - speech
132
+ - sound
133
+ - - dump/22k/xvector/tr_no_dev/xvector.scp
134
+ - spembs
135
+ - kaldi_ark
136
+ valid_data_path_and_name_and_type:
137
+ - - dump/22k/raw/dev/text
138
+ - text
139
+ - text
140
+ - - dump/22k/raw/dev/wav.scp
141
+ - speech
142
+ - sound
143
+ - - dump/22k/xvector/dev/xvector.scp
144
+ - spembs
145
+ - kaldi_ark
146
+ allow_variable_data_keys: false
147
+ max_cache_size: 0.0
148
+ max_cache_fd: 32
149
+ valid_max_cache_size: null
150
+ exclude_weight_decay: false
151
+ exclude_weight_decay_conf: {}
152
+ optim: adamw
153
+ optim_conf:
154
+ lr: 0.0001
155
+ betas:
156
+ - 0.8
157
+ - 0.99
158
+ eps: 1.0e-09
159
+ weight_decay: 0.0
160
+ scheduler: exponentiallr
161
+ scheduler_conf:
162
+ gamma: 0.999875
163
+ optim2: adamw
164
+ optim2_conf:
165
+ lr: 0.0001
166
+ betas:
167
+ - 0.8
168
+ - 0.99
169
+ eps: 1.0e-09
170
+ weight_decay: 0.0
171
+ scheduler2: exponentiallr
172
+ scheduler2_conf:
173
+ gamma: 0.999875
174
+ generator_first: false
175
+ token_list:
176
+ - <blank>
177
+ - <unk>
178
+ - a
179
+ - o
180
+ - i
181
+ - '['
182
+ - '#'
183
+ - u
184
+ - ']'
185
+ - e
186
+ - k
187
+ - n
188
+ - t
189
+ - r
190
+ - s
191
+ - N
192
+ - m
193
+ - _
194
+ - sh
195
+ - d
196
+ - g
197
+ - ^
198
+ - $
199
+ - w
200
+ - cl
201
+ - h
202
+ - y
203
+ - b
204
+ - j
205
+ - ts
206
+ - ch
207
+ - z
208
+ - p
209
+ - f
210
+ - ky
211
+ - ry
212
+ - gy
213
+ - hy
214
+ - ny
215
+ - by
216
+ - my
217
+ - py
218
+ - v
219
+ - dy
220
+ - '?'
221
+ - ty
222
+ - <sos/eos>
223
+ odim: null
224
+ model_conf: {}
225
+ use_preprocessor: true
226
+ token_type: phn
227
+ bpemodel: null
228
+ non_linguistic_symbols: null
229
+ cleaner: jaconv
230
+ g2p: pyopenjtalk_prosody
231
+ feats_extract: linear_spectrogram
232
+ feats_extract_conf:
233
+ n_fft: 1024
234
+ hop_length: 256
235
+ win_length: null
236
+ normalize: null
237
+ normalize_conf: {}
238
+ tts: vits
239
+ tts_conf:
240
+ generator_type: vits_generator
241
+ generator_params:
242
+ hidden_channels: 192
243
+ spks: -1
244
+ global_channels: 256
245
+ segment_size: 32
246
+ text_encoder_attention_heads: 2
247
+ text_encoder_ffn_expand: 4
248
+ text_encoder_blocks: 6
249
+ text_encoder_positionwise_layer_type: conv1d
250
+ text_encoder_positionwise_conv_kernel_size: 3
251
+ text_encoder_positional_encoding_layer_type: rel_pos
252
+ text_encoder_self_attention_layer_type: rel_selfattn
253
+ text_encoder_activation_type: swish
254
+ text_encoder_normalize_before: true
255
+ text_encoder_dropout_rate: 0.1
256
+ text_encoder_positional_dropout_rate: 0.0
257
+ text_encoder_attention_dropout_rate: 0.1
258
+ use_macaron_style_in_text_encoder: true
259
+ use_conformer_conv_in_text_encoder: false
260
+ text_encoder_conformer_kernel_size: -1
261
+ decoder_kernel_size: 7
262
+ decoder_channels: 512
263
+ decoder_upsample_scales:
264
+ - 8
265
+ - 8
266
+ - 2
267
+ - 2
268
+ decoder_upsample_kernel_sizes:
269
+ - 16
270
+ - 16
271
+ - 4
272
+ - 4
273
+ decoder_resblock_kernel_sizes:
274
+ - 3
275
+ - 7
276
+ - 11
277
+ decoder_resblock_dilations:
278
+ - - 1
279
+ - 3
280
+ - 5
281
+ - - 1
282
+ - 3
283
+ - 5
284
+ - - 1
285
+ - 3
286
+ - 5
287
+ use_weight_norm_in_decoder: true
288
+ posterior_encoder_kernel_size: 5
289
+ posterior_encoder_layers: 16
290
+ posterior_encoder_stacks: 1
291
+ posterior_encoder_base_dilation: 1
292
+ posterior_encoder_dropout_rate: 0.0
293
+ use_weight_norm_in_posterior_encoder: true
294
+ flow_flows: 4
295
+ flow_kernel_size: 5
296
+ flow_base_dilation: 1
297
+ flow_layers: 4
298
+ flow_dropout_rate: 0.0
299
+ use_weight_norm_in_flow: true
300
+ use_only_mean_in_flow: true
301
+ stochastic_duration_predictor_kernel_size: 3
302
+ stochastic_duration_predictor_dropout_rate: 0.5
303
+ stochastic_duration_predictor_flows: 4
304
+ stochastic_duration_predictor_dds_conv_layers: 3
305
+ spk_embed_dim: 512
306
+ vocabs: 47
307
+ aux_channels: 513
308
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
309
+ discriminator_params:
310
+ scales: 1
311
+ scale_downsample_pooling: AvgPool1d
312
+ scale_downsample_pooling_params:
313
+ kernel_size: 4
314
+ stride: 2
315
+ padding: 2
316
+ scale_discriminator_params:
317
+ in_channels: 1
318
+ out_channels: 1
319
+ kernel_sizes:
320
+ - 15
321
+ - 41
322
+ - 5
323
+ - 3
324
+ channels: 128
325
+ max_downsample_channels: 1024
326
+ max_groups: 16
327
+ bias: true
328
+ downsample_scales:
329
+ - 2
330
+ - 2
331
+ - 4
332
+ - 4
333
+ - 1
334
+ nonlinear_activation: LeakyReLU
335
+ nonlinear_activation_params:
336
+ negative_slope: 0.1
337
+ use_weight_norm: true
338
+ use_spectral_norm: false
339
+ follow_official_norm: false
340
+ periods:
341
+ - 2
342
+ - 3
343
+ - 5
344
+ - 7
345
+ - 11
346
+ period_discriminator_params:
347
+ in_channels: 1
348
+ out_channels: 1
349
+ kernel_sizes:
350
+ - 5
351
+ - 3
352
+ channels: 32
353
+ downsample_scales:
354
+ - 3
355
+ - 3
356
+ - 3
357
+ - 3
358
+ - 1
359
+ max_downsample_channels: 1024
360
+ bias: true
361
+ nonlinear_activation: LeakyReLU
362
+ nonlinear_activation_params:
363
+ negative_slope: 0.1
364
+ use_weight_norm: true
365
+ use_spectral_norm: false
366
+ generator_adv_loss_params:
367
+ average_by_discriminators: false
368
+ loss_type: mse
369
+ discriminator_adv_loss_params:
370
+ average_by_discriminators: false
371
+ loss_type: mse
372
+ feat_match_loss_params:
373
+ average_by_discriminators: false
374
+ average_by_layers: false
375
+ include_final_outputs: true
376
+ mel_loss_params:
377
+ fs: 22050
378
+ n_fft: 1024
379
+ hop_length: 256
380
+ win_length: null
381
+ window: hann
382
+ n_mels: 80
383
+ fmin: 0
384
+ fmax: null
385
+ log_base: null
386
+ lambda_adv: 1.0
387
+ lambda_mel: 45.0
388
+ lambda_feat_match: 2.0
389
+ lambda_dur: 1.0
390
+ lambda_kl: 1.0
391
+ sampling_rate: 22050
392
+ cache_generator_outputs: true
393
+ pitch_extract: null
394
+ pitch_extract_conf: {}
395
+ pitch_normalize: null
396
+ pitch_normalize_conf: {}
397
+ energy_extract: null
398
+ energy_extract_conf: {}
399
+ energy_normalize: null
400
+ energy_normalize_conf: {}
401
+ required:
402
+ - output_dir
403
+ - token_list
404
+ version: '202304'
405
+ distributed: true
406
+ ```
407
+
408
+ </details>
409
+
410
+
411
+
412
+ ### Citing ESPnet
413
+
414
+ ```BibTex
415
+ @inproceedings{watanabe2018espnet,
416
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
417
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
418
+ year={2018},
419
+ booktitle={Proceedings of Interspeech},
420
+ pages={2207--2211},
421
+ doi={10.21437/Interspeech.2018-1456},
422
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
423
+ }
424
+
425
+
426
+
427
+
428
+ @inproceedings{hayashi2020espnet,
429
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
430
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
431
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
432
+ pages={7654--7658},
433
+ year={2020},
434
+ organization={IEEE}
435
+ }
436
+ ```
437
+
438
+ or arXiv:
439
+
440
+ ```bibtex
441
+ @misc{watanabe2018espnet,
442
+ title={ESPnet: End-to-End Speech Processing Toolkit},
443
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
444
+ year={2018},
445
+ eprint={1804.00015},
446
+ archivePrefix={arXiv},
447
+ primaryClass={cs.CL}
448
+ }
449
+ ```
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/config.yaml ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/finetune_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 45333
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param:
63
+ - downloads/snapshots/3a859bfd2c9710846fa6244598000f0578a2d3e4/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: 1000
67
+ batch_size: 20
68
+ valid_batch_size: null
69
+ batch_bins: 1000000
70
+ valid_batch_bins: null
71
+ train_shape_file:
72
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/train/text_shape.phn
73
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/train/speech_shape
74
+ valid_shape_file:
75
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/valid/text_shape.phn
76
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_prosody/valid/speech_shape
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length:
80
+ - 150
81
+ - 204800
82
+ sort_in_batch: descending
83
+ sort_batch: descending
84
+ multiple_iterator: false
85
+ chunk_length: 500
86
+ chunk_shift_ratio: 0.5
87
+ num_cache_chunks: 1024
88
+ chunk_excluded_key_prefixes: []
89
+ train_data_path_and_name_and_type:
90
+ - - dump/22k/raw/tr_no_dev/text
91
+ - text
92
+ - text
93
+ - - dump/22k/raw/tr_no_dev/wav.scp
94
+ - speech
95
+ - sound
96
+ - - dump/22k/xvector/tr_no_dev/xvector.scp
97
+ - spembs
98
+ - kaldi_ark
99
+ valid_data_path_and_name_and_type:
100
+ - - dump/22k/raw/dev/text
101
+ - text
102
+ - text
103
+ - - dump/22k/raw/dev/wav.scp
104
+ - speech
105
+ - sound
106
+ - - dump/22k/xvector/dev/xvector.scp
107
+ - spembs
108
+ - kaldi_ark
109
+ allow_variable_data_keys: false
110
+ max_cache_size: 0.0
111
+ max_cache_fd: 32
112
+ valid_max_cache_size: null
113
+ exclude_weight_decay: false
114
+ exclude_weight_decay_conf: {}
115
+ optim: adamw
116
+ optim_conf:
117
+ lr: 0.0001
118
+ betas:
119
+ - 0.8
120
+ - 0.99
121
+ eps: 1.0e-09
122
+ weight_decay: 0.0
123
+ scheduler: exponentiallr
124
+ scheduler_conf:
125
+ gamma: 0.999875
126
+ optim2: adamw
127
+ optim2_conf:
128
+ lr: 0.0001
129
+ betas:
130
+ - 0.8
131
+ - 0.99
132
+ eps: 1.0e-09
133
+ weight_decay: 0.0
134
+ scheduler2: exponentiallr
135
+ scheduler2_conf:
136
+ gamma: 0.999875
137
+ generator_first: false
138
+ token_list:
139
+ - <blank>
140
+ - <unk>
141
+ - a
142
+ - o
143
+ - i
144
+ - '['
145
+ - '#'
146
+ - u
147
+ - ']'
148
+ - e
149
+ - k
150
+ - n
151
+ - t
152
+ - r
153
+ - s
154
+ - N
155
+ - m
156
+ - _
157
+ - sh
158
+ - d
159
+ - g
160
+ - ^
161
+ - $
162
+ - w
163
+ - cl
164
+ - h
165
+ - y
166
+ - b
167
+ - j
168
+ - ts
169
+ - ch
170
+ - z
171
+ - p
172
+ - f
173
+ - ky
174
+ - ry
175
+ - gy
176
+ - hy
177
+ - ny
178
+ - by
179
+ - my
180
+ - py
181
+ - v
182
+ - dy
183
+ - '?'
184
+ - ty
185
+ - <sos/eos>
186
+ odim: null
187
+ model_conf: {}
188
+ use_preprocessor: true
189
+ token_type: phn
190
+ bpemodel: null
191
+ non_linguistic_symbols: null
192
+ cleaner: jaconv
193
+ g2p: pyopenjtalk_prosody
194
+ feats_extract: linear_spectrogram
195
+ feats_extract_conf:
196
+ n_fft: 1024
197
+ hop_length: 256
198
+ win_length: null
199
+ normalize: null
200
+ normalize_conf: {}
201
+ tts: vits
202
+ tts_conf:
203
+ generator_type: vits_generator
204
+ generator_params:
205
+ hidden_channels: 192
206
+ spks: -1
207
+ global_channels: 256
208
+ segment_size: 32
209
+ text_encoder_attention_heads: 2
210
+ text_encoder_ffn_expand: 4
211
+ text_encoder_blocks: 6
212
+ text_encoder_positionwise_layer_type: conv1d
213
+ text_encoder_positionwise_conv_kernel_size: 3
214
+ text_encoder_positional_encoding_layer_type: rel_pos
215
+ text_encoder_self_attention_layer_type: rel_selfattn
216
+ text_encoder_activation_type: swish
217
+ text_encoder_normalize_before: true
218
+ text_encoder_dropout_rate: 0.1
219
+ text_encoder_positional_dropout_rate: 0.0
220
+ text_encoder_attention_dropout_rate: 0.1
221
+ use_macaron_style_in_text_encoder: true
222
+ use_conformer_conv_in_text_encoder: false
223
+ text_encoder_conformer_kernel_size: -1
224
+ decoder_kernel_size: 7
225
+ decoder_channels: 512
226
+ decoder_upsample_scales:
227
+ - 8
228
+ - 8
229
+ - 2
230
+ - 2
231
+ decoder_upsample_kernel_sizes:
232
+ - 16
233
+ - 16
234
+ - 4
235
+ - 4
236
+ decoder_resblock_kernel_sizes:
237
+ - 3
238
+ - 7
239
+ - 11
240
+ decoder_resblock_dilations:
241
+ - - 1
242
+ - 3
243
+ - 5
244
+ - - 1
245
+ - 3
246
+ - 5
247
+ - - 1
248
+ - 3
249
+ - 5
250
+ use_weight_norm_in_decoder: true
251
+ posterior_encoder_kernel_size: 5
252
+ posterior_encoder_layers: 16
253
+ posterior_encoder_stacks: 1
254
+ posterior_encoder_base_dilation: 1
255
+ posterior_encoder_dropout_rate: 0.0
256
+ use_weight_norm_in_posterior_encoder: true
257
+ flow_flows: 4
258
+ flow_kernel_size: 5
259
+ flow_base_dilation: 1
260
+ flow_layers: 4
261
+ flow_dropout_rate: 0.0
262
+ use_weight_norm_in_flow: true
263
+ use_only_mean_in_flow: true
264
+ stochastic_duration_predictor_kernel_size: 3
265
+ stochastic_duration_predictor_dropout_rate: 0.5
266
+ stochastic_duration_predictor_flows: 4
267
+ stochastic_duration_predictor_dds_conv_layers: 3
268
+ spk_embed_dim: 512
269
+ vocabs: 47
270
+ aux_channels: 513
271
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
272
+ discriminator_params:
273
+ scales: 1
274
+ scale_downsample_pooling: AvgPool1d
275
+ scale_downsample_pooling_params:
276
+ kernel_size: 4
277
+ stride: 2
278
+ padding: 2
279
+ scale_discriminator_params:
280
+ in_channels: 1
281
+ out_channels: 1
282
+ kernel_sizes:
283
+ - 15
284
+ - 41
285
+ - 5
286
+ - 3
287
+ channels: 128
288
+ max_downsample_channels: 1024
289
+ max_groups: 16
290
+ bias: true
291
+ downsample_scales:
292
+ - 2
293
+ - 2
294
+ - 4
295
+ - 4
296
+ - 1
297
+ nonlinear_activation: LeakyReLU
298
+ nonlinear_activation_params:
299
+ negative_slope: 0.1
300
+ use_weight_norm: true
301
+ use_spectral_norm: false
302
+ follow_official_norm: false
303
+ periods:
304
+ - 2
305
+ - 3
306
+ - 5
307
+ - 7
308
+ - 11
309
+ period_discriminator_params:
310
+ in_channels: 1
311
+ out_channels: 1
312
+ kernel_sizes:
313
+ - 5
314
+ - 3
315
+ channels: 32
316
+ downsample_scales:
317
+ - 3
318
+ - 3
319
+ - 3
320
+ - 3
321
+ - 1
322
+ max_downsample_channels: 1024
323
+ bias: true
324
+ nonlinear_activation: LeakyReLU
325
+ nonlinear_activation_params:
326
+ negative_slope: 0.1
327
+ use_weight_norm: true
328
+ use_spectral_norm: false
329
+ generator_adv_loss_params:
330
+ average_by_discriminators: false
331
+ loss_type: mse
332
+ discriminator_adv_loss_params:
333
+ average_by_discriminators: false
334
+ loss_type: mse
335
+ feat_match_loss_params:
336
+ average_by_discriminators: false
337
+ average_by_layers: false
338
+ include_final_outputs: true
339
+ mel_loss_params:
340
+ fs: 22050
341
+ n_fft: 1024
342
+ hop_length: 256
343
+ win_length: null
344
+ window: hann
345
+ n_mels: 80
346
+ fmin: 0
347
+ fmax: null
348
+ log_base: null
349
+ lambda_adv: 1.0
350
+ lambda_mel: 45.0
351
+ lambda_feat_match: 2.0
352
+ lambda_dur: 1.0
353
+ lambda_kl: 1.0
354
+ sampling_rate: 22050
355
+ cache_generator_outputs: true
356
+ pitch_extract: null
357
+ pitch_extract_conf: {}
358
+ pitch_normalize: null
359
+ pitch_normalize_conf: {}
360
+ energy_extract: null
361
+ energy_extract_conf: {}
362
+ energy_normalize: null
363
+ energy_normalize_conf: {}
364
+ required:
365
+ - output_dir
366
+ - token_list
367
+ version: '202304'
368
+ distributed: true
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_backward_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_fake_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_forward_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_optim_step_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_real_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/discriminator_train_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_adv_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_backward_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_dur_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_feat_match_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_forward_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_kl_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_mel_loss.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_optim_step_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/generator_train_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/iter_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/optim0_lr0.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/optim1_lr0.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/images/train_time.png ADDED
exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a853ef0be294fe22504c80f4917588a786fcd80dab6b701cfef76e860efafb0
3
+ size 386446927
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202304'
2
+ files:
3
+ model_file: exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/train.total_count.ave_10best.pth
4
+ python: 3.10.10 (main, Mar 21 2023, 18:45:11) [GCC 11.2.0]
5
+ timestamp: 1684969732.788711
6
+ torch: 1.13.1
7
+ yaml_files:
8
+ train_config: exp/tts_finetune_xvector_vits_raw_phn_jaconv_pyopenjtalk_prosody/config.yaml