mio
/

Text-to-Speech
ESPnet
jp
audio
mio committed on
Commit
87424d7
1 Parent(s): 5652a6d

Update model

Browse files
Files changed (27) hide show
  1. README.md +477 -0
  2. exp/tts_chtholly_vits_finetune_from_jsut/33epoch.pth +3 -0
  3. exp/tts_chtholly_vits_finetune_from_jsut/config.yaml +396 -0
  4. exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_backward_time.png +0 -0
  5. exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_fake_loss.png +0 -0
  6. exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_forward_time.png +0 -0
  7. exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_loss.png +0 -0
  8. exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_optim_step_time.png +0 -0
  9. exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_real_loss.png +0 -0
  10. exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_train_time.png +0 -0
  11. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_adv_loss.png +0 -0
  12. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_backward_time.png +0 -0
  13. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_dur_loss.png +0 -0
  14. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_feat_match_loss.png +0 -0
  15. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_forward_time.png +0 -0
  16. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_kl_loss.png +0 -0
  17. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_loss.png +0 -0
  18. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_mel_loss.png +0 -0
  19. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_optim_step_time.png +0 -0
  20. exp/tts_chtholly_vits_finetune_from_jsut/images/generator_train_time.png +0 -0
  21. exp/tts_chtholly_vits_finetune_from_jsut/images/gpu_max_cached_mem_GB.png +0 -0
  22. exp/tts_chtholly_vits_finetune_from_jsut/images/iter_time.png +0 -0
  23. exp/tts_chtholly_vits_finetune_from_jsut/images/optim0_lr0.png +0 -0
  24. exp/tts_chtholly_vits_finetune_from_jsut/images/optim1_lr0.png +0 -0
  25. exp/tts_chtholly_vits_finetune_from_jsut/images/train_time.png +0 -0
  26. exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/feats_stats.npz +3 -0
  27. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - chtholly
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `mio/chtholly`
15
+
16
+ This model was trained by mio using the chtholly recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 0232f540a98ece921477b961db8ae019211da9af
26
+ pip install -e .
27
+ cd egs2/chtholly/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model mio/chtholly
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/finetune_vits.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_chtholly_vits_finetune_from_jsut
44
+ ngpu: 1
45
+ seed: 777
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 2
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 50705
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: false
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 100
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - train
75
+ - total_count
76
+ - max
77
+ keep_nbest_models: 10
78
+ nbest_averaging_interval: 0
79
+ grad_clip: -1
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: 50
88
+ use_matplotlib: true
89
+ use_tensorboard: false
90
+ create_graph_in_tensorboard: false
91
+ use_wandb: true
92
+ wandb_project: chtholly
93
+ wandb_id: null
94
+ wandb_entity: null
95
+ wandb_name: vits_finetune_chtholly_from_jsut
96
+ wandb_model_log_interval: -1
97
+ detect_anomaly: false
98
+ pretrain_path: null
99
+ init_param:
100
+ - downloads/f3698edf589206588f58f5ec837fa516/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train.total_count.ave_10best.pth:tts:tts
101
+ ignore_init_mismatch: false
102
+ freeze_param: []
103
+ num_iters_per_epoch: 1000
104
+ batch_size: 20
105
+ valid_batch_size: null
106
+ batch_bins: 5000000
107
+ valid_batch_bins: null
108
+ train_shape_file:
109
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn
110
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape
111
+ valid_shape_file:
112
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn
113
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape
114
+ batch_type: numel
115
+ valid_batch_type: null
116
+ fold_length:
117
+ - 150
118
+ - 204800
119
+ sort_in_batch: descending
120
+ sort_batch: descending
121
+ multiple_iterator: false
122
+ chunk_length: 500
123
+ chunk_shift_ratio: 0.5
124
+ num_cache_chunks: 1024
125
+ train_data_path_and_name_and_type:
126
+ - - dump/22k/raw/train/text
127
+ - text
128
+ - text
129
+ - - dump/22k/raw/train/wav.scp
130
+ - speech
131
+ - sound
132
+ valid_data_path_and_name_and_type:
133
+ - - dump/22k/raw/dev/text
134
+ - text
135
+ - text
136
+ - - dump/22k/raw/dev/wav.scp
137
+ - speech
138
+ - sound
139
+ allow_variable_data_keys: false
140
+ max_cache_size: 0.0
141
+ max_cache_fd: 32
142
+ valid_max_cache_size: null
143
+ optim: adamw
144
+ optim_conf:
145
+ lr: 0.0001
146
+ betas:
147
+ - 0.8
148
+ - 0.99
149
+ eps: 1.0e-09
150
+ weight_decay: 0.0
151
+ scheduler: exponentiallr
152
+ scheduler_conf:
153
+ gamma: 0.999875
154
+ optim2: adamw
155
+ optim2_conf:
156
+ lr: 0.0001
157
+ betas:
158
+ - 0.8
159
+ - 0.99
160
+ eps: 1.0e-09
161
+ weight_decay: 0.0
162
+ scheduler2: exponentiallr
163
+ scheduler2_conf:
164
+ gamma: 0.999875
165
+ generator_first: false
166
+ token_list:
167
+ - <blank>
168
+ - <unk>
169
+ - '1'
170
+ - '2'
171
+ - '0'
172
+ - '3'
173
+ - '4'
174
+ - '-1'
175
+ - '5'
176
+ - a
177
+ - o
178
+ - '-2'
179
+ - i
180
+ - '-3'
181
+ - u
182
+ - e
183
+ - k
184
+ - n
185
+ - t
186
+ - '6'
187
+ - r
188
+ - '-4'
189
+ - s
190
+ - N
191
+ - m
192
+ - pau
193
+ - '7'
194
+ - sh
195
+ - d
196
+ - g
197
+ - w
198
+ - '8'
199
+ - U
200
+ - '-5'
201
+ - I
202
+ - cl
203
+ - h
204
+ - y
205
+ - b
206
+ - '9'
207
+ - j
208
+ - ts
209
+ - ch
210
+ - '-6'
211
+ - z
212
+ - p
213
+ - '-7'
214
+ - f
215
+ - ky
216
+ - ry
217
+ - '-8'
218
+ - gy
219
+ - '-9'
220
+ - hy
221
+ - ny
222
+ - '-10'
223
+ - by
224
+ - my
225
+ - '-11'
226
+ - '-12'
227
+ - '-13'
228
+ - py
229
+ - '-14'
230
+ - '-15'
231
+ - v
232
+ - '10'
233
+ - '-16'
234
+ - '-17'
235
+ - '11'
236
+ - '-21'
237
+ - '-20'
238
+ - '12'
239
+ - '-19'
240
+ - '13'
241
+ - '-18'
242
+ - '14'
243
+ - dy
244
+ - '15'
245
+ - ty
246
+ - '-22'
247
+ - '16'
248
+ - '18'
249
+ - '19'
250
+ - '17'
251
+ - <sos/eos>
252
+ odim: null
253
+ model_conf: {}
254
+ use_preprocessor: true
255
+ token_type: phn
256
+ bpemodel: null
257
+ non_linguistic_symbols: null
258
+ cleaner: jaconv
259
+ g2p: pyopenjtalk_accent_with_pause
260
+ feats_extract: linear_spectrogram
261
+ feats_extract_conf:
262
+ n_fft: 1024
263
+ hop_length: 256
264
+ win_length: null
265
+ normalize: null
266
+ normalize_conf: {}
267
+ tts: vits
268
+ tts_conf:
269
+ generator_type: vits_generator
270
+ generator_params:
271
+ hidden_channels: 192
272
+ spks: -1
273
+ global_channels: -1
274
+ segment_size: 32
275
+ text_encoder_attention_heads: 2
276
+ text_encoder_ffn_expand: 4
277
+ text_encoder_blocks: 6
278
+ text_encoder_positionwise_layer_type: conv1d
279
+ text_encoder_positionwise_conv_kernel_size: 3
280
+ text_encoder_positional_encoding_layer_type: rel_pos
281
+ text_encoder_self_attention_layer_type: rel_selfattn
282
+ text_encoder_activation_type: swish
283
+ text_encoder_normalize_before: true
284
+ text_encoder_dropout_rate: 0.1
285
+ text_encoder_positional_dropout_rate: 0.0
286
+ text_encoder_attention_dropout_rate: 0.1
287
+ use_macaron_style_in_text_encoder: true
288
+ use_conformer_conv_in_text_encoder: false
289
+ text_encoder_conformer_kernel_size: -1
290
+ decoder_kernel_size: 7
291
+ decoder_channels: 512
292
+ decoder_upsample_scales:
293
+ - 8
294
+ - 8
295
+ - 2
296
+ - 2
297
+ decoder_upsample_kernel_sizes:
298
+ - 16
299
+ - 16
300
+ - 4
301
+ - 4
302
+ decoder_resblock_kernel_sizes:
303
+ - 3
304
+ - 7
305
+ - 11
306
+ decoder_resblock_dilations:
307
+ - - 1
308
+ - 3
309
+ - 5
310
+ - - 1
311
+ - 3
312
+ - 5
313
+ - - 1
314
+ - 3
315
+ - 5
316
+ use_weight_norm_in_decoder: true
317
+ posterior_encoder_kernel_size: 5
318
+ posterior_encoder_layers: 16
319
+ posterior_encoder_stacks: 1
320
+ posterior_encoder_base_dilation: 1
321
+ posterior_encoder_dropout_rate: 0.0
322
+ use_weight_norm_in_posterior_encoder: true
323
+ flow_flows: 4
324
+ flow_kernel_size: 5
325
+ flow_base_dilation: 1
326
+ flow_layers: 4
327
+ flow_dropout_rate: 0.0
328
+ use_weight_norm_in_flow: true
329
+ use_only_mean_in_flow: true
330
+ stochastic_duration_predictor_kernel_size: 3
331
+ stochastic_duration_predictor_dropout_rate: 0.5
332
+ stochastic_duration_predictor_flows: 4
333
+ stochastic_duration_predictor_dds_conv_layers: 3
334
+ vocabs: 85
335
+ aux_channels: 513
336
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
337
+ discriminator_params:
338
+ scales: 1
339
+ scale_downsample_pooling: AvgPool1d
340
+ scale_downsample_pooling_params:
341
+ kernel_size: 4
342
+ stride: 2
343
+ padding: 2
344
+ scale_discriminator_params:
345
+ in_channels: 1
346
+ out_channels: 1
347
+ kernel_sizes:
348
+ - 15
349
+ - 41
350
+ - 5
351
+ - 3
352
+ channels: 128
353
+ max_downsample_channels: 1024
354
+ max_groups: 16
355
+ bias: true
356
+ downsample_scales:
357
+ - 2
358
+ - 2
359
+ - 4
360
+ - 4
361
+ - 1
362
+ nonlinear_activation: LeakyReLU
363
+ nonlinear_activation_params:
364
+ negative_slope: 0.1
365
+ use_weight_norm: true
366
+ use_spectral_norm: false
367
+ follow_official_norm: false
368
+ periods:
369
+ - 2
370
+ - 3
371
+ - 5
372
+ - 7
373
+ - 11
374
+ period_discriminator_params:
375
+ in_channels: 1
376
+ out_channels: 1
377
+ kernel_sizes:
378
+ - 5
379
+ - 3
380
+ channels: 32
381
+ downsample_scales:
382
+ - 3
383
+ - 3
384
+ - 3
385
+ - 3
386
+ - 1
387
+ max_downsample_channels: 1024
388
+ bias: true
389
+ nonlinear_activation: LeakyReLU
390
+ nonlinear_activation_params:
391
+ negative_slope: 0.1
392
+ use_weight_norm: true
393
+ use_spectral_norm: false
394
+ generator_adv_loss_params:
395
+ average_by_discriminators: false
396
+ loss_type: mse
397
+ discriminator_adv_loss_params:
398
+ average_by_discriminators: false
399
+ loss_type: mse
400
+ feat_match_loss_params:
401
+ average_by_discriminators: false
402
+ average_by_layers: false
403
+ include_final_outputs: true
404
+ mel_loss_params:
405
+ fs: 22050
406
+ n_fft: 1024
407
+ hop_length: 256
408
+ win_length: null
409
+ window: hann
410
+ n_mels: 80
411
+ fmin: 0
412
+ fmax: null
413
+ log_base: null
414
+ lambda_adv: 1.0
415
+ lambda_mel: 45.0
416
+ lambda_feat_match: 2.0
417
+ lambda_dur: 1.0
418
+ lambda_kl: 1.0
419
+ sampling_rate: 22050
420
+ cache_generator_outputs: true
421
+ pitch_extract: null
422
+ pitch_extract_conf: {}
423
+ pitch_normalize: null
424
+ pitch_normalize_conf: {}
425
+ energy_extract: null
426
+ energy_extract_conf: {}
427
+ energy_normalize: null
428
+ energy_normalize_conf: {}
429
+ required:
430
+ - output_dir
431
+ - token_list
432
+ version: '202207'
433
+ distributed: true
434
+ ```
435
+
436
+ </details>
437
+
438
+
439
+
440
+ ### Citing ESPnet
441
+
442
+ ```bibtex
443
+ @inproceedings{watanabe2018espnet,
444
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
445
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
446
+ year={2018},
447
+ booktitle={Proceedings of Interspeech},
448
+ pages={2207--2211},
449
+ doi={10.21437/Interspeech.2018-1456},
450
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
451
+ }
452
+
453
+
454
+
455
+
456
+ @inproceedings{hayashi2020espnet,
457
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
458
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
459
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
460
+ pages={7654--7658},
461
+ year={2020},
462
+ organization={IEEE}
463
+ }
464
+ ```
465
+
466
+ or arXiv:
467
+
468
+ ```bibtex
469
+ @misc{watanabe2018espnet,
470
+ title={ESPnet: End-to-End Speech Processing Toolkit},
471
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
472
+ year={2018},
473
+ eprint={1804.00015},
474
+ archivePrefix={arXiv},
475
+ primaryClass={cs.CL}
476
+ }
477
+ ```
exp/tts_chtholly_vits_finetune_from_jsut/33epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39b07efdeddebe3004938806abb0196ad99e143a63ba995d1d63d3a7f8b0137e
3
+ size 372578383
exp/tts_chtholly_vits_finetune_from_jsut/config.yaml ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/finetune_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_chtholly_vits_finetune_from_jsut
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 50705
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: false
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: true
55
+ wandb_project: chtholly
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: vits_finetune_chtholly_from_jsut
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param:
63
+ - downloads/f3698edf589206588f58f5ec837fa516/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train.total_count.ave_10best.pth:tts:tts
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: 1000
67
+ batch_size: 20
68
+ valid_batch_size: null
69
+ batch_bins: 5000000
70
+ valid_batch_bins: null
71
+ train_shape_file:
72
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn
73
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape
74
+ valid_shape_file:
75
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn
76
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length:
80
+ - 150
81
+ - 204800
82
+ sort_in_batch: descending
83
+ sort_batch: descending
84
+ multiple_iterator: false
85
+ chunk_length: 500
86
+ chunk_shift_ratio: 0.5
87
+ num_cache_chunks: 1024
88
+ train_data_path_and_name_and_type:
89
+ - - dump/22k/raw/train/text
90
+ - text
91
+ - text
92
+ - - dump/22k/raw/train/wav.scp
93
+ - speech
94
+ - sound
95
+ valid_data_path_and_name_and_type:
96
+ - - dump/22k/raw/dev/text
97
+ - text
98
+ - text
99
+ - - dump/22k/raw/dev/wav.scp
100
+ - speech
101
+ - sound
102
+ allow_variable_data_keys: false
103
+ max_cache_size: 0.0
104
+ max_cache_fd: 32
105
+ valid_max_cache_size: null
106
+ optim: adamw
107
+ optim_conf:
108
+ lr: 0.0001
109
+ betas:
110
+ - 0.8
111
+ - 0.99
112
+ eps: 1.0e-09
113
+ weight_decay: 0.0
114
+ scheduler: exponentiallr
115
+ scheduler_conf:
116
+ gamma: 0.999875
117
+ optim2: adamw
118
+ optim2_conf:
119
+ lr: 0.0001
120
+ betas:
121
+ - 0.8
122
+ - 0.99
123
+ eps: 1.0e-09
124
+ weight_decay: 0.0
125
+ scheduler2: exponentiallr
126
+ scheduler2_conf:
127
+ gamma: 0.999875
128
+ generator_first: false
129
+ token_list:
130
+ - <blank>
131
+ - <unk>
132
+ - '1'
133
+ - '2'
134
+ - '0'
135
+ - '3'
136
+ - '4'
137
+ - '-1'
138
+ - '5'
139
+ - a
140
+ - o
141
+ - '-2'
142
+ - i
143
+ - '-3'
144
+ - u
145
+ - e
146
+ - k
147
+ - n
148
+ - t
149
+ - '6'
150
+ - r
151
+ - '-4'
152
+ - s
153
+ - N
154
+ - m
155
+ - pau
156
+ - '7'
157
+ - sh
158
+ - d
159
+ - g
160
+ - w
161
+ - '8'
162
+ - U
163
+ - '-5'
164
+ - I
165
+ - cl
166
+ - h
167
+ - y
168
+ - b
169
+ - '9'
170
+ - j
171
+ - ts
172
+ - ch
173
+ - '-6'
174
+ - z
175
+ - p
176
+ - '-7'
177
+ - f
178
+ - ky
179
+ - ry
180
+ - '-8'
181
+ - gy
182
+ - '-9'
183
+ - hy
184
+ - ny
185
+ - '-10'
186
+ - by
187
+ - my
188
+ - '-11'
189
+ - '-12'
190
+ - '-13'
191
+ - py
192
+ - '-14'
193
+ - '-15'
194
+ - v
195
+ - '10'
196
+ - '-16'
197
+ - '-17'
198
+ - '11'
199
+ - '-21'
200
+ - '-20'
201
+ - '12'
202
+ - '-19'
203
+ - '13'
204
+ - '-18'
205
+ - '14'
206
+ - dy
207
+ - '15'
208
+ - ty
209
+ - '-22'
210
+ - '16'
211
+ - '18'
212
+ - '19'
213
+ - '17'
214
+ - <sos/eos>
215
+ odim: null
216
+ model_conf: {}
217
+ use_preprocessor: true
218
+ token_type: phn
219
+ bpemodel: null
220
+ non_linguistic_symbols: null
221
+ cleaner: jaconv
222
+ g2p: pyopenjtalk_accent_with_pause
223
+ feats_extract: linear_spectrogram
224
+ feats_extract_conf:
225
+ n_fft: 1024
226
+ hop_length: 256
227
+ win_length: null
228
+ normalize: null
229
+ normalize_conf: {}
230
+ tts: vits
231
+ tts_conf:
232
+ generator_type: vits_generator
233
+ generator_params:
234
+ hidden_channels: 192
235
+ spks: -1
236
+ global_channels: -1
237
+ segment_size: 32
238
+ text_encoder_attention_heads: 2
239
+ text_encoder_ffn_expand: 4
240
+ text_encoder_blocks: 6
241
+ text_encoder_positionwise_layer_type: conv1d
242
+ text_encoder_positionwise_conv_kernel_size: 3
243
+ text_encoder_positional_encoding_layer_type: rel_pos
244
+ text_encoder_self_attention_layer_type: rel_selfattn
245
+ text_encoder_activation_type: swish
246
+ text_encoder_normalize_before: true
247
+ text_encoder_dropout_rate: 0.1
248
+ text_encoder_positional_dropout_rate: 0.0
249
+ text_encoder_attention_dropout_rate: 0.1
250
+ use_macaron_style_in_text_encoder: true
251
+ use_conformer_conv_in_text_encoder: false
252
+ text_encoder_conformer_kernel_size: -1
253
+ decoder_kernel_size: 7
254
+ decoder_channels: 512
255
+ decoder_upsample_scales:
256
+ - 8
257
+ - 8
258
+ - 2
259
+ - 2
260
+ decoder_upsample_kernel_sizes:
261
+ - 16
262
+ - 16
263
+ - 4
264
+ - 4
265
+ decoder_resblock_kernel_sizes:
266
+ - 3
267
+ - 7
268
+ - 11
269
+ decoder_resblock_dilations:
270
+ - - 1
271
+ - 3
272
+ - 5
273
+ - - 1
274
+ - 3
275
+ - 5
276
+ - - 1
277
+ - 3
278
+ - 5
279
+ use_weight_norm_in_decoder: true
280
+ posterior_encoder_kernel_size: 5
281
+ posterior_encoder_layers: 16
282
+ posterior_encoder_stacks: 1
283
+ posterior_encoder_base_dilation: 1
284
+ posterior_encoder_dropout_rate: 0.0
285
+ use_weight_norm_in_posterior_encoder: true
286
+ flow_flows: 4
287
+ flow_kernel_size: 5
288
+ flow_base_dilation: 1
289
+ flow_layers: 4
290
+ flow_dropout_rate: 0.0
291
+ use_weight_norm_in_flow: true
292
+ use_only_mean_in_flow: true
293
+ stochastic_duration_predictor_kernel_size: 3
294
+ stochastic_duration_predictor_dropout_rate: 0.5
295
+ stochastic_duration_predictor_flows: 4
296
+ stochastic_duration_predictor_dds_conv_layers: 3
297
+ vocabs: 85
298
+ aux_channels: 513
299
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
300
+ discriminator_params:
301
+ scales: 1
302
+ scale_downsample_pooling: AvgPool1d
303
+ scale_downsample_pooling_params:
304
+ kernel_size: 4
305
+ stride: 2
306
+ padding: 2
307
+ scale_discriminator_params:
308
+ in_channels: 1
309
+ out_channels: 1
310
+ kernel_sizes:
311
+ - 15
312
+ - 41
313
+ - 5
314
+ - 3
315
+ channels: 128
316
+ max_downsample_channels: 1024
317
+ max_groups: 16
318
+ bias: true
319
+ downsample_scales:
320
+ - 2
321
+ - 2
322
+ - 4
323
+ - 4
324
+ - 1
325
+ nonlinear_activation: LeakyReLU
326
+ nonlinear_activation_params:
327
+ negative_slope: 0.1
328
+ use_weight_norm: true
329
+ use_spectral_norm: false
330
+ follow_official_norm: false
331
+ periods:
332
+ - 2
333
+ - 3
334
+ - 5
335
+ - 7
336
+ - 11
337
+ period_discriminator_params:
338
+ in_channels: 1
339
+ out_channels: 1
340
+ kernel_sizes:
341
+ - 5
342
+ - 3
343
+ channels: 32
344
+ downsample_scales:
345
+ - 3
346
+ - 3
347
+ - 3
348
+ - 3
349
+ - 1
350
+ max_downsample_channels: 1024
351
+ bias: true
352
+ nonlinear_activation: LeakyReLU
353
+ nonlinear_activation_params:
354
+ negative_slope: 0.1
355
+ use_weight_norm: true
356
+ use_spectral_norm: false
357
+ generator_adv_loss_params:
358
+ average_by_discriminators: false
359
+ loss_type: mse
360
+ discriminator_adv_loss_params:
361
+ average_by_discriminators: false
362
+ loss_type: mse
363
+ feat_match_loss_params:
364
+ average_by_discriminators: false
365
+ average_by_layers: false
366
+ include_final_outputs: true
367
+ mel_loss_params:
368
+ fs: 22050
369
+ n_fft: 1024
370
+ hop_length: 256
371
+ win_length: null
372
+ window: hann
373
+ n_mels: 80
374
+ fmin: 0
375
+ fmax: null
376
+ log_base: null
377
+ lambda_adv: 1.0
378
+ lambda_mel: 45.0
379
+ lambda_feat_match: 2.0
380
+ lambda_dur: 1.0
381
+ lambda_kl: 1.0
382
+ sampling_rate: 22050
383
+ cache_generator_outputs: true
384
+ pitch_extract: null
385
+ pitch_extract_conf: {}
386
+ pitch_normalize: null
387
+ pitch_normalize_conf: {}
388
+ energy_extract: null
389
+ energy_extract_conf: {}
390
+ energy_normalize: null
391
+ energy_normalize_conf: {}
392
+ required:
393
+ - output_dir
394
+ - token_list
395
+ version: '202207'
396
+ distributed: true
exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_backward_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_fake_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_forward_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_optim_step_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_real_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/discriminator_train_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_adv_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_backward_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_dur_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_feat_match_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_forward_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_kl_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_mel_loss.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_optim_step_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/generator_train_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/iter_time.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/optim0_lr0.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/optim1_lr0.png ADDED
exp/tts_chtholly_vits_finetune_from_jsut/images/train_time.png ADDED
exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bc3553357d02332e6a70b07bbc6266c33f2fa8cd4cff9adcc7f1b65b8bc4009
3
+ size 4866
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202207'
2
+ files:
3
+ model_file: exp/tts_chtholly_vits_finetune_from_jsut/33epoch.pth
4
+ python: "3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]"
5
+ timestamp: 1684896316.947001
6
+ torch: 1.8.1
7
+ yaml_files:
8
+ train_config: exp/tts_chtholly_vits_finetune_from_jsut/config.yaml