Siddhant committed on
Commit
584e8cd
1 Parent(s): d117407

import from zenodo

Browse files
Files changed (26) hide show
  1. README.md +50 -0
  2. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/config.yaml +509 -0
  3. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_backward_time.png +0 -0
  4. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_fake_loss.png +0 -0
  5. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_forward_time.png +0 -0
  6. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_loss.png +0 -0
  7. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_optim_step_time.png +0 -0
  8. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_real_loss.png +0 -0
  9. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_train_time.png +0 -0
  10. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_adv_loss.png +0 -0
  11. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_backward_time.png +0 -0
  12. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_dur_loss.png +0 -0
  13. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_feat_match_loss.png +0 -0
  14. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_forward_time.png +0 -0
  15. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_kl_loss.png +0 -0
  16. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_loss.png +0 -0
  17. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_mel_loss.png +0 -0
  18. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_optim_step_time.png +0 -0
  19. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_train_time.png +0 -0
  20. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/gpu_max_cached_mem_GB.png +0 -0
  21. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/iter_time.png +0 -0
  22. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/optim0_lr0.png +0 -0
  23. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/optim1_lr0.png +0 -0
  24. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/train_time.png +0 -0
  25. exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/train.total_count.ave_10best.pth +3 -0
  26. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: zh
7
+ datasets:
8
+ - csmsc
9
+ license: cc-by-4.0
10
+ ---
11
+ ## ESPnet2 TTS pretrained model
12
+ ### `kan-bayashi/csmsc_vits`
13
+ ♻️ Imported from https://zenodo.org/record/5499120/
14
+
15
+ This model was trained by kan-bayashi using csmsc/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```BibTeX
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/config.yaml ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_vits_raw_phn_pypinyin_g2p_phone
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 41492
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: true
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 2000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ grad_clip: -1
42
+ grad_clip_type: 2.0
43
+ grad_noise: false
44
+ accum_grad: 1
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ use_amp: false
49
+ log_interval: 50
50
+ use_tensorboard: true
51
+ use_wandb: false
52
+ wandb_project: null
53
+ wandb_id: null
54
+ wandb_entity: null
55
+ wandb_name: null
56
+ wandb_model_log_interval: -1
57
+ detect_anomaly: false
58
+ pretrain_path: null
59
+ init_param: []
60
+ ignore_init_mismatch: false
61
+ freeze_param: []
62
+ num_iters_per_epoch: 500
63
+ batch_size: 20
64
+ valid_batch_size: null
65
+ batch_bins: 5000000
66
+ valid_batch_bins: null
67
+ train_shape_file:
68
+ - exp/tts_stats_raw_linear_spectrogram_phn_pypinyin_g2p_phone/train/text_shape.phn
69
+ - exp/tts_stats_raw_linear_spectrogram_phn_pypinyin_g2p_phone/train/speech_shape
70
+ valid_shape_file:
71
+ - exp/tts_stats_raw_linear_spectrogram_phn_pypinyin_g2p_phone/valid/text_shape.phn
72
+ - exp/tts_stats_raw_linear_spectrogram_phn_pypinyin_g2p_phone/valid/speech_shape
73
+ batch_type: numel
74
+ valid_batch_type: null
75
+ fold_length:
76
+ - 150
77
+ - 204800
78
+ sort_in_batch: descending
79
+ sort_batch: descending
80
+ multiple_iterator: false
81
+ chunk_length: 500
82
+ chunk_shift_ratio: 0.5
83
+ num_cache_chunks: 1024
84
+ train_data_path_and_name_and_type:
85
+ - - dump/22k/raw/tr_no_dev/text
86
+ - text
87
+ - text
88
+ - - dump/22k/raw/tr_no_dev/wav.scp
89
+ - speech
90
+ - sound
91
+ valid_data_path_and_name_and_type:
92
+ - - dump/22k/raw/dev/text
93
+ - text
94
+ - text
95
+ - - dump/22k/raw/dev/wav.scp
96
+ - speech
97
+ - sound
98
+ allow_variable_data_keys: false
99
+ max_cache_size: 0.0
100
+ max_cache_fd: 32
101
+ valid_max_cache_size: null
102
+ optim: adamw
103
+ optim_conf:
104
+ lr: 0.0002
105
+ betas:
106
+ - 0.8
107
+ - 0.99
108
+ eps: 1.0e-09
109
+ weight_decay: 0.0
110
+ scheduler: exponentiallr
111
+ scheduler_conf:
112
+ gamma: 0.999875
113
+ optim2: adamw
114
+ optim2_conf:
115
+ lr: 0.0002
116
+ betas:
117
+ - 0.8
118
+ - 0.99
119
+ eps: 1.0e-09
120
+ weight_decay: 0.0
121
+ scheduler2: exponentiallr
122
+ scheduler2_conf:
123
+ gamma: 0.999875
124
+ generator_first: false
125
+ token_list:
126
+ - <blank>
127
+ - <unk>
128
+ - d
129
+ - sh
130
+ - j
131
+ - l
132
+ - 。
133
+ - zh
134
+ - ,
135
+ - i4
136
+ - x
137
+ - h
138
+ - b
139
+ - e
140
+ - g
141
+ - t
142
+ - m
143
+ - z
144
+ - q
145
+ - i1
146
+ - i3
147
+ - ch
148
+ - u4
149
+ - n
150
+ - f
151
+ - i2
152
+ - r
153
+ - k
154
+ - s
155
+ - e4
156
+ - ai4
157
+ - a1
158
+ - c
159
+ - p
160
+ - ian4
161
+ - uo3
162
+ - ao3
163
+ - ai2
164
+ - ao4
165
+ - an4
166
+ - u3
167
+ - ong1
168
+ - ing2
169
+ - en2
170
+ - u2
171
+ - e2
172
+ - ui4
173
+ - ian2
174
+ - iou3
175
+ - ang4
176
+ - u1
177
+ - iao4
178
+ - uo4
179
+ - eng2
180
+ - a4
181
+ - in1
182
+ - ang1
183
+ - eng1
184
+ - ou3
185
+ - ian1
186
+ - ou4
187
+ - ing1
188
+ - uo1
189
+ - an1
190
+ - ian3
191
+ - ie3
192
+ - a3
193
+ - an3
194
+ - ing4
195
+ - an2
196
+ - ü4
197
+ - iao3
198
+ - ei4
199
+ - ong2
200
+ - en1
201
+ - uei4
202
+ - üan2
203
+ - ang2
204
+ - ang3
205
+ - iu4
206
+ - iang4
207
+ - ai3
208
+ - ao1
209
+ - ou1
210
+ - eng4
211
+ - iang3
212
+ - en3
213
+ - ai1
214
+ - ong4
215
+ - ie4
216
+ - e3
217
+ - ia1
218
+ - uo2
219
+ - ia4
220
+ - ü3
221
+ - uan1
222
+ - er2
223
+ - ei3
224
+ - ei2
225
+ - iang1
226
+ - ing3
227
+ - en4
228
+ - ü2
229
+ - uan3
230
+ - e1
231
+ - in2
232
+ - iao1
233
+ - i
234
+ - in4
235
+ - ie1
236
+ - ong3
237
+ - iang2
238
+ - ie2
239
+ - uan4
240
+ - a2
241
+ - ui3
242
+ - eng3
243
+ - uan2
244
+ - üe4
245
+ - uai4
246
+ - ou2
247
+ - ?
248
+ - üe2
249
+ - in3
250
+ - uang3
251
+ - uang1
252
+ - iu2
253
+ - en
254
+ - a
255
+ - ao2
256
+ - ua4
257
+ - un1
258
+ - ui1
259
+ - uei2
260
+ - iong4
261
+ - uang2
262
+ - v3
263
+ - ui2
264
+ - iao2
265
+ - uang4
266
+ - ü1
267
+ - ei1
268
+ - o2
269
+ - er4
270
+ - iou2
271
+ - iou4
272
+ - !
273
+ - ua1
274
+ - üan4
275
+ - iu3
276
+ - un4
277
+ - üan3
278
+ - ün4
279
+ - uen2
280
+ - iu1
281
+ - un3
282
+ - uen4
283
+ - un2
284
+ - er3
285
+ - ün1
286
+ - ün2
287
+ - o4
288
+ - o1
289
+ - ua2
290
+ - uei1
291
+ - uei3
292
+ - ia3
293
+ - iong3
294
+ - ua3
295
+ - ia
296
+ - v4
297
+ - üe1
298
+ - üan1
299
+ - iong1
300
+ - ia2
301
+ - uai1
302
+ - iong2
303
+ - iou1
304
+ - uai3
305
+ - üe3
306
+ - uen1
307
+ - uen3
308
+ - uai2
309
+ - o3
310
+ - er
311
+ - ve4
312
+ - ou
313
+ - io1
314
+ - ün3
315
+ - ueng1
316
+ - v2
317
+ - uo
318
+ - ueng4
319
+ - o
320
+ - ua
321
+ - ei
322
+ - '2'
323
+ - ueng3
324
+ - ang
325
+ - P
326
+ - B
327
+ - <sos/eos>
328
+ odim: null
329
+ model_conf: {}
330
+ use_preprocessor: true
331
+ token_type: phn
332
+ bpemodel: null
333
+ non_linguistic_symbols: null
334
+ cleaner: null
335
+ g2p: pypinyin_g2p_phone
336
+ feats_extract: linear_spectrogram
337
+ feats_extract_conf:
338
+ n_fft: 1024
339
+ hop_length: 256
340
+ win_length: null
341
+ normalize: null
342
+ normalize_conf: {}
343
+ tts: vits
344
+ tts_conf:
345
+ generator_type: vits_generator
346
+ generator_params:
347
+ hidden_channels: 192
348
+ spks: -1
349
+ global_channels: -1
350
+ segment_size: 32
351
+ text_encoder_attention_heads: 2
352
+ text_encoder_ffn_expand: 4
353
+ text_encoder_blocks: 6
354
+ text_encoder_positionwise_layer_type: conv1d
355
+ text_encoder_positionwise_conv_kernel_size: 3
356
+ text_encoder_positional_encoding_layer_type: rel_pos
357
+ text_encoder_self_attention_layer_type: rel_selfattn
358
+ text_encoder_activation_type: swish
359
+ text_encoder_normalize_before: true
360
+ text_encoder_dropout_rate: 0.1
361
+ text_encoder_positional_dropout_rate: 0.0
362
+ text_encoder_attention_dropout_rate: 0.1
363
+ use_macaron_style_in_text_encoder: true
364
+ use_conformer_conv_in_text_encoder: false
365
+ text_encoder_conformer_kernel_size: -1
366
+ decoder_kernel_size: 7
367
+ decoder_channels: 512
368
+ decoder_upsample_scales:
369
+ - 8
370
+ - 8
371
+ - 2
372
+ - 2
373
+ decoder_upsample_kernel_sizes:
374
+ - 16
375
+ - 16
376
+ - 4
377
+ - 4
378
+ decoder_resblock_kernel_sizes:
379
+ - 3
380
+ - 7
381
+ - 11
382
+ decoder_resblock_dilations:
383
+ - - 1
384
+ - 3
385
+ - 5
386
+ - - 1
387
+ - 3
388
+ - 5
389
+ - - 1
390
+ - 3
391
+ - 5
392
+ use_weight_norm_in_decoder: true
393
+ posterior_encoder_kernel_size: 5
394
+ posterior_encoder_layers: 16
395
+ posterior_encoder_stacks: 1
396
+ posterior_encoder_base_dilation: 1
397
+ posterior_encoder_dropout_rate: 0.0
398
+ use_weight_norm_in_posterior_encoder: true
399
+ flow_flows: 4
400
+ flow_kernel_size: 5
401
+ flow_base_dilation: 1
402
+ flow_layers: 4
403
+ flow_dropout_rate: 0.0
404
+ use_weight_norm_in_flow: true
405
+ use_only_mean_in_flow: true
406
+ stochastic_duration_predictor_kernel_size: 3
407
+ stochastic_duration_predictor_dropout_rate: 0.5
408
+ stochastic_duration_predictor_flows: 4
409
+ stochastic_duration_predictor_dds_conv_layers: 3
410
+ vocabs: 202
411
+ aux_channels: 513
412
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
413
+ discriminator_params:
414
+ scales: 1
415
+ scale_downsample_pooling: AvgPool1d
416
+ scale_downsample_pooling_params:
417
+ kernel_size: 4
418
+ stride: 2
419
+ padding: 2
420
+ scale_discriminator_params:
421
+ in_channels: 1
422
+ out_channels: 1
423
+ kernel_sizes:
424
+ - 15
425
+ - 41
426
+ - 5
427
+ - 3
428
+ channels: 128
429
+ max_downsample_channels: 1024
430
+ max_groups: 16
431
+ bias: true
432
+ downsample_scales:
433
+ - 2
434
+ - 2
435
+ - 4
436
+ - 4
437
+ - 1
438
+ nonlinear_activation: LeakyReLU
439
+ nonlinear_activation_params:
440
+ negative_slope: 0.1
441
+ use_weight_norm: true
442
+ use_spectral_norm: false
443
+ follow_official_norm: false
444
+ periods:
445
+ - 2
446
+ - 3
447
+ - 5
448
+ - 7
449
+ - 11
450
+ period_discriminator_params:
451
+ in_channels: 1
452
+ out_channels: 1
453
+ kernel_sizes:
454
+ - 5
455
+ - 3
456
+ channels: 32
457
+ downsample_scales:
458
+ - 3
459
+ - 3
460
+ - 3
461
+ - 3
462
+ - 1
463
+ max_downsample_channels: 1024
464
+ bias: true
465
+ nonlinear_activation: LeakyReLU
466
+ nonlinear_activation_params:
467
+ negative_slope: 0.1
468
+ use_weight_norm: true
469
+ use_spectral_norm: false
470
+ generator_adv_loss_params:
471
+ average_by_discriminators: false
472
+ loss_type: mse
473
+ discriminator_adv_loss_params:
474
+ average_by_discriminators: false
475
+ loss_type: mse
476
+ feat_match_loss_params:
477
+ average_by_discriminators: false
478
+ average_by_layers: false
479
+ include_final_outputs: true
480
+ mel_loss_params:
481
+ fs: 22050
482
+ n_fft: 1024
483
+ hop_length: 256
484
+ win_length: null
485
+ window: hann
486
+ n_mels: 80
487
+ fmin: 0
488
+ fmax: null
489
+ log_base: null
490
+ lambda_adv: 1.0
491
+ lambda_mel: 45.0
492
+ lambda_feat_match: 2.0
493
+ lambda_dur: 1.0
494
+ lambda_kl: 1.0
495
+ sampling_rate: 22050
496
+ cache_generator_outputs: true
497
+ pitch_extract: null
498
+ pitch_extract_conf: {}
499
+ pitch_normalize: null
500
+ pitch_normalize_conf: {}
501
+ energy_extract: null
502
+ energy_extract_conf: {}
503
+ energy_normalize: null
504
+ energy_normalize_conf: {}
505
+ required:
506
+ - output_dir
507
+ - token_list
508
+ version: 0.10.3a1
509
+ distributed: true
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_backward_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_fake_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_forward_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_optim_step_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_real_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/discriminator_train_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_adv_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_backward_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_dur_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_feat_match_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_forward_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_kl_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_mel_loss.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_optim_step_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/generator_train_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/iter_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/optim0_lr0.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/optim1_lr0.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/images/train_time.png ADDED
exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/train.total_count.ave_10best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ef6d32652c60c8843c3333ad8888b3b4e2ba87c4d6a89a498e1ad80c1825945
3
+ size 372654415
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.3a2
2
+ files:
3
+ model_file: exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/train.total_count.ave_10best.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1631247396.642391
6
+ torch: 1.7.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_vits_raw_phn_pypinyin_g2p_phone/config.yaml