ftshijt committed on
Commit
5ff7e61
1 Parent(s): 9e06480

Update model

Browse files
Files changed (33) hide show
  1. README.md +558 -3
  2. dump/raw/org/tr_no_dev/spk2sid +31 -0
  3. exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz +3 -0
  4. exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz +3 -0
  5. exp/svs_train_visinger2_40singer_raw_phn_None_zh/500epoch.pth +3 -0
  6. exp/svs_train_visinger2_40singer_raw_phn_None_zh/config.yaml +475 -0
  7. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_backward_time.png +0 -0
  8. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_fake_loss.png +0 -0
  9. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_forward_time.png +0 -0
  10. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_loss.png +0 -0
  11. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_optim_step_time.png +0 -0
  12. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_real_loss.png +0 -0
  13. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_train_time.png +0 -0
  14. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_adv_loss.png +0 -0
  15. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_backward_time.png +0 -0
  16. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_feat_match_loss.png +0 -0
  17. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_forward_time.png +0 -0
  18. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_kl_loss.png +0 -0
  19. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_loss.png +0 -0
  20. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_mel_am_loss.png +0 -0
  21. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_mel_ddsp_loss.png +0 -0
  22. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_mel_loss.png +0 -0
  23. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_optim_step_time.png +0 -0
  24. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_phn_dur_loss.png +0 -0
  25. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_pitch_loss.png +0 -0
  26. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_score_dur_loss.png +0 -0
  27. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_train_time.png +0 -0
  28. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png +0 -0
  29. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/iter_time.png +0 -0
  30. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/optim0_lr0.png +0 -0
  31. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/optim1_lr0.png +0 -0
  32. exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/train_time.png +0 -0
  33. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,558 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - acesinger
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `espnet/aceopencpop_svs_visinger2_40singer_pretrain`
15
+
16
+ This model was trained by ftshijt using the acesinger recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout ba4880118d5249e5dd92e89d107280a0d4f317e8
26
+ pip install -e .
27
+ cd egs2/acesinger/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/aceopencpop_svs_visinger2_40singer_pretrain
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_visinger2_40singer.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/svs_train_visinger2_40singer_raw_phn_None_zh
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 0
49
+ num_att_plot: 0
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: false
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 500
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - train
77
+ - total_count
78
+ - max
79
+ keep_nbest_models: 10
80
+ nbest_averaging_interval: 0
81
+ grad_clip: -1
82
+ grad_clip_type: 2.0
83
+ grad_noise: false
84
+ accum_grad: 1
85
+ no_forward_run: false
86
+ resume: true
87
+ train_dtype: float32
88
+ use_amp: false
89
+ log_interval: 50
90
+ use_matplotlib: true
91
+ use_tensorboard: true
92
+ create_graph_in_tensorboard: false
93
+ use_wandb: false
94
+ wandb_project: null
95
+ wandb_id: null
96
+ wandb_entity: null
97
+ wandb_name: null
98
+ wandb_model_log_interval: -1
99
+ detect_anomaly: false
100
+ use_lora: false
101
+ save_lora_only: true
102
+ lora_conf: {}
103
+ pretrain_path: null
104
+ init_param: []
105
+ ignore_init_mismatch: false
106
+ freeze_param: []
107
+ num_iters_per_epoch: 1000
108
+ batch_size: 8
109
+ valid_batch_size: null
110
+ batch_bins: 1000000
111
+ valid_batch_bins: null
112
+ train_shape_file:
113
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
114
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
115
+ valid_shape_file:
116
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
117
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
118
+ batch_type: sorted
119
+ valid_batch_type: null
120
+ fold_length:
121
+ - 150
122
+ - 409600
123
+ sort_in_batch: descending
124
+ shuffle_within_batch: false
125
+ sort_batch: descending
126
+ multiple_iterator: false
127
+ chunk_length: 500
128
+ chunk_shift_ratio: 0.5
129
+ num_cache_chunks: 1024
130
+ chunk_excluded_key_prefixes: []
131
+ chunk_default_fs: null
132
+ train_data_path_and_name_and_type:
133
+ - - dump/raw/tr_no_dev/text
134
+ - text
135
+ - text
136
+ - - dump/raw/tr_no_dev/wav.scp
137
+ - singing
138
+ - sound
139
+ - - dump/raw/tr_no_dev/label
140
+ - label
141
+ - duration
142
+ - - dump/raw/tr_no_dev/score.scp
143
+ - score
144
+ - score
145
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
146
+ - pitch
147
+ - npy
148
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
149
+ - feats
150
+ - npy
151
+ - - dump/raw/tr_no_dev/utt2sid
152
+ - sids
153
+ - text_int
154
+ valid_data_path_and_name_and_type:
155
+ - - dump/raw/dev/text
156
+ - text
157
+ - text
158
+ - - dump/raw/dev/wav.scp
159
+ - singing
160
+ - sound
161
+ - - dump/raw/dev/label
162
+ - label
163
+ - duration
164
+ - - dump/raw/dev/score.scp
165
+ - score
166
+ - score
167
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
168
+ - pitch
169
+ - npy
170
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
171
+ - feats
172
+ - npy
173
+ - - dump/raw/dev/utt2sid
174
+ - sids
175
+ - text_int
176
+ allow_variable_data_keys: false
177
+ max_cache_size: 0.0
178
+ max_cache_fd: 32
179
+ allow_multi_rates: false
180
+ valid_max_cache_size: null
181
+ exclude_weight_decay: false
182
+ exclude_weight_decay_conf: {}
183
+ optim: adamw
184
+ optim_conf:
185
+ lr: 0.0002
186
+ betas:
187
+ - 0.8
188
+ - 0.99
189
+ eps: 1.0e-09
190
+ weight_decay: 0.0
191
+ scheduler: exponentiallr
192
+ scheduler_conf:
193
+ gamma: 0.998
194
+ optim2: adamw
195
+ optim2_conf:
196
+ lr: 0.0002
197
+ betas:
198
+ - 0.8
199
+ - 0.99
200
+ eps: 1.0e-09
201
+ weight_decay: 0.0
202
+ scheduler2: exponentiallr
203
+ scheduler2_conf:
204
+ gamma: 0.998
205
+ generator_first: true
206
+ token_list:
207
+ - <blank>
208
+ - <unk>
209
+ - SP
210
+ - i
211
+ - AP
212
+ - e
213
+ - d
214
+ - y
215
+ - w
216
+ - sh
217
+ - ai
218
+ - n
219
+ - x
220
+ - j
221
+ - u
222
+ - ian
223
+ - l
224
+ - h
225
+ - b
226
+ - o
227
+ - zh
228
+ - ou
229
+ - an
230
+ - m
231
+ - q
232
+ - z
233
+ - en
234
+ - g
235
+ - ing
236
+ - ei
237
+ - ao
238
+ - uo
239
+ - ang
240
+ - eng
241
+ - t
242
+ - ong
243
+ - a
244
+ - ui
245
+ - f
246
+ - k
247
+ - r
248
+ - ch
249
+ - v
250
+ - iang
251
+ - in
252
+ - iao
253
+ - ie
254
+ - iu
255
+ - c
256
+ - s
257
+ - van
258
+ - p
259
+ - ve
260
+ - uan
261
+ - uang
262
+ - ia
263
+ - ua
264
+ - uai
265
+ - un
266
+ - er
267
+ - vn
268
+ - iong
269
+ - <sos/eos>
270
+ odim: null
271
+ model_conf: {}
272
+ use_preprocessor: true
273
+ token_type: phn
274
+ bpemodel: null
275
+ non_linguistic_symbols: null
276
+ cleaner: null
277
+ g2p: null
278
+ fs: 44100
279
+ score_feats_extract: syllable_score_feats
280
+ score_feats_extract_conf:
281
+ fs: 44100
282
+ n_fft: 2048
283
+ win_length: 2048
284
+ hop_length: 512
285
+ feats_extract: fbank
286
+ feats_extract_conf:
287
+ n_fft: 2048
288
+ hop_length: 512
289
+ win_length: 2048
290
+ fs: 44100
291
+ fmin: 80
292
+ fmax: 7600
293
+ n_mels: 80
294
+ normalize: global_mvn
295
+ normalize_conf:
296
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
297
+ svs: vits
298
+ svs_conf:
299
+ generator_type: visinger2
300
+ vocoder_generator_type: visinger2
301
+ generator_params:
302
+ hidden_channels: 192
303
+ spks: 40
304
+ global_channels: 256
305
+ segment_size: 20
306
+ text_encoder_attention_heads: 2
307
+ text_encoder_ffn_expand: 4
308
+ text_encoder_blocks: 6
309
+ text_encoder_positionwise_layer_type: conv1d
310
+ text_encoder_positionwise_conv_kernel_size: 3
311
+ text_encoder_positional_encoding_layer_type: rel_pos
312
+ text_encoder_self_attention_layer_type: rel_selfattn
313
+ text_encoder_activation_type: swish
314
+ text_encoder_normalize_before: true
315
+ text_encoder_dropout_rate: 0.1
316
+ text_encoder_positional_dropout_rate: 0.0
317
+ text_encoder_attention_dropout_rate: 0.1
318
+ use_macaron_style_in_text_encoder: true
319
+ use_conformer_conv_in_text_encoder: false
320
+ text_encoder_conformer_kernel_size: -1
321
+ decoder_kernel_size: 7
322
+ decoder_channels: 256
323
+ decoder_upsample_scales:
324
+ - 8
325
+ - 8
326
+ - 4
327
+ - 2
328
+ decoder_upsample_kernel_sizes:
329
+ - 16
330
+ - 16
331
+ - 8
332
+ - 4
333
+ n_harmonic: 64
334
+ decoder_resblock_kernel_sizes:
335
+ - 3
336
+ - 7
337
+ - 11
338
+ decoder_resblock_dilations:
339
+ - - 1
340
+ - 3
341
+ - 5
342
+ - - 1
343
+ - 3
344
+ - 5
345
+ - - 1
346
+ - 3
347
+ - 5
348
+ use_weight_norm_in_decoder: true
349
+ posterior_encoder_kernel_size: 3
350
+ posterior_encoder_layers: 8
351
+ posterior_encoder_stacks: 1
352
+ posterior_encoder_base_dilation: 1
353
+ posterior_encoder_dropout_rate: 0.0
354
+ use_weight_norm_in_posterior_encoder: true
355
+ flow_flows: -1
356
+ flow_kernel_size: 5
357
+ flow_base_dilation: 1
358
+ flow_layers: 4
359
+ flow_dropout_rate: 0.0
360
+ use_weight_norm_in_flow: true
361
+ use_only_mean_in_flow: true
362
+ use_phoneme_predictor: false
363
+ vocabs: 63
364
+ aux_channels: 80
365
+ generator_type: visinger2
366
+ vocoder_generator_type: visinger2
367
+ fs: 44100
368
+ hop_length: 512
369
+ win_length: 2048
370
+ n_fft: 2048
371
+ discriminator_type: visinger2
372
+ discriminator_params:
373
+ scales: 1
374
+ scale_downsample_pooling: AvgPool1d
375
+ scale_downsample_pooling_params:
376
+ kernel_size: 4
377
+ stride: 2
378
+ padding: 2
379
+ scale_discriminator_params:
380
+ in_channels: 1
381
+ out_channels: 1
382
+ kernel_sizes:
383
+ - 15
384
+ - 41
385
+ - 5
386
+ - 3
387
+ channels: 128
388
+ max_downsample_channels: 1024
389
+ max_groups: 256
390
+ bias: true
391
+ downsample_scales:
392
+ - 4
393
+ - 4
394
+ - 4
395
+ - 4
396
+ nonlinear_activation: LeakyReLU
397
+ nonlinear_activation_params:
398
+ negative_slope: 0.1
399
+ use_weight_norm: true
400
+ use_spectral_norm: false
401
+ follow_official_norm: false
402
+ periods:
403
+ - 2
404
+ - 3
405
+ - 5
406
+ - 7
407
+ - 11
408
+ period_discriminator_params:
409
+ in_channels: 1
410
+ out_channels: 1
411
+ kernel_sizes:
412
+ - 5
413
+ - 3
414
+ channels: 32
415
+ downsample_scales:
416
+ - 3
417
+ - 3
418
+ - 3
419
+ - 3
420
+ - 1
421
+ max_downsample_channels: 1024
422
+ bias: true
423
+ nonlinear_activation: LeakyReLU
424
+ nonlinear_activation_params:
425
+ negative_slope: 0.1
426
+ use_weight_norm: true
427
+ use_spectral_norm: false
428
+ multi_freq_disc_params:
429
+ hidden_channels:
430
+ - 256
431
+ - 256
432
+ - 256
433
+ - 256
434
+ - 256
435
+ domain: double
436
+ mel_scale: true
437
+ divisors:
438
+ - 32
439
+ - 16
440
+ - 8
441
+ - 4
442
+ - 2
443
+ - 1
444
+ - 1
445
+ strides:
446
+ - 1
447
+ - 2
448
+ - 1
449
+ - 2
450
+ - 1
451
+ - 2
452
+ - 1
453
+ sample_rate: 44100
454
+ hop_lengths:
455
+ - 110
456
+ - 220
457
+ - 330
458
+ - 441
459
+ - 551
460
+ - 661
461
+ generator_adv_loss_params:
462
+ average_by_discriminators: false
463
+ loss_type: mse
464
+ discriminator_adv_loss_params:
465
+ average_by_discriminators: false
466
+ loss_type: mse
467
+ feat_match_loss_params:
468
+ average_by_discriminators: false
469
+ average_by_layers: false
470
+ include_final_outputs: true
471
+ mel_loss_params:
472
+ fs: 44100
473
+ n_fft: 2048
474
+ hop_length: 512
475
+ win_length: 2048
476
+ window: hann
477
+ n_mels: 80
478
+ fmin: 0
479
+ fmax: 22050
480
+ log_base: null
481
+ lambda_adv: 1.0
482
+ lambda_mel: 45.0
483
+ lambda_feat_match: 2.0
484
+ lambda_dur: 0.1
485
+ lambda_pitch: 10.0
486
+ lambda_phoneme: 1.0
487
+ lambda_kl: 1.0
488
+ sampling_rate: 44100
489
+ cache_generator_outputs: true
490
+ pitch_extract: dio
491
+ pitch_extract_conf:
492
+ use_token_averaged_f0: false
493
+ use_log_f0: false
494
+ fs: 44100
495
+ n_fft: 2048
496
+ hop_length: 512
497
+ f0max: 800
498
+ f0min: 80
499
+ pitch_normalize: null
500
+ pitch_normalize_conf:
501
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
502
+ ying_extract: null
503
+ ying_extract_conf: {}
504
+ energy_extract: null
505
+ energy_extract_conf: {}
506
+ energy_normalize: null
507
+ energy_normalize_conf: {}
508
+ required:
509
+ - output_dir
510
+ - token_list
511
+ version: '202310'
512
+ distributed: false
513
+ ```
514
+
515
+ </details>
516
+
517
+
518
+
519
+ ### Citing ESPnet
520
+
521
+ ```bibtex
522
+ @inproceedings{watanabe2018espnet,
523
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
524
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
525
+ year={2018},
526
+ booktitle={Proceedings of Interspeech},
527
+ pages={2207--2211},
528
+ doi={10.21437/Interspeech.2018-1456},
529
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
530
+ }
531
+
532
+
533
+
534
+
535
+
536
+
537
+ @inproceedings{shi22d_interspeech,
538
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
539
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
540
+ year=2022,
541
+ booktitle={Proc. Interspeech 2022},
542
+ pages={4277--4281},
543
+ doi={10.21437/Interspeech.2022-10039}
544
+ }
545
+ ```
546
+
547
+ or arXiv:
548
+
549
+ ```bibtex
550
+ @misc{watanabe2018espnet,
551
+ title={ESPnet: End-to-End Speech Processing Toolkit},
552
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
553
+ year={2018},
554
+ eprint={1804.00015},
555
+ archivePrefix={arXiv},
556
+ primaryClass={cs.CL}
557
+ }
558
+ ```
dump/raw/org/tr_no_dev/spk2sid ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <unk> 0
2
+ 1 1
3
+ 10 2
4
+ 11 3
5
+ 12 4
6
+ 13 5
7
+ 14 6
8
+ 15 7
9
+ 16 8
10
+ 17 9
11
+ 18 10
12
+ 19 11
13
+ 2 12
14
+ 20 13
15
+ 21 14
16
+ 22 15
17
+ 23 16
18
+ 24 17
19
+ 25 18
20
+ 26 19
21
+ 27 20
22
+ 28 21
23
+ 29 22
24
+ 3 23
25
+ 30 24
26
+ 4 25
27
+ 5 26
28
+ 6 27
29
+ 7 28
30
+ 8 29
31
+ 9 30
exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91ed268c6ea2d7a005f9fd542e21509a3625f5f10b3b4624b7dd2f28f15ee830
3
+ size 1402
exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c47d2ada04809ecaf6335963a737fac108371d7bef101f5f6f9d2a0addf45bfb
3
+ size 770
exp/svs_train_visinger2_40singer_raw_phn_None_zh/500epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4125552c2bbd45e21137dd016bf00e3c1f3ca335eb027e0fef49967c388ee171
3
+ size 448208603
exp/svs_train_visinger2_40singer_raw_phn_None_zh/config.yaml ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_visinger2_40singer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_train_visinger2_40singer_raw_phn_None_zh
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 0
12
+ num_att_plot: 0
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 500
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: 1000
71
+ batch_size: 8
72
+ valid_batch_size: null
73
+ batch_bins: 1000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/svs_stats_raw_phn_None_zh/train/text_shape.phn
77
+ - exp/svs_stats_raw_phn_None_zh/train/singing_shape
78
+ valid_shape_file:
79
+ - exp/svs_stats_raw_phn_None_zh/valid/text_shape.phn
80
+ - exp/svs_stats_raw_phn_None_zh/valid/singing_shape
81
+ batch_type: sorted
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 150
85
+ - 409600
86
+ sort_in_batch: descending
87
+ shuffle_within_batch: false
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 500
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ chunk_excluded_key_prefixes: []
94
+ chunk_default_fs: null
95
+ train_data_path_and_name_and_type:
96
+ - - dump/raw/tr_no_dev/text
97
+ - text
98
+ - text
99
+ - - dump/raw/tr_no_dev/wav.scp
100
+ - singing
101
+ - sound
102
+ - - dump/raw/tr_no_dev/label
103
+ - label
104
+ - duration
105
+ - - dump/raw/tr_no_dev/score.scp
106
+ - score
107
+ - score
108
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/pitch.scp
109
+ - pitch
110
+ - npy
111
+ - - exp/svs_stats_raw_phn_None_zh/train/collect_feats/feats.scp
112
+ - feats
113
+ - npy
114
+ - - dump/raw/tr_no_dev/utt2sid
115
+ - sids
116
+ - text_int
117
+ valid_data_path_and_name_and_type:
118
+ - - dump/raw/dev/text
119
+ - text
120
+ - text
121
+ - - dump/raw/dev/wav.scp
122
+ - singing
123
+ - sound
124
+ - - dump/raw/dev/label
125
+ - label
126
+ - duration
127
+ - - dump/raw/dev/score.scp
128
+ - score
129
+ - score
130
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/pitch.scp
131
+ - pitch
132
+ - npy
133
+ - - exp/svs_stats_raw_phn_None_zh/valid/collect_feats/feats.scp
134
+ - feats
135
+ - npy
136
+ - - dump/raw/dev/utt2sid
137
+ - sids
138
+ - text_int
139
+ allow_variable_data_keys: false
140
+ max_cache_size: 0.0
141
+ max_cache_fd: 32
142
+ allow_multi_rates: false
143
+ valid_max_cache_size: null
144
+ exclude_weight_decay: false
145
+ exclude_weight_decay_conf: {}
146
+ optim: adamw
147
+ optim_conf:
148
+ lr: 0.0002
149
+ betas:
150
+ - 0.8
151
+ - 0.99
152
+ eps: 1.0e-09
153
+ weight_decay: 0.0
154
+ scheduler: exponentiallr
155
+ scheduler_conf:
156
+ gamma: 0.998
157
+ optim2: adamw
158
+ optim2_conf:
159
+ lr: 0.0002
160
+ betas:
161
+ - 0.8
162
+ - 0.99
163
+ eps: 1.0e-09
164
+ weight_decay: 0.0
165
+ scheduler2: exponentiallr
166
+ scheduler2_conf:
167
+ gamma: 0.998
168
+ generator_first: true
169
+ token_list:
170
+ - <blank>
171
+ - <unk>
172
+ - SP
173
+ - i
174
+ - AP
175
+ - e
176
+ - d
177
+ - y
178
+ - w
179
+ - sh
180
+ - ai
181
+ - n
182
+ - x
183
+ - j
184
+ - u
185
+ - ian
186
+ - l
187
+ - h
188
+ - b
189
+ - o
190
+ - zh
191
+ - ou
192
+ - an
193
+ - m
194
+ - q
195
+ - z
196
+ - en
197
+ - g
198
+ - ing
199
+ - ei
200
+ - ao
201
+ - uo
202
+ - ang
203
+ - eng
204
+ - t
205
+ - ong
206
+ - a
207
+ - ui
208
+ - f
209
+ - k
210
+ - r
211
+ - ch
212
+ - v
213
+ - iang
214
+ - in
215
+ - iao
216
+ - ie
217
+ - iu
218
+ - c
219
+ - s
220
+ - van
221
+ - p
222
+ - ve
223
+ - uan
224
+ - uang
225
+ - ia
226
+ - ua
227
+ - uai
228
+ - un
229
+ - er
230
+ - vn
231
+ - iong
232
+ - <sos/eos>
233
+ odim: null
234
+ model_conf: {}
235
+ use_preprocessor: true
236
+ token_type: phn
237
+ bpemodel: null
238
+ non_linguistic_symbols: null
239
+ cleaner: null
240
+ g2p: null
241
+ fs: 44100
242
+ score_feats_extract: syllable_score_feats
243
+ score_feats_extract_conf:
244
+ fs: 44100
245
+ n_fft: 2048
246
+ win_length: 2048
247
+ hop_length: 512
248
+ feats_extract: fbank
249
+ feats_extract_conf:
250
+ n_fft: 2048
251
+ hop_length: 512
252
+ win_length: 2048
253
+ fs: 44100
254
+ fmin: 80
255
+ fmax: 7600
256
+ n_mels: 80
257
+ normalize: global_mvn
258
+ normalize_conf:
259
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/feats_stats.npz
260
+ svs: vits
261
+ svs_conf:
262
+ generator_type: visinger2
263
+ vocoder_generator_type: visinger2
264
+ generator_params:
265
+ hidden_channels: 192
266
+ spks: 40
267
+ global_channels: 256
268
+ segment_size: 20
269
+ text_encoder_attention_heads: 2
270
+ text_encoder_ffn_expand: 4
271
+ text_encoder_blocks: 6
272
+ text_encoder_positionwise_layer_type: conv1d
273
+ text_encoder_positionwise_conv_kernel_size: 3
274
+ text_encoder_positional_encoding_layer_type: rel_pos
275
+ text_encoder_self_attention_layer_type: rel_selfattn
276
+ text_encoder_activation_type: swish
277
+ text_encoder_normalize_before: true
278
+ text_encoder_dropout_rate: 0.1
279
+ text_encoder_positional_dropout_rate: 0.0
280
+ text_encoder_attention_dropout_rate: 0.1
281
+ use_macaron_style_in_text_encoder: true
282
+ use_conformer_conv_in_text_encoder: false
283
+ text_encoder_conformer_kernel_size: -1
284
+ decoder_kernel_size: 7
285
+ decoder_channels: 256
286
+ decoder_upsample_scales:
287
+ - 8
288
+ - 8
289
+ - 4
290
+ - 2
291
+ decoder_upsample_kernel_sizes:
292
+ - 16
293
+ - 16
294
+ - 8
295
+ - 4
296
+ n_harmonic: 64
297
+ decoder_resblock_kernel_sizes:
298
+ - 3
299
+ - 7
300
+ - 11
301
+ decoder_resblock_dilations:
302
+ - - 1
303
+ - 3
304
+ - 5
305
+ - - 1
306
+ - 3
307
+ - 5
308
+ - - 1
309
+ - 3
310
+ - 5
311
+ use_weight_norm_in_decoder: true
312
+ posterior_encoder_kernel_size: 3
313
+ posterior_encoder_layers: 8
314
+ posterior_encoder_stacks: 1
315
+ posterior_encoder_base_dilation: 1
316
+ posterior_encoder_dropout_rate: 0.0
317
+ use_weight_norm_in_posterior_encoder: true
318
+ flow_flows: -1
319
+ flow_kernel_size: 5
320
+ flow_base_dilation: 1
321
+ flow_layers: 4
322
+ flow_dropout_rate: 0.0
323
+ use_weight_norm_in_flow: true
324
+ use_only_mean_in_flow: true
325
+ use_phoneme_predictor: false
326
+ vocabs: 63
327
+ aux_channels: 80
328
+ generator_type: visinger2
329
+ vocoder_generator_type: visinger2
330
+ fs: 44100
331
+ hop_length: 512
332
+ win_length: 2048
333
+ n_fft: 2048
334
+ discriminator_type: visinger2
335
+ discriminator_params:
336
+ scales: 1
337
+ scale_downsample_pooling: AvgPool1d
338
+ scale_downsample_pooling_params:
339
+ kernel_size: 4
340
+ stride: 2
341
+ padding: 2
342
+ scale_discriminator_params:
343
+ in_channels: 1
344
+ out_channels: 1
345
+ kernel_sizes:
346
+ - 15
347
+ - 41
348
+ - 5
349
+ - 3
350
+ channels: 128
351
+ max_downsample_channels: 1024
352
+ max_groups: 256
353
+ bias: true
354
+ downsample_scales:
355
+ - 4
356
+ - 4
357
+ - 4
358
+ - 4
359
+ nonlinear_activation: LeakyReLU
360
+ nonlinear_activation_params:
361
+ negative_slope: 0.1
362
+ use_weight_norm: true
363
+ use_spectral_norm: false
364
+ follow_official_norm: false
365
+ periods:
366
+ - 2
367
+ - 3
368
+ - 5
369
+ - 7
370
+ - 11
371
+ period_discriminator_params:
372
+ in_channels: 1
373
+ out_channels: 1
374
+ kernel_sizes:
375
+ - 5
376
+ - 3
377
+ channels: 32
378
+ downsample_scales:
379
+ - 3
380
+ - 3
381
+ - 3
382
+ - 3
383
+ - 1
384
+ max_downsample_channels: 1024
385
+ bias: true
386
+ nonlinear_activation: LeakyReLU
387
+ nonlinear_activation_params:
388
+ negative_slope: 0.1
389
+ use_weight_norm: true
390
+ use_spectral_norm: false
391
+ multi_freq_disc_params:
392
+ hidden_channels:
393
+ - 256
394
+ - 256
395
+ - 256
396
+ - 256
397
+ - 256
398
+ domain: double
399
+ mel_scale: true
400
+ divisors:
401
+ - 32
402
+ - 16
403
+ - 8
404
+ - 4
405
+ - 2
406
+ - 1
407
+ - 1
408
+ strides:
409
+ - 1
410
+ - 2
411
+ - 1
412
+ - 2
413
+ - 1
414
+ - 2
415
+ - 1
416
+ sample_rate: 44100
417
+ hop_lengths:
418
+ - 110
419
+ - 220
420
+ - 330
421
+ - 441
422
+ - 551
423
+ - 661
424
+ generator_adv_loss_params:
425
+ average_by_discriminators: false
426
+ loss_type: mse
427
+ discriminator_adv_loss_params:
428
+ average_by_discriminators: false
429
+ loss_type: mse
430
+ feat_match_loss_params:
431
+ average_by_discriminators: false
432
+ average_by_layers: false
433
+ include_final_outputs: true
434
+ mel_loss_params:
435
+ fs: 44100
436
+ n_fft: 2048
437
+ hop_length: 512
438
+ win_length: 2048
439
+ window: hann
440
+ n_mels: 80
441
+ fmin: 0
442
+ fmax: 22050
443
+ log_base: null
444
+ lambda_adv: 1.0
445
+ lambda_mel: 45.0
446
+ lambda_feat_match: 2.0
447
+ lambda_dur: 0.1
448
+ lambda_pitch: 10.0
449
+ lambda_phoneme: 1.0
450
+ lambda_kl: 1.0
451
+ sampling_rate: 44100
452
+ cache_generator_outputs: true
453
+ pitch_extract: dio
454
+ pitch_extract_conf:
455
+ use_token_averaged_f0: false
456
+ use_log_f0: false
457
+ fs: 44100
458
+ n_fft: 2048
459
+ hop_length: 512
460
+ f0max: 800
461
+ f0min: 80
462
+ pitch_normalize: null
463
+ pitch_normalize_conf:
464
+ stats_file: exp/svs_stats_raw_phn_None_zh/train/pitch_stats.npz
465
+ ying_extract: null
466
+ ying_extract_conf: {}
467
+ energy_extract: null
468
+ energy_extract_conf: {}
469
+ energy_normalize: null
470
+ energy_normalize_conf: {}
471
+ required:
472
+ - output_dir
473
+ - token_list
474
+ version: '202310'
475
+ distributed: false
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_backward_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_fake_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_forward_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_optim_step_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_real_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/discriminator_train_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_adv_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_backward_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_feat_match_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_forward_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_kl_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_mel_am_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_mel_ddsp_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_mel_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_optim_step_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_phn_dur_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_pitch_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_score_dur_loss.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/generator_train_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/iter_time.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/optim0_lr0.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/optim1_lr0.png ADDED
exp/svs_train_visinger2_40singer_raw_phn_None_zh/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ model_file: exp/svs_train_visinger2_40singer_raw_phn_None_zh/500epoch.pth
4
+ python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
+ timestamp: 1719266563.424163
6
+ torch: 1.13.1+cu117
7
+ yaml_files:
8
+ train_config: exp/svs_train_visinger2_40singer_raw_phn_None_zh/config.yaml