ftshijt committed
Commit b947639
1 Parent(s): 7aba46f

Update model

Files changed (32)
  1. 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz +3 -0
  2. 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz +3 -0
  3. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/82epoch.pth +3 -0
  4. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/config.yaml +435 -0
  5. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_backward_time.png +0 -0
  6. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_fake_loss.png +0 -0
  7. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_forward_time.png +0 -0
  8. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_loss.png +0 -0
  9. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_optim_step_time.png +0 -0
  10. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_real_loss.png +0 -0
  11. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_train_time.png +0 -0
  12. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_adv_loss.png +0 -0
  13. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_backward_time.png +0 -0
  14. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_feat_match_loss.png +0 -0
  15. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_forward_time.png +0 -0
  16. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_kl_loss.png +0 -0
  17. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_loss.png +0 -0
  18. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_mel_am_loss.png +0 -0
  19. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_mel_ddsp_loss.png +0 -0
  20. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_mel_loss.png +0 -0
  21. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_optim_step_time.png +0 -0
  22. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_phn_dur_loss.png +0 -0
  23. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_pitch_loss.png +0 -0
  24. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_score_dur_loss.png +0 -0
  25. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_train_time.png +0 -0
  26. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/gpu_max_cached_mem_GB.png +0 -0
  27. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/iter_time.png +0 -0
  28. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/optim0_lr0.png +0 -0
  29. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/optim1_lr0.png +0 -0
  30. 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/train_time.png +0 -0
  31. README.md +516 -1
  32. meta.yaml +8 -0
44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99a96648d1332ac0d9b85e1f043a355455804a24f6aaa97364e4700b9e645f53
+ size 1402
44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:34002be3f9f99fa6353666d14f1210ddb72879299e386e2631ec7a09b5f1d9a9
+ size 770
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/82epoch.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2c10508e79a6e438e517857087e7c5f72970ea0e9a7805e3eb5e7a634bdbcbad
+ size 443920523
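
The three entries above are Git LFS pointer files: each records only the spec version, a sha256 object id, and the byte size of the real artifact, which is fetched separately (for example with `git lfs pull`). A minimal sketch, assuming the artifacts have already been downloaded locally, of checking a file against its pointer:

```python
# Minimal sketch (not part of the repo): verify a locally fetched artifact
# against the sha256 oid and byte size recorded in its Git LFS pointer.
import hashlib
from pathlib import Path

def verify_lfs_object(path: str, expected_oid: str, expected_size: int) -> bool:
    """Return True if the file's size and sha256 digest match the pointer."""
    p = Path(path)
    if p.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# Values copied from the 82epoch.pth pointer above; the local path assumes the
# repository has been cloned and `git lfs pull` has been run.
print(verify_lfs_object(
    "44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/82epoch.pth",
    "2c10508e79a6e438e517857087e7c5f72970ea0e9a7805e3eb5e7a634bdbcbad",
    443920523,
))
```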
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/config.yaml ADDED
@@ -0,0 +1,435 @@
+ config: conf/tuning/train_visinger2.yaml
+ print_config: false
+ log_level: INFO
+ drop_last_iter: false
+ dry_run: false
+ iterator_type: sequence
+ valid_iterator_type: null
+ output_dir: 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp
+ ngpu: 1
+ seed: 777
+ num_workers: 4
+ num_att_plot: 3
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: null
+ dist_rank: null
+ local_rank: 0
+ dist_master_addr: null
+ dist_master_port: null
+ dist_launcher: null
+ multiprocessing_distributed: false
+ unused_parameters: true
+ sharded_ddp: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: false
+ collect_stats: false
+ write_collected_feats: false
+ max_epoch: 500
+ patience: null
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - train
+ - total_count
+ - max
+ keep_nbest_models: 10
+ nbest_averaging_interval: 0
+ grad_clip: -1
+ grad_clip_type: 2.0
+ grad_noise: false
+ accum_grad: 1
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ use_amp: false
+ log_interval: 50
+ use_matplotlib: true
+ use_tensorboard: true
+ create_graph_in_tensorboard: false
+ use_wandb: false
+ wandb_project: null
+ wandb_id: null
+ wandb_entity: null
+ wandb_name: null
+ wandb_model_log_interval: -1
+ detect_anomaly: false
+ use_lora: false
+ save_lora_only: true
+ lora_conf: {}
+ pretrain_path: null
+ init_param: []
+ ignore_init_mismatch: false
+ freeze_param: []
+ num_iters_per_epoch: 1000
+ batch_size: 8
+ valid_batch_size: null
+ batch_bins: 1000000
+ valid_batch_bins: null
+ train_shape_file:
+ - 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/text_shape.phn
+ - 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/singing_shape
+ valid_shape_file:
+ - 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/valid/text_shape.phn
+ - 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/valid/singing_shape
+ batch_type: sorted
+ valid_batch_type: null
+ fold_length:
+ - 150
+ - 409600
+ sort_in_batch: descending
+ shuffle_within_batch: false
+ sort_batch: descending
+ multiple_iterator: false
+ chunk_length: 500
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 1024
+ chunk_excluded_key_prefixes: []
+ chunk_default_fs: null
+ train_data_path_and_name_and_type:
+ - - 44kdump/raw/tr_no_dev/text
+ - text
+ - text
+ - - 44kdump/raw/tr_no_dev/wav.scp
+ - singing
+ - sound
+ - - 44kdump/raw/tr_no_dev/label
+ - label
+ - duration
+ - - 44kdump/raw/tr_no_dev/score.scp
+ - score
+ - score
+ valid_data_path_and_name_and_type:
+ - - 44kdump/raw/dev/text
+ - text
+ - text
+ - - 44kdump/raw/dev/wav.scp
+ - singing
+ - sound
+ - - 44kdump/raw/dev/label
+ - label
+ - duration
+ - - 44kdump/raw/dev/score.scp
+ - score
+ - score
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ max_cache_fd: 32
+ allow_multi_rates: false
+ valid_max_cache_size: null
+ exclude_weight_decay: false
+ exclude_weight_decay_conf: {}
+ optim: adamw
+ optim_conf:
+ lr: 0.0002
+ betas:
+ - 0.8
+ - 0.99
+ eps: 1.0e-09
+ weight_decay: 0.0
+ scheduler: exponentiallr
+ scheduler_conf:
+ gamma: 0.998
+ optim2: adamw
+ optim2_conf:
+ lr: 0.0002
+ betas:
+ - 0.8
+ - 0.99
+ eps: 1.0e-09
+ weight_decay: 0.0
+ scheduler2: exponentiallr
+ scheduler2_conf:
+ gamma: 0.998
+ generator_first: false
+ token_list:
+ - <blank>
+ - <unk>
+ - pau
+ - a
+ - o
+ - i
+ - u
+ - e
+ - k
+ - n
+ - r
+ - m
+ - t
+ - N
+ - s
+ - w
+ - y
+ - sh
+ - g
+ - d
+ - ch
+ - b
+ - ts
+ - p
+ - z
+ - h
+ - f
+ - j
+ - cl
+ - ry
+ - ky
+ - gy
+ - ny
+ - hy
+ - my
+ - v
+ - by
+ - py
+ - ty
+ - dy
+ - <sos/eos>
+ odim: null
+ model_conf: {}
+ use_preprocessor: true
+ token_type: phn
+ bpemodel: null
+ non_linguistic_symbols: null
+ cleaner: null
+ g2p: pyopenjtalk
+ fs: 44100
+ score_feats_extract: syllable_score_feats
+ score_feats_extract_conf:
+ fs: 44100
+ n_fft: 2048
+ win_length: 2048
+ hop_length: 512
+ feats_extract: fbank
+ feats_extract_conf:
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ fs: 44100
+ fmin: 80
+ fmax: 22050
+ n_mels: 80
+ normalize: global_mvn
+ normalize_conf:
+ stats_file: 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
+ svs: vits
+ svs_conf:
+ generator_type: visinger2
+ vocoder_generator_type: visinger2
+ generator_params:
+ hidden_channels: 192
+ spks: -1
+ global_channels: -1
+ segment_size: 20
+ text_encoder_attention_heads: 2
+ text_encoder_ffn_expand: 4
+ text_encoder_blocks: 6
+ text_encoder_positionwise_layer_type: conv1d
+ text_encoder_positionwise_conv_kernel_size: 3
+ text_encoder_positional_encoding_layer_type: rel_pos
+ text_encoder_self_attention_layer_type: rel_selfattn
+ text_encoder_activation_type: swish
+ text_encoder_normalize_before: true
+ text_encoder_dropout_rate: 0.1
+ text_encoder_positional_dropout_rate: 0.0
+ text_encoder_attention_dropout_rate: 0.1
+ use_macaron_style_in_text_encoder: true
+ use_conformer_conv_in_text_encoder: false
+ text_encoder_conformer_kernel_size: -1
+ decoder_kernel_size: 7
+ decoder_channels: 256
+ decoder_upsample_scales:
+ - 8
+ - 8
+ - 4
+ - 2
+ decoder_upsample_kernel_sizes:
+ - 16
+ - 16
+ - 8
+ - 4
+ n_harmonic: 64
+ decoder_resblock_kernel_sizes:
+ - 3
+ - 7
+ - 11
+ decoder_resblock_dilations:
+ - - 1
+ - 3
+ - 5
+ - - 1
+ - 3
+ - 5
+ - - 1
+ - 3
+ - 5
+ use_weight_norm_in_decoder: true
+ posterior_encoder_kernel_size: 3
+ posterior_encoder_layers: 8
+ posterior_encoder_stacks: 1
+ posterior_encoder_base_dilation: 1
+ posterior_encoder_dropout_rate: 0.0
+ use_weight_norm_in_posterior_encoder: true
+ flow_flows: -1
+ flow_kernel_size: 5
+ flow_base_dilation: 1
+ flow_layers: 4
+ flow_dropout_rate: 0.0
+ use_weight_norm_in_flow: true
+ use_only_mean_in_flow: true
+ use_phoneme_predictor: false
+ vocabs: 41
+ aux_channels: 80
+ generator_type: visinger2
+ vocoder_generator_type: visinger2
+ fs: 44100
+ hop_length: 512
+ win_length: 2048
+ n_fft: 2048
+ discriminator_type: visinger2
+ discriminator_params:
+ scales: 1
+ scale_downsample_pooling: AvgPool1d
+ scale_downsample_pooling_params:
+ kernel_size: 4
+ stride: 2
+ padding: 2
+ scale_discriminator_params:
+ in_channels: 1
+ out_channels: 1
+ kernel_sizes:
+ - 15
+ - 41
+ - 5
+ - 3
+ channels: 128
+ max_downsample_channels: 1024
+ max_groups: 256
+ bias: true
+ downsample_scales:
+ - 4
+ - 4
+ - 4
+ - 4
+ nonlinear_activation: LeakyReLU
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ use_weight_norm: true
+ use_spectral_norm: false
+ follow_official_norm: false
+ periods:
+ - 2
+ - 3
+ - 5
+ - 7
+ - 11
+ period_discriminator_params:
+ in_channels: 1
+ out_channels: 1
+ kernel_sizes:
+ - 5
+ - 3
+ channels: 32
+ downsample_scales:
+ - 3
+ - 3
+ - 3
+ - 3
+ - 1
+ max_downsample_channels: 1024
+ bias: true
+ nonlinear_activation: LeakyReLU
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ use_weight_norm: true
+ use_spectral_norm: false
+ multi_freq_disc_params:
+ hidden_channels:
+ - 256
+ - 256
+ - 256
+ - 256
+ - 256
+ domain: double
+ mel_scale: true
+ divisors:
+ - 32
+ - 16
+ - 8
+ - 4
+ - 2
+ - 1
+ - 1
+ strides:
+ - 1
+ - 2
+ - 1
+ - 2
+ - 1
+ - 2
+ - 1
+ sample_rate: 44100
+ hop_lengths:
+ - 110
+ - 220
+ - 330
+ - 441
+ - 551
+ - 661
+ generator_adv_loss_params:
+ average_by_discriminators: false
+ loss_type: mse
+ discriminator_adv_loss_params:
+ average_by_discriminators: false
+ loss_type: mse
+ feat_match_loss_params:
+ average_by_discriminators: false
+ average_by_layers: false
+ include_final_outputs: true
+ mel_loss_params:
+ fs: 44100
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ window: hann
+ n_mels: 80
+ fmin: 0
+ fmax: 22050
+ log_base: null
+ lambda_adv: 1.0
+ lambda_mel: 45.0
+ lambda_feat_match: 2.0
+ lambda_dur: 0.1
+ lambda_pitch: 10.0
+ lambda_phoneme: 1.0
+ lambda_kl: 1.0
+ sampling_rate: 44100
+ cache_generator_outputs: true
+ pitch_extract: dio
+ pitch_extract_conf:
+ use_token_averaged_f0: false
+ use_log_f0: false
+ fs: 44100
+ n_fft: 2048
+ hop_length: 512
+ f0max: 800
+ f0min: 80
+ pitch_normalize: null
+ pitch_normalize_conf:
+ stats_file: 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz
+ ying_extract: null
+ ying_extract_conf: {}
+ energy_extract: null
+ energy_extract_conf: {}
+ energy_normalize: null
+ energy_normalize_conf: {}
+ required:
+ - output_dir
+ - token_list
+ version: '202310'
+ distributed: false
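
The file above is the full ESPnet2 training configuration exported with the model (44.1 kHz fbank features, VISinger2 generator/vocoder/discriminator, pyopenjtalk phoneme tokens). A minimal sketch, assuming PyYAML is installed and the repository has been cloned so the path below resolves, of loading it and reading a few of the fields shown:

```python
# Minimal sketch: inspect the exported training config with PyYAML.
# The local path is an assumption (repo checked out in the working directory).
import yaml

with open("44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["svs"])                                          # 'vits' task wrapper
print(cfg["svs_conf"]["generator_type"])                   # 'visinger2'
print(cfg["fs"], cfg["feats_extract_conf"]["hop_length"])  # 44100 512
print(len(cfg["token_list"]))                              # 41 phoneme tokens
```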
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_backward_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_fake_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_forward_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_optim_step_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_real_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/discriminator_train_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_adv_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_backward_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_feat_match_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_forward_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_kl_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_mel_am_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_mel_ddsp_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_mel_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_optim_step_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_phn_dur_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_pitch_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_score_dur_loss.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/generator_train_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/gpu_max_cached_mem_GB.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/iter_time.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/optim0_lr0.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/optim1_lr0.png ADDED
44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/images/train_time.png ADDED
README.md CHANGED
@@ -1,3 +1,518 @@
  ---
- license: apache-2.0
+ tags:
+ - espnet
+ - audio
+ - singing-voice-synthesis
+ language: jp
+ datasets:
+ - oniku_kurumi_utagoe_db
+ license: cc-by-4.0
  ---
+
+ ## ESPnet2 SVS model
+
+ ### `espnet/oniku_kurumi_utagoe_db_svs_visinger2`
+
+ This model was trained by ftshijt using oniku_kurumi_utagoe_db recipe in [espnet](https://github.com/espnet/espnet/).
+
+ ### Demo: How to use in ESPnet2
+
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
+ if you haven't done that already.
+
+ ```bash
+ cd espnet
+ git checkout 5c4d7cf7feba8461de2e1080bf82182f0efaef38
+ pip install -e .
+ cd egs2/oniku_kurumi_utagoe_db/svs1
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/oniku_kurumi_utagoe_db_svs_visinger2
+ ```
+
+
+
+ ## SVS config
+
+ <details><summary>expand</summary>
+
+ ```
+ config: conf/tuning/train_visinger2.yaml
+ print_config: false
+ log_level: INFO
+ drop_last_iter: false
+ dry_run: false
+ iterator_type: sequence
+ valid_iterator_type: null
+ output_dir: 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp
+ ngpu: 1
+ seed: 777
+ num_workers: 4
+ num_att_plot: 3
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: null
+ dist_rank: null
+ local_rank: 0
+ dist_master_addr: null
+ dist_master_port: null
+ dist_launcher: null
+ multiprocessing_distributed: false
+ unused_parameters: true
+ sharded_ddp: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: false
+ collect_stats: false
+ write_collected_feats: false
+ max_epoch: 500
+ patience: null
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - train
+ - total_count
+ - max
+ keep_nbest_models: 10
+ nbest_averaging_interval: 0
+ grad_clip: -1
+ grad_clip_type: 2.0
+ grad_noise: false
+ accum_grad: 1
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ use_amp: false
+ log_interval: 50
+ use_matplotlib: true
+ use_tensorboard: true
+ create_graph_in_tensorboard: false
+ use_wandb: false
+ wandb_project: null
+ wandb_id: null
+ wandb_entity: null
+ wandb_name: null
+ wandb_model_log_interval: -1
+ detect_anomaly: false
+ use_lora: false
+ save_lora_only: true
+ lora_conf: {}
+ pretrain_path: null
+ init_param: []
+ ignore_init_mismatch: false
+ freeze_param: []
+ num_iters_per_epoch: 1000
+ batch_size: 8
+ valid_batch_size: null
+ batch_bins: 1000000
+ valid_batch_bins: null
+ train_shape_file:
+ - 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/text_shape.phn
+ - 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/singing_shape
+ valid_shape_file:
+ - 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/valid/text_shape.phn
+ - 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/valid/singing_shape
+ batch_type: sorted
+ valid_batch_type: null
+ fold_length:
+ - 150
+ - 409600
+ sort_in_batch: descending
+ shuffle_within_batch: false
+ sort_batch: descending
+ multiple_iterator: false
+ chunk_length: 500
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 1024
+ chunk_excluded_key_prefixes: []
+ chunk_default_fs: null
+ train_data_path_and_name_and_type:
+ - - 44kdump/raw/tr_no_dev/text
+ - text
+ - text
+ - - 44kdump/raw/tr_no_dev/wav.scp
+ - singing
+ - sound
+ - - 44kdump/raw/tr_no_dev/label
+ - label
+ - duration
+ - - 44kdump/raw/tr_no_dev/score.scp
+ - score
+ - score
+ valid_data_path_and_name_and_type:
+ - - 44kdump/raw/dev/text
+ - text
+ - text
+ - - 44kdump/raw/dev/wav.scp
+ - singing
+ - sound
+ - - 44kdump/raw/dev/label
+ - label
+ - duration
+ - - 44kdump/raw/dev/score.scp
+ - score
+ - score
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ max_cache_fd: 32
+ allow_multi_rates: false
+ valid_max_cache_size: null
+ exclude_weight_decay: false
+ exclude_weight_decay_conf: {}
+ optim: adamw
+ optim_conf:
+ lr: 0.0002
+ betas:
+ - 0.8
+ - 0.99
+ eps: 1.0e-09
+ weight_decay: 0.0
+ scheduler: exponentiallr
+ scheduler_conf:
+ gamma: 0.998
+ optim2: adamw
+ optim2_conf:
+ lr: 0.0002
+ betas:
+ - 0.8
+ - 0.99
+ eps: 1.0e-09
+ weight_decay: 0.0
+ scheduler2: exponentiallr
+ scheduler2_conf:
+ gamma: 0.998
+ generator_first: false
+ token_list:
+ - <blank>
+ - <unk>
+ - pau
+ - a
+ - o
+ - i
+ - u
+ - e
+ - k
+ - n
+ - r
+ - m
+ - t
+ - N
+ - s
+ - w
+ - y
+ - sh
+ - g
+ - d
+ - ch
+ - b
+ - ts
+ - p
+ - z
+ - h
+ - f
+ - j
+ - cl
+ - ry
+ - ky
+ - gy
+ - ny
+ - hy
+ - my
+ - v
+ - by
+ - py
+ - ty
+ - dy
+ - <sos/eos>
+ odim: null
+ model_conf: {}
+ use_preprocessor: true
+ token_type: phn
+ bpemodel: null
+ non_linguistic_symbols: null
+ cleaner: null
+ g2p: pyopenjtalk
+ fs: 44100
+ score_feats_extract: syllable_score_feats
+ score_feats_extract_conf:
+ fs: 44100
+ n_fft: 2048
+ win_length: 2048
+ hop_length: 512
+ feats_extract: fbank
+ feats_extract_conf:
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ fs: 44100
+ fmin: 80
+ fmax: 22050
+ n_mels: 80
+ normalize: global_mvn
+ normalize_conf:
+ stats_file: 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
+ svs: vits
+ svs_conf:
+ generator_type: visinger2
+ vocoder_generator_type: visinger2
+ generator_params:
+ hidden_channels: 192
+ spks: -1
+ global_channels: -1
+ segment_size: 20
+ text_encoder_attention_heads: 2
+ text_encoder_ffn_expand: 4
+ text_encoder_blocks: 6
+ text_encoder_positionwise_layer_type: conv1d
+ text_encoder_positionwise_conv_kernel_size: 3
+ text_encoder_positional_encoding_layer_type: rel_pos
+ text_encoder_self_attention_layer_type: rel_selfattn
+ text_encoder_activation_type: swish
+ text_encoder_normalize_before: true
+ text_encoder_dropout_rate: 0.1
+ text_encoder_positional_dropout_rate: 0.0
+ text_encoder_attention_dropout_rate: 0.1
+ use_macaron_style_in_text_encoder: true
+ use_conformer_conv_in_text_encoder: false
+ text_encoder_conformer_kernel_size: -1
+ decoder_kernel_size: 7
+ decoder_channels: 256
+ decoder_upsample_scales:
+ - 8
+ - 8
+ - 4
+ - 2
+ decoder_upsample_kernel_sizes:
+ - 16
+ - 16
+ - 8
+ - 4
+ n_harmonic: 64
+ decoder_resblock_kernel_sizes:
+ - 3
+ - 7
+ - 11
+ decoder_resblock_dilations:
+ - - 1
+ - 3
+ - 5
+ - - 1
+ - 3
+ - 5
+ - - 1
+ - 3
+ - 5
+ use_weight_norm_in_decoder: true
+ posterior_encoder_kernel_size: 3
+ posterior_encoder_layers: 8
+ posterior_encoder_stacks: 1
+ posterior_encoder_base_dilation: 1
+ posterior_encoder_dropout_rate: 0.0
+ use_weight_norm_in_posterior_encoder: true
+ flow_flows: -1
+ flow_kernel_size: 5
+ flow_base_dilation: 1
+ flow_layers: 4
+ flow_dropout_rate: 0.0
+ use_weight_norm_in_flow: true
+ use_only_mean_in_flow: true
+ use_phoneme_predictor: false
+ vocabs: 41
+ aux_channels: 80
+ generator_type: visinger2
+ vocoder_generator_type: visinger2
+ fs: 44100
+ hop_length: 512
+ win_length: 2048
+ n_fft: 2048
+ discriminator_type: visinger2
+ discriminator_params:
+ scales: 1
+ scale_downsample_pooling: AvgPool1d
+ scale_downsample_pooling_params:
+ kernel_size: 4
+ stride: 2
+ padding: 2
+ scale_discriminator_params:
+ in_channels: 1
+ out_channels: 1
+ kernel_sizes:
+ - 15
+ - 41
+ - 5
+ - 3
+ channels: 128
+ max_downsample_channels: 1024
+ max_groups: 256
+ bias: true
+ downsample_scales:
+ - 4
+ - 4
+ - 4
+ - 4
+ nonlinear_activation: LeakyReLU
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ use_weight_norm: true
+ use_spectral_norm: false
+ follow_official_norm: false
+ periods:
+ - 2
+ - 3
+ - 5
+ - 7
+ - 11
+ period_discriminator_params:
+ in_channels: 1
+ out_channels: 1
+ kernel_sizes:
+ - 5
+ - 3
+ channels: 32
+ downsample_scales:
+ - 3
+ - 3
+ - 3
+ - 3
+ - 1
+ max_downsample_channels: 1024
+ bias: true
+ nonlinear_activation: LeakyReLU
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ use_weight_norm: true
+ use_spectral_norm: false
+ multi_freq_disc_params:
+ hidden_channels:
+ - 256
+ - 256
+ - 256
+ - 256
+ - 256
+ domain: double
+ mel_scale: true
+ divisors:
+ - 32
+ - 16
+ - 8
+ - 4
+ - 2
+ - 1
+ - 1
+ strides:
+ - 1
+ - 2
+ - 1
+ - 2
+ - 1
+ - 2
+ - 1
+ sample_rate: 44100
+ hop_lengths:
+ - 110
+ - 220
+ - 330
+ - 441
+ - 551
+ - 661
+ generator_adv_loss_params:
+ average_by_discriminators: false
+ loss_type: mse
+ discriminator_adv_loss_params:
+ average_by_discriminators: false
+ loss_type: mse
+ feat_match_loss_params:
+ average_by_discriminators: false
+ average_by_layers: false
+ include_final_outputs: true
+ mel_loss_params:
+ fs: 44100
+ n_fft: 2048
+ hop_length: 512
+ win_length: 2048
+ window: hann
+ n_mels: 80
+ fmin: 0
+ fmax: 22050
+ log_base: null
+ lambda_adv: 1.0
+ lambda_mel: 45.0
+ lambda_feat_match: 2.0
+ lambda_dur: 0.1
+ lambda_pitch: 10.0
+ lambda_phoneme: 1.0
+ lambda_kl: 1.0
+ sampling_rate: 44100
+ cache_generator_outputs: true
+ pitch_extract: dio
+ pitch_extract_conf:
+ use_token_averaged_f0: false
+ use_log_f0: false
+ fs: 44100
+ n_fft: 2048
+ hop_length: 512
+ f0max: 800
+ f0min: 80
+ pitch_normalize: null
+ pitch_normalize_conf:
+ stats_file: 44kexp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz
+ ying_extract: null
+ ying_extract_conf: {}
+ energy_extract: null
+ energy_extract_conf: {}
+ energy_normalize: null
+ energy_normalize_conf: {}
+ required:
+ - output_dir
+ - token_list
+ version: '202310'
+ distributed: false
+ ```
+
+ </details>
+
+
+
+ ### Citing ESPnet
+
+ ```BibTex
+ @inproceedings{watanabe2018espnet,
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
+ year={2018},
+ booktitle={Proceedings of Interspeech},
+ pages={2207--2211},
+ doi={10.21437/Interspeech.2018-1456},
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
+ }
+
+ @inproceedings{shi22d_interspeech,
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
+ year=2022,
+ booktitle={Proc. Interspeech 2022},
+ pages={4277--4281},
+ doi={10.21437/Interspeech.2022-10039}
+ }
+ ```
+
+ or arXiv:
+
+ ```bibtex
+ @misc{watanabe2018espnet,
+ title={ESPnet: End-to-End Speech Processing Toolkit},
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+ year={2018},
+ eprint={1804.00015},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }
+ ```
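
The README's demo section drives everything through the recipe's `run.sh`. As an alternative, a hedged sketch of fetching the packed model programmatically with `espnet_model_zoo`; whether the zoo index resolves this exact model name (`espnet/oniku_kurumi_utagoe_db_svs_visinger2`, taken from the README) is an assumption:

```python
# Hedged sketch: download and unpack the packed model via espnet_model_zoo.
# The model name is taken from the README; resolution of SVS models through
# the zoo is an assumption, not something confirmed by this commit.
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
# download_and_unpack returns a dict of paths/kwargs such as
# "train_config" and "model_file" for the unpacked archive.
unpacked = d.download_and_unpack("espnet/oniku_kurumi_utagoe_db_svs_visinger2")
for key, value in unpacked.items():
    print(key, "->", value)
```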
meta.yaml ADDED
@@ -0,0 +1,8 @@
+ espnet: '202310'
+ files:
+ model_file: 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/82epoch.pth
+ python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
+ timestamp: 1703470450.02486
+ torch: 1.13.1+cu117
+ yaml_files:
+ train_config: 44kexp/svs_train_visinger2_raw_phn_pyopenjtalk_jp/config.yaml
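
meta.yaml ties the upload together: it records the ESPnet and torch versions and points at the checkpoint and training config by relative path. A minimal sketch, assuming `huggingface_hub` and PyYAML are installed and that the repo id below (taken from the README) is where this commit lives, of resolving those paths from a downloaded snapshot:

```python
# Minimal sketch: snapshot the repo and resolve the files meta.yaml points at.
# The repo id is an assumption taken from the README above.
import os
import yaml
from huggingface_hub import snapshot_download

repo_dir = snapshot_download(repo_id="espnet/oniku_kurumi_utagoe_db_svs_visinger2")

with open(os.path.join(repo_dir, "meta.yaml")) as f:
    meta = yaml.safe_load(f)

model_file = os.path.join(repo_dir, meta["files"]["model_file"])
train_config = os.path.join(repo_dir, meta["yaml_files"]["train_config"])
print(meta["espnet"], meta["torch"])                        # '202310' 1.13.1+cu117
print(os.path.exists(model_file), os.path.exists(train_config))
```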