ESPnet
jp
audio
singing-voice-synthesis
ftshijt committed on
Commit
a2a617c
1 Parent(s): 576d9b9

Update model

Browse files
Files changed (30) hide show
  1. README.md +20 -20
  2. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/191epoch.pth +3 -0
  3. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/config.yaml +428 -0
  4. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_backward_time.png +0 -0
  5. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_fake_loss.png +0 -0
  6. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_forward_time.png +0 -0
  7. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_loss.png +0 -0
  8. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_optim_step_time.png +0 -0
  9. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_real_loss.png +0 -0
  10. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_train_time.png +0 -0
  11. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_adv_loss.png +0 -0
  12. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_backward_time.png +0 -0
  13. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_feat_match_loss.png +0 -0
  14. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_forward_time.png +0 -0
  15. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_kl_loss.png +0 -0
  16. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_loss.png +0 -0
  17. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_mel_am_loss.png +0 -0
  18. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_mel_ddsp_loss.png +0 -0
  19. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_mel_loss.png +0 -0
  20. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_optim_step_time.png +0 -0
  21. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_phn_dur_loss.png +0 -0
  22. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_pitch_loss.png +0 -0
  23. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_score_dur_loss.png +0 -0
  24. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_train_time.png +0 -0
  25. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/gpu_max_cached_mem_GB.png +0 -0
  26. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/iter_time.png +0 -0
  27. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/optim0_lr0.png +0 -0
  28. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/optim1_lr0.png +0 -0
  29. exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/train_time.png +0 -0
  30. meta.yaml +3 -3
README.md CHANGED
@@ -35,17 +35,17 @@ cd egs2/kiritan/svs1
35
  <details><summary>expand</summary>
36
 
37
  ```
38
- config: conf/tuning/train_visinger_24.yaml
39
  print_config: false
40
  log_level: INFO
41
  drop_last_iter: false
42
  dry_run: false
43
  iterator_type: sequence
44
  valid_iterator_type: null
45
- output_dir: exp/svs_train_visinger_24_raw_phn_pyopenjtalk_jp
46
  ngpu: 1
47
  seed: 777
48
- num_workers: 2
49
  num_att_plot: 3
50
  dist_backend: nccl
51
  dist_init_method: env://
@@ -105,7 +105,7 @@ init_param: []
105
  ignore_init_mismatch: false
106
  freeze_param: []
107
  num_iters_per_epoch: 1000
108
- batch_size: 4
109
  valid_batch_size: null
110
  batch_bins: 1000000
111
  valid_batch_bins: null
@@ -184,7 +184,7 @@ optim2_conf:
184
  scheduler2: exponentiallr
185
  scheduler2_conf:
186
  gamma: 0.998
187
- generator_first: false
188
  token_list:
189
  - <blank>
190
  - <unk>
@@ -250,12 +250,11 @@ normalize_conf:
250
  stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
251
  svs: vits
252
  svs_conf:
253
- generator_type: visinger
254
- vocoder_generator_type: hifigan
255
  generator_params:
256
  hidden_channels: 192
257
  spks: -1
258
- global_channels: -1
259
  segment_size: 20
260
  text_encoder_attention_heads: 2
261
  text_encoder_ffn_expand: 4
@@ -273,7 +272,7 @@ svs_conf:
273
  use_conformer_conv_in_text_encoder: false
274
  text_encoder_conformer_kernel_size: -1
275
  decoder_kernel_size: 7
276
- decoder_channels: 512
277
  decoder_upsample_scales:
278
  - 5
279
  - 5
@@ -284,6 +283,7 @@ svs_conf:
284
  - 10
285
  - 8
286
  - 6
 
287
  decoder_resblock_kernel_sizes:
288
  - 3
289
  - 7
@@ -315,11 +315,11 @@ svs_conf:
315
  use_phoneme_predictor: false
316
  vocabs: 35
317
  aux_channels: 80
318
- generator_type: visinger
319
- vocoder_generator_type: hifigan
320
  fs: 24000
321
  hop_length: 300
322
- win_length: 1024
323
  n_fft: 2048
324
  discriminator_type: visinger2
325
  discriminator_params:
@@ -405,12 +405,12 @@ svs_conf:
405
  - 1
406
  sample_rate: 24000
407
  hop_lengths:
408
- - 110
409
- - 220
410
- - 330
411
- - 441
412
- - 551
413
- - 661
414
  generator_adv_loss_params:
415
  average_by_discriminators: false
416
  loss_type: mse
@@ -425,11 +425,11 @@ svs_conf:
425
  fs: 24000
426
  n_fft: 2048
427
  hop_length: 300
428
- win_length: 1024
429
  window: hann
430
  n_mels: 80
431
  fmin: 0
432
- fmax: 7600
433
  log_base: null
434
  lambda_adv: 1.0
435
  lambda_mel: 45.0
 
35
  <details><summary>expand</summary>
36
 
37
  ```
38
+ config: conf/tuning/train_visinger2_24.yaml
39
  print_config: false
40
  log_level: INFO
41
  drop_last_iter: false
42
  dry_run: false
43
  iterator_type: sequence
44
  valid_iterator_type: null
45
+ output_dir: exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp
46
  ngpu: 1
47
  seed: 777
48
+ num_workers: 4
49
  num_att_plot: 3
50
  dist_backend: nccl
51
  dist_init_method: env://
 
105
  ignore_init_mismatch: false
106
  freeze_param: []
107
  num_iters_per_epoch: 1000
108
+ batch_size: 8
109
  valid_batch_size: null
110
  batch_bins: 1000000
111
  valid_batch_bins: null
 
184
  scheduler2: exponentiallr
185
  scheduler2_conf:
186
  gamma: 0.998
187
+ generator_first: true
188
  token_list:
189
  - <blank>
190
  - <unk>
 
250
  stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
251
  svs: vits
252
  svs_conf:
253
+ generator_type: visinger2
254
+ vocoder_generator_type: visinger2
255
  generator_params:
256
  hidden_channels: 192
257
  spks: -1
 
258
  segment_size: 20
259
  text_encoder_attention_heads: 2
260
  text_encoder_ffn_expand: 4
 
272
  use_conformer_conv_in_text_encoder: false
273
  text_encoder_conformer_kernel_size: -1
274
  decoder_kernel_size: 7
275
+ decoder_channels: 256
276
  decoder_upsample_scales:
277
  - 5
278
  - 5
 
283
  - 10
284
  - 8
285
  - 6
286
+ n_harmonic: 64
287
  decoder_resblock_kernel_sizes:
288
  - 3
289
  - 7
 
315
  use_phoneme_predictor: false
316
  vocabs: 35
317
  aux_channels: 80
318
+ generator_type: visinger2
319
+ vocoder_generator_type: visinger2
320
  fs: 24000
321
  hop_length: 300
322
+ win_length: 1200
323
  n_fft: 2048
324
  discriminator_type: visinger2
325
  discriminator_params:
 
405
  - 1
406
  sample_rate: 24000
407
  hop_lengths:
408
+ - 60
409
+ - 120
410
+ - 180
411
+ - 240
412
+ - 300
413
+ - 360
414
  generator_adv_loss_params:
415
  average_by_discriminators: false
416
  loss_type: mse
 
425
  fs: 24000
426
  n_fft: 2048
427
  hop_length: 300
428
+ win_length: 1200
429
  window: hann
430
  n_mels: 80
431
  fmin: 0
432
+ fmax: null
433
  log_base: null
434
  lambda_adv: 1.0
435
  lambda_mel: 45.0
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/191epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ebde346e93fd2750366a3d6dab8de6fc9d241670b59d746ef8ece02598e5318
3
+ size 430317339
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/config.yaml ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_visinger2_24.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 4
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: false
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 200
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - train
40
+ - total_count
41
+ - max
42
+ keep_nbest_models: 10
43
+ nbest_averaging_interval: 0
44
+ grad_clip: -1
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: 50
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_lora: false
64
+ save_lora_only: true
65
+ lora_conf: {}
66
+ pretrain_path: null
67
+ init_param: []
68
+ ignore_init_mismatch: false
69
+ freeze_param: []
70
+ num_iters_per_epoch: 1000
71
+ batch_size: 8
72
+ valid_batch_size: null
73
+ batch_bins: 1000000
74
+ valid_batch_bins: null
75
+ train_shape_file:
76
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/train/text_shape.phn
77
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/train/singing_shape
78
+ valid_shape_file:
79
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/text_shape.phn
80
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/singing_shape
81
+ batch_type: sorted
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 150
85
+ - 240000
86
+ sort_in_batch: descending
87
+ shuffle_within_batch: false
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 500
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ chunk_excluded_key_prefixes: []
94
+ chunk_default_fs: null
95
+ train_data_path_and_name_and_type:
96
+ - - dump/raw/tr_no_dev/text
97
+ - text
98
+ - text
99
+ - - dump/raw/tr_no_dev/wav.scp
100
+ - singing
101
+ - sound
102
+ - - dump/raw/tr_no_dev/label
103
+ - label
104
+ - duration
105
+ - - dump/raw/tr_no_dev/score.scp
106
+ - score
107
+ - score
108
+ valid_data_path_and_name_and_type:
109
+ - - dump/raw/dev/text
110
+ - text
111
+ - text
112
+ - - dump/raw/dev/wav.scp
113
+ - singing
114
+ - sound
115
+ - - dump/raw/dev/label
116
+ - label
117
+ - duration
118
+ - - dump/raw/dev/score.scp
119
+ - score
120
+ - score
121
+ allow_variable_data_keys: false
122
+ max_cache_size: 0.0
123
+ max_cache_fd: 32
124
+ allow_multi_rates: false
125
+ valid_max_cache_size: null
126
+ exclude_weight_decay: false
127
+ exclude_weight_decay_conf: {}
128
+ optim: adamw
129
+ optim_conf:
130
+ lr: 0.0002
131
+ betas:
132
+ - 0.8
133
+ - 0.99
134
+ eps: 1.0e-09
135
+ weight_decay: 0.0
136
+ scheduler: exponentiallr
137
+ scheduler_conf:
138
+ gamma: 0.998
139
+ optim2: adamw
140
+ optim2_conf:
141
+ lr: 0.0002
142
+ betas:
143
+ - 0.8
144
+ - 0.99
145
+ eps: 1.0e-09
146
+ weight_decay: 0.0
147
+ scheduler2: exponentiallr
148
+ scheduler2_conf:
149
+ gamma: 0.998
150
+ generator_first: true
151
+ token_list:
152
+ - <blank>
153
+ - <unk>
154
+ - pau
155
+ - a
156
+ - i
157
+ - o
158
+ - e
159
+ - u
160
+ - k
161
+ - n
162
+ - r
163
+ - t
164
+ - m
165
+ - d
166
+ - s
167
+ - N
168
+ - sh
169
+ - g
170
+ - y
171
+ - b
172
+ - w
173
+ - cl
174
+ - ts
175
+ - z
176
+ - ch
177
+ - j
178
+ - h
179
+ - f
180
+ - p
181
+ - ky
182
+ - ry
183
+ - hy
184
+ - py
185
+ - ny
186
+ - <sos/eos>
187
+ odim: null
188
+ model_conf: {}
189
+ use_preprocessor: true
190
+ token_type: phn
191
+ bpemodel: null
192
+ non_linguistic_symbols: null
193
+ cleaner: null
194
+ g2p: pyopenjtalk
195
+ fs: 24000
196
+ score_feats_extract: syllable_score_feats
197
+ score_feats_extract_conf:
198
+ fs: 24000
199
+ n_fft: 2048
200
+ win_length: 1200
201
+ hop_length: 300
202
+ feats_extract: fbank
203
+ feats_extract_conf:
204
+ n_fft: 2048
205
+ hop_length: 300
206
+ win_length: 1200
207
+ fs: 24000
208
+ fmin: 80
209
+ fmax: 7600
210
+ n_mels: 80
211
+ normalize: global_mvn
212
+ normalize_conf:
213
+ stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
214
+ svs: vits
215
+ svs_conf:
216
+ generator_type: visinger2
217
+ vocoder_generator_type: visinger2
218
+ generator_params:
219
+ hidden_channels: 192
220
+ spks: -1
221
+ segment_size: 20
222
+ text_encoder_attention_heads: 2
223
+ text_encoder_ffn_expand: 4
224
+ text_encoder_blocks: 6
225
+ text_encoder_positionwise_layer_type: conv1d
226
+ text_encoder_positionwise_conv_kernel_size: 3
227
+ text_encoder_positional_encoding_layer_type: rel_pos
228
+ text_encoder_self_attention_layer_type: rel_selfattn
229
+ text_encoder_activation_type: swish
230
+ text_encoder_normalize_before: true
231
+ text_encoder_dropout_rate: 0.1
232
+ text_encoder_positional_dropout_rate: 0.0
233
+ text_encoder_attention_dropout_rate: 0.1
234
+ use_macaron_style_in_text_encoder: true
235
+ use_conformer_conv_in_text_encoder: false
236
+ text_encoder_conformer_kernel_size: -1
237
+ decoder_kernel_size: 7
238
+ decoder_channels: 256
239
+ decoder_upsample_scales:
240
+ - 5
241
+ - 5
242
+ - 4
243
+ - 3
244
+ decoder_upsample_kernel_sizes:
245
+ - 10
246
+ - 10
247
+ - 8
248
+ - 6
249
+ n_harmonic: 64
250
+ decoder_resblock_kernel_sizes:
251
+ - 3
252
+ - 7
253
+ - 11
254
+ decoder_resblock_dilations:
255
+ - - 1
256
+ - 3
257
+ - 5
258
+ - - 1
259
+ - 3
260
+ - 5
261
+ - - 1
262
+ - 3
263
+ - 5
264
+ use_weight_norm_in_decoder: true
265
+ posterior_encoder_kernel_size: 3
266
+ posterior_encoder_layers: 8
267
+ posterior_encoder_stacks: 1
268
+ posterior_encoder_base_dilation: 1
269
+ posterior_encoder_dropout_rate: 0.0
270
+ use_weight_norm_in_posterior_encoder: true
271
+ flow_flows: -1
272
+ flow_kernel_size: 5
273
+ flow_base_dilation: 1
274
+ flow_layers: 4
275
+ flow_dropout_rate: 0.0
276
+ use_weight_norm_in_flow: true
277
+ use_only_mean_in_flow: true
278
+ use_phoneme_predictor: false
279
+ vocabs: 35
280
+ aux_channels: 80
281
+ generator_type: visinger2
282
+ vocoder_generator_type: visinger2
283
+ fs: 24000
284
+ hop_length: 300
285
+ win_length: 1200
286
+ n_fft: 2048
287
+ discriminator_type: visinger2
288
+ discriminator_params:
289
+ scales: 1
290
+ scale_downsample_pooling: AvgPool1d
291
+ scale_downsample_pooling_params:
292
+ kernel_size: 4
293
+ stride: 2
294
+ padding: 2
295
+ scale_discriminator_params:
296
+ in_channels: 1
297
+ out_channels: 1
298
+ kernel_sizes:
299
+ - 15
300
+ - 41
301
+ - 5
302
+ - 3
303
+ channels: 128
304
+ max_downsample_channels: 1024
305
+ max_groups: 256
306
+ bias: true
307
+ downsample_scales:
308
+ - 4
309
+ - 4
310
+ - 4
311
+ - 4
312
+ nonlinear_activation: LeakyReLU
313
+ nonlinear_activation_params:
314
+ negative_slope: 0.1
315
+ use_weight_norm: true
316
+ use_spectral_norm: false
317
+ follow_official_norm: false
318
+ periods:
319
+ - 2
320
+ - 3
321
+ - 5
322
+ - 7
323
+ - 11
324
+ period_discriminator_params:
325
+ in_channels: 1
326
+ out_channels: 1
327
+ kernel_sizes:
328
+ - 5
329
+ - 3
330
+ channels: 32
331
+ downsample_scales:
332
+ - 3
333
+ - 3
334
+ - 3
335
+ - 3
336
+ - 1
337
+ max_downsample_channels: 1024
338
+ bias: true
339
+ nonlinear_activation: LeakyReLU
340
+ nonlinear_activation_params:
341
+ negative_slope: 0.1
342
+ use_weight_norm: true
343
+ use_spectral_norm: false
344
+ multi_freq_disc_params:
345
+ hidden_channels:
346
+ - 256
347
+ - 256
348
+ - 256
349
+ - 256
350
+ - 256
351
+ domain: double
352
+ mel_scale: true
353
+ divisors:
354
+ - 32
355
+ - 16
356
+ - 8
357
+ - 4
358
+ - 2
359
+ - 1
360
+ - 1
361
+ strides:
362
+ - 1
363
+ - 2
364
+ - 1
365
+ - 2
366
+ - 1
367
+ - 2
368
+ - 1
369
+ sample_rate: 24000
370
+ hop_lengths:
371
+ - 60
372
+ - 120
373
+ - 180
374
+ - 240
375
+ - 300
376
+ - 360
377
+ generator_adv_loss_params:
378
+ average_by_discriminators: false
379
+ loss_type: mse
380
+ discriminator_adv_loss_params:
381
+ average_by_discriminators: false
382
+ loss_type: mse
383
+ feat_match_loss_params:
384
+ average_by_discriminators: false
385
+ average_by_layers: false
386
+ include_final_outputs: true
387
+ mel_loss_params:
388
+ fs: 24000
389
+ n_fft: 2048
390
+ hop_length: 300
391
+ win_length: 1200
392
+ window: hann
393
+ n_mels: 80
394
+ fmin: 0
395
+ fmax: null
396
+ log_base: null
397
+ lambda_adv: 1.0
398
+ lambda_mel: 45.0
399
+ lambda_feat_match: 2.0
400
+ lambda_dur: 0.1
401
+ lambda_pitch: 10.0
402
+ lambda_phoneme: 1.0
403
+ lambda_kl: 1.0
404
+ sampling_rate: 24000
405
+ cache_generator_outputs: true
406
+ pitch_extract: dio
407
+ pitch_extract_conf:
408
+ use_token_averaged_f0: false
409
+ use_log_f0: false
410
+ fs: 24000
411
+ n_fft: 2048
412
+ hop_length: 300
413
+ f0max: 800
414
+ f0min: 80
415
+ pitch_normalize: null
416
+ pitch_normalize_conf:
417
+ stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz
418
+ ying_extract: null
419
+ ying_extract_conf: {}
420
+ energy_extract: null
421
+ energy_extract_conf: {}
422
+ energy_normalize: null
423
+ energy_normalize_conf: {}
424
+ required:
425
+ - output_dir
426
+ - token_list
427
+ version: '202310'
428
+ distributed: false
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_backward_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_fake_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_forward_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_optim_step_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_real_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/discriminator_train_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_adv_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_backward_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_feat_match_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_forward_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_kl_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_mel_am_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_mel_ddsp_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_mel_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_optim_step_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_phn_dur_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_pitch_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_score_dur_loss.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/generator_train_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/iter_time.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/optim0_lr0.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/optim1_lr0.png ADDED
exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/images/train_time.png ADDED
meta.yaml CHANGED
@@ -1,8 +1,8 @@
1
  espnet: '202310'
2
  files:
3
- model_file: exp/svs_train_visinger_24_raw_phn_pyopenjtalk_jp/200epoch.pth
4
  python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
- timestamp: 1703352886.770364
6
  torch: 1.13.1+cu117
7
  yaml_files:
8
- train_config: exp/svs_train_visinger_24_raw_phn_pyopenjtalk_jp/config.yaml
 
1
  espnet: '202310'
2
  files:
3
+ model_file: exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/191epoch.pth
4
  python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
+ timestamp: 1703352969.464152
6
  torch: 1.13.1+cu117
7
  yaml_files:
8
+ train_config: exp/svs_train_visinger2_24_raw_phn_pyopenjtalk_jp/config.yaml