Text-to-Speech
ESPnet
jp
audio
ErodeesFleurs committed on
Commit
08c9294
1 Parent(s): 1f12231

commit from

Browse files
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/config.yaml ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/finetune_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_amadeus_vits_finetune_from_jsut_32_sentence
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 2000
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 3
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: true
55
+ wandb_project: amadeus
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param:
63
+ - downloads/f3698edf589206588f58f5ec837fa516/exp/tts_train_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train.total_count.ave_10best.pth:tts:tts
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: null
67
+ batch_size: 20
68
+ valid_batch_size: null
69
+ batch_bins: 5000000
70
+ valid_batch_bins: null
71
+ train_shape_file:
72
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn
73
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape
74
+ valid_shape_file:
75
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn
76
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length:
80
+ - 150
81
+ - 204800
82
+ sort_in_batch: descending
83
+ sort_batch: descending
84
+ multiple_iterator: false
85
+ chunk_length: 500
86
+ chunk_shift_ratio: 0.5
87
+ num_cache_chunks: 1024
88
+ train_data_path_and_name_and_type:
89
+ - - dump/22k/raw/train/text
90
+ - text
91
+ - text
92
+ - - dump/22k/raw/train/wav.scp
93
+ - speech
94
+ - sound
95
+ valid_data_path_and_name_and_type:
96
+ - - dump/22k/raw/dev/text
97
+ - text
98
+ - text
99
+ - - dump/22k/raw/dev/wav.scp
100
+ - speech
101
+ - sound
102
+ allow_variable_data_keys: false
103
+ max_cache_size: 0.0
104
+ max_cache_fd: 32
105
+ valid_max_cache_size: null
106
+ optim: adamw
107
+ optim_conf:
108
+ lr: 0.0001
109
+ betas:
110
+ - 0.8
111
+ - 0.99
112
+ eps: 1.0e-09
113
+ weight_decay: 0.0
114
+ scheduler: exponentiallr
115
+ scheduler_conf:
116
+ gamma: 0.999875
117
+ optim2: adamw
118
+ optim2_conf:
119
+ lr: 0.0001
120
+ betas:
121
+ - 0.8
122
+ - 0.99
123
+ eps: 1.0e-09
124
+ weight_decay: 0.0
125
+ scheduler2: exponentiallr
126
+ scheduler2_conf:
127
+ gamma: 0.999875
128
+ generator_first: false
129
+ token_list:
130
+ - <blank>
131
+ - <unk>
132
+ - '1'
133
+ - '2'
134
+ - '0'
135
+ - '3'
136
+ - '4'
137
+ - '-1'
138
+ - '5'
139
+ - a
140
+ - o
141
+ - '-2'
142
+ - i
143
+ - '-3'
144
+ - u
145
+ - e
146
+ - k
147
+ - n
148
+ - t
149
+ - '6'
150
+ - r
151
+ - '-4'
152
+ - s
153
+ - N
154
+ - m
155
+ - pau
156
+ - '7'
157
+ - sh
158
+ - d
159
+ - g
160
+ - w
161
+ - '8'
162
+ - U
163
+ - '-5'
164
+ - I
165
+ - cl
166
+ - h
167
+ - y
168
+ - b
169
+ - '9'
170
+ - j
171
+ - ts
172
+ - ch
173
+ - '-6'
174
+ - z
175
+ - p
176
+ - '-7'
177
+ - f
178
+ - ky
179
+ - ry
180
+ - '-8'
181
+ - gy
182
+ - '-9'
183
+ - hy
184
+ - ny
185
+ - '-10'
186
+ - by
187
+ - my
188
+ - '-11'
189
+ - '-12'
190
+ - '-13'
191
+ - py
192
+ - '-14'
193
+ - '-15'
194
+ - v
195
+ - '10'
196
+ - '-16'
197
+ - '-17'
198
+ - '11'
199
+ - '-21'
200
+ - '-20'
201
+ - '12'
202
+ - '-19'
203
+ - '13'
204
+ - '-18'
205
+ - '14'
206
+ - dy
207
+ - '15'
208
+ - ty
209
+ - '-22'
210
+ - '16'
211
+ - '18'
212
+ - '19'
213
+ - '17'
214
+ - <sos/eos>
215
+ odim: null
216
+ model_conf: {}
217
+ use_preprocessor: true
218
+ token_type: phn
219
+ bpemodel: null
220
+ non_linguistic_symbols: null
221
+ cleaner: jaconv
222
+ g2p: pyopenjtalk_accent_with_pause
223
+ feats_extract: linear_spectrogram
224
+ feats_extract_conf:
225
+ n_fft: 1024
226
+ hop_length: 256
227
+ win_length: null
228
+ normalize: null
229
+ normalize_conf: {}
230
+ tts: vits
231
+ tts_conf:
232
+ generator_type: vits_generator
233
+ generator_params:
234
+ hidden_channels: 192
235
+ spks: -1
236
+ global_channels: -1
237
+ segment_size: 32
238
+ text_encoder_attention_heads: 2
239
+ text_encoder_ffn_expand: 4
240
+ text_encoder_blocks: 6
241
+ text_encoder_positionwise_layer_type: conv1d
242
+ text_encoder_positionwise_conv_kernel_size: 3
243
+ text_encoder_positional_encoding_layer_type: rel_pos
244
+ text_encoder_self_attention_layer_type: rel_selfattn
245
+ text_encoder_activation_type: swish
246
+ text_encoder_normalize_before: true
247
+ text_encoder_dropout_rate: 0.1
248
+ text_encoder_positional_dropout_rate: 0.0
249
+ text_encoder_attention_dropout_rate: 0.1
250
+ use_macaron_style_in_text_encoder: true
251
+ use_conformer_conv_in_text_encoder: false
252
+ text_encoder_conformer_kernel_size: -1
253
+ decoder_kernel_size: 7
254
+ decoder_channels: 512
255
+ decoder_upsample_scales:
256
+ - 8
257
+ - 8
258
+ - 2
259
+ - 2
260
+ decoder_upsample_kernel_sizes:
261
+ - 16
262
+ - 16
263
+ - 4
264
+ - 4
265
+ decoder_resblock_kernel_sizes:
266
+ - 3
267
+ - 7
268
+ - 11
269
+ decoder_resblock_dilations:
270
+ - - 1
271
+ - 3
272
+ - 5
273
+ - - 1
274
+ - 3
275
+ - 5
276
+ - - 1
277
+ - 3
278
+ - 5
279
+ use_weight_norm_in_decoder: true
280
+ posterior_encoder_kernel_size: 5
281
+ posterior_encoder_layers: 16
282
+ posterior_encoder_stacks: 1
283
+ posterior_encoder_base_dilation: 1
284
+ posterior_encoder_dropout_rate: 0.0
285
+ use_weight_norm_in_posterior_encoder: true
286
+ flow_flows: 4
287
+ flow_kernel_size: 5
288
+ flow_base_dilation: 1
289
+ flow_layers: 4
290
+ flow_dropout_rate: 0.0
291
+ use_weight_norm_in_flow: true
292
+ use_only_mean_in_flow: true
293
+ stochastic_duration_predictor_kernel_size: 3
294
+ stochastic_duration_predictor_dropout_rate: 0.5
295
+ stochastic_duration_predictor_flows: 4
296
+ stochastic_duration_predictor_dds_conv_layers: 3
297
+ vocabs: 85
298
+ aux_channels: 513
299
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
300
+ discriminator_params:
301
+ scales: 1
302
+ scale_downsample_pooling: AvgPool1d
303
+ scale_downsample_pooling_params:
304
+ kernel_size: 4
305
+ stride: 2
306
+ padding: 2
307
+ scale_discriminator_params:
308
+ in_channels: 1
309
+ out_channels: 1
310
+ kernel_sizes:
311
+ - 15
312
+ - 41
313
+ - 5
314
+ - 3
315
+ channels: 128
316
+ max_downsample_channels: 1024
317
+ max_groups: 16
318
+ bias: true
319
+ downsample_scales:
320
+ - 2
321
+ - 2
322
+ - 4
323
+ - 4
324
+ - 1
325
+ nonlinear_activation: LeakyReLU
326
+ nonlinear_activation_params:
327
+ negative_slope: 0.1
328
+ use_weight_norm: true
329
+ use_spectral_norm: false
330
+ follow_official_norm: false
331
+ periods:
332
+ - 2
333
+ - 3
334
+ - 5
335
+ - 7
336
+ - 11
337
+ period_discriminator_params:
338
+ in_channels: 1
339
+ out_channels: 1
340
+ kernel_sizes:
341
+ - 5
342
+ - 3
343
+ channels: 32
344
+ downsample_scales:
345
+ - 3
346
+ - 3
347
+ - 3
348
+ - 3
349
+ - 1
350
+ max_downsample_channels: 1024
351
+ bias: true
352
+ nonlinear_activation: LeakyReLU
353
+ nonlinear_activation_params:
354
+ negative_slope: 0.1
355
+ use_weight_norm: true
356
+ use_spectral_norm: false
357
+ generator_adv_loss_params:
358
+ average_by_discriminators: false
359
+ loss_type: mse
360
+ discriminator_adv_loss_params:
361
+ average_by_discriminators: false
362
+ loss_type: mse
363
+ feat_match_loss_params:
364
+ average_by_discriminators: false
365
+ average_by_layers: false
366
+ include_final_outputs: true
367
+ mel_loss_params:
368
+ fs: 22050
369
+ n_fft: 1024
370
+ hop_length: 256
371
+ win_length: null
372
+ window: hann
373
+ n_mels: 80
374
+ fmin: 0
375
+ fmax: null
376
+ log_base: null
377
+ lambda_adv: 1.0
378
+ lambda_mel: 45.0
379
+ lambda_feat_match: 2.0
380
+ lambda_dur: 1.0
381
+ lambda_kl: 1.0
382
+ sampling_rate: 22050
383
+ cache_generator_outputs: true
384
+ pitch_extract: null
385
+ pitch_extract_conf: {}
386
+ pitch_normalize: null
387
+ pitch_normalize_conf: {}
388
+ energy_extract: null
389
+ energy_extract_conf: {}
390
+ energy_normalize: null
391
+ energy_normalize_conf: {}
392
+ required:
393
+ - output_dir
394
+ - token_list
395
+ version: '202207'
396
+ distributed: false
exp/tts_amadeus_vits_finetune_from_jsut_32_sentence/train.total_count.ave_3best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f585212ab86e97fd712b3cd0e8f9cf4ff75a37ecd3f01343b4db7022873c25
3
+ size 372564559
exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:943c319e34197e5c4b875da0c6ef76872fac941532c8341f81440a6cc2050f78
3
+ size 4866