Kleber committed on
Commit
907c0f4
1 Parent(s): 2e95979

Upload 2 files

Browse files
Files changed (2) hide show
  1. config.yaml +384 -0
  2. train.total_count.best.pth +3 -0
config.yaml ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_full_band_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/44k/tts_train_full_band_vits_raw_phn_tacotron_g2p_en
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 3
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 59741
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 1000
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 10000000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/44k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/train/text_shape.phn
72
+ - exp/44k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/44k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/valid/text_shape.phn
75
+ - exp/44k/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 409600
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/44k/raw/train/text
89
+ - text
90
+ - text
91
+ - - dump/44k/raw/train/wav.scp
92
+ - speech
93
+ - sound
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/44k/raw/valid/text
96
+ - text
97
+ - text
98
+ - - dump/44k/raw/valid/wav.scp
99
+ - speech
100
+ - sound
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ optim: adamw
106
+ optim_conf:
107
+ lr: 0.0002
108
+ betas:
109
+ - 0.8
110
+ - 0.99
111
+ eps: 1.0e-09
112
+ weight_decay: 0.0
113
+ scheduler: exponentiallr
114
+ scheduler_conf:
115
+ gamma: 0.999875
116
+ optim2: adamw
117
+ optim2_conf:
118
+ lr: 0.0002
119
+ betas:
120
+ - 0.8
121
+ - 0.99
122
+ eps: 1.0e-09
123
+ weight_decay: 0.0
124
+ scheduler2: exponentiallr
125
+ scheduler2_conf:
126
+ gamma: 0.999875
127
+ generator_first: false
128
+ token_list:
129
+ - <blank>
130
+ - <unk>
131
+ - ''
132
+ - AA1
133
+ - AH0
134
+ - N
135
+ - K
136
+ - OW0
137
+ - M
138
+ - B
139
+ - IY0
140
+ - L
141
+ - T
142
+ - Y
143
+ - S
144
+ - OW1
145
+ - IY1
146
+ - G
147
+ - P
148
+ - IH0
149
+ - EY1
150
+ - Z
151
+ - NG
152
+ - AA0
153
+ - EH1
154
+ - UW1
155
+ - D
156
+ - AE1
157
+ - OY1
158
+ - AE2
159
+ - OW2
160
+ - AY1
161
+ - W
162
+ - IH1
163
+ - UW0
164
+ - AO1
165
+ - EH0
166
+ - AA2
167
+ - JH
168
+ - EH2
169
+ - R
170
+ - V
171
+ - F
172
+ - AH1
173
+ - UW2
174
+ - SH
175
+ - AE0
176
+ - '?'
177
+ - EY0
178
+ - CH
179
+ - AY2
180
+ - AY0
181
+ - EY2
182
+ - ER0
183
+ - IH2
184
+ - TH
185
+ - AW2
186
+ - OY0
187
+ - IY2
188
+ - AO0
189
+ - HH
190
+ - AO2
191
+ - OY2
192
+ - UH0
193
+ - ZH
194
+ - ER1
195
+ - AW1
196
+ - ''''
197
+ - AW0
198
+ - UH1
199
+ - AH2
200
+ - <sos/eos>
201
+ odim: null
202
+ model_conf: {}
203
+ use_preprocessor: true
204
+ token_type: phn
205
+ bpemodel: null
206
+ non_linguistic_symbols: null
207
+ cleaner: tacotron
208
+ g2p: g2p_en
209
+ feats_extract: linear_spectrogram
210
+ feats_extract_conf:
211
+ n_fft: 2048
212
+ hop_length: 512
213
+ win_length: null
214
+ normalize: null
215
+ normalize_conf: {}
216
+ tts: vits
217
+ tts_conf:
218
+ generator_type: vits_generator
219
+ generator_params:
220
+ hidden_channels: 192
221
+ spks: -1
222
+ global_channels: -1
223
+ segment_size: 32
224
+ text_encoder_attention_heads: 2
225
+ text_encoder_ffn_expand: 4
226
+ text_encoder_blocks: 6
227
+ text_encoder_positionwise_layer_type: conv1d
228
+ text_encoder_positionwise_conv_kernel_size: 3
229
+ text_encoder_positional_encoding_layer_type: rel_pos
230
+ text_encoder_self_attention_layer_type: rel_selfattn
231
+ text_encoder_activation_type: swish
232
+ text_encoder_normalize_before: true
233
+ text_encoder_dropout_rate: 0.1
234
+ text_encoder_positional_dropout_rate: 0.0
235
+ text_encoder_attention_dropout_rate: 0.1
236
+ use_macaron_style_in_text_encoder: true
237
+ use_conformer_conv_in_text_encoder: false
238
+ text_encoder_conformer_kernel_size: -1
239
+ decoder_kernel_size: 7
240
+ decoder_channels: 512
241
+ decoder_upsample_scales:
242
+ - 8
243
+ - 8
244
+ - 2
245
+ - 2
246
+ - 2
247
+ decoder_upsample_kernel_sizes:
248
+ - 16
249
+ - 16
250
+ - 4
251
+ - 4
252
+ - 4
253
+ decoder_resblock_kernel_sizes:
254
+ - 3
255
+ - 7
256
+ - 11
257
+ decoder_resblock_dilations:
258
+ - - 1
259
+ - 3
260
+ - 5
261
+ - - 1
262
+ - 3
263
+ - 5
264
+ - - 1
265
+ - 3
266
+ - 5
267
+ use_weight_norm_in_decoder: true
268
+ posterior_encoder_kernel_size: 5
269
+ posterior_encoder_layers: 16
270
+ posterior_encoder_stacks: 1
271
+ posterior_encoder_base_dilation: 1
272
+ posterior_encoder_dropout_rate: 0.0
273
+ use_weight_norm_in_posterior_encoder: true
274
+ flow_flows: 4
275
+ flow_kernel_size: 5
276
+ flow_base_dilation: 1
277
+ flow_layers: 4
278
+ flow_dropout_rate: 0.0
279
+ use_weight_norm_in_flow: true
280
+ use_only_mean_in_flow: true
281
+ stochastic_duration_predictor_kernel_size: 3
282
+ stochastic_duration_predictor_dropout_rate: 0.5
283
+ stochastic_duration_predictor_flows: 4
284
+ stochastic_duration_predictor_dds_conv_layers: 3
285
+ vocabs: 72
286
+ aux_channels: 1025
287
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
288
+ discriminator_params:
289
+ scales: 1
290
+ scale_downsample_pooling: AvgPool1d
291
+ scale_downsample_pooling_params:
292
+ kernel_size: 4
293
+ stride: 2
294
+ padding: 2
295
+ scale_discriminator_params:
296
+ in_channels: 1
297
+ out_channels: 1
298
+ kernel_sizes:
299
+ - 15
300
+ - 41
301
+ - 5
302
+ - 3
303
+ channels: 128
304
+ max_downsample_channels: 1024
305
+ max_groups: 16
306
+ bias: true
307
+ downsample_scales:
308
+ - 2
309
+ - 2
310
+ - 4
311
+ - 4
312
+ - 1
313
+ nonlinear_activation: LeakyReLU
314
+ nonlinear_activation_params:
315
+ negative_slope: 0.1
316
+ use_weight_norm: true
317
+ use_spectral_norm: false
318
+ follow_official_norm: false
319
+ periods:
320
+ - 2
321
+ - 3
322
+ - 5
323
+ - 7
324
+ - 11
325
+ period_discriminator_params:
326
+ in_channels: 1
327
+ out_channels: 1
328
+ kernel_sizes:
329
+ - 5
330
+ - 3
331
+ channels: 32
332
+ downsample_scales:
333
+ - 3
334
+ - 3
335
+ - 3
336
+ - 3
337
+ - 1
338
+ max_downsample_channels: 1024
339
+ bias: true
340
+ nonlinear_activation: LeakyReLU
341
+ nonlinear_activation_params:
342
+ negative_slope: 0.1
343
+ use_weight_norm: true
344
+ use_spectral_norm: false
345
+ generator_adv_loss_params:
346
+ average_by_discriminators: false
347
+ loss_type: mse
348
+ discriminator_adv_loss_params:
349
+ average_by_discriminators: false
350
+ loss_type: mse
351
+ feat_match_loss_params:
352
+ average_by_discriminators: false
353
+ average_by_layers: false
354
+ include_final_outputs: true
355
+ mel_loss_params:
356
+ fs: 44100
357
+ n_fft: 2048
358
+ hop_length: 512
359
+ win_length: null
360
+ window: hann
361
+ n_mels: 80
362
+ fmin: 0
363
+ fmax: null
364
+ log_base: null
365
+ lambda_adv: 1.0
366
+ lambda_mel: 45.0
367
+ lambda_feat_match: 2.0
368
+ lambda_dur: 1.0
369
+ lambda_kl: 1.0
370
+ sampling_rate: 44100
371
+ cache_generator_outputs: true
372
+ pitch_extract: null
373
+ pitch_extract_conf: {}
374
+ pitch_normalize: null
375
+ pitch_normalize_conf: {}
376
+ energy_extract: null
377
+ energy_extract_conf: {}
378
+ energy_normalize: null
379
+ energy_normalize_conf: {}
380
+ required:
381
+ - output_dir
382
+ - token_list
383
+ version: '202209'
384
+ distributed: true
train.total_count.best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c338961d978526b48d5e8743bc5f2ded332a266595e07d5a741358f67a39137
3
+ size 373265408