Yurii Paniv committed
Commit 27ddcef
1 Parent(s): 524d54e

Add sample VITS config

Files changed (1)
  1. training/train_vits.yaml +185 -0
training/train_vits.yaml ADDED
@@ -0,0 +1,185 @@
+# This configuration is for ESPnet2 to train VITS, which
+# is a truly end-to-end text-to-waveform model. To run
+# this config, you need to specify the "--tts_task gan_tts"
+# option for tts.sh at least and use 22050 Hz audio as
+# the training data (mainly tested on LJSpeech).
+# This configuration was tested on 4 GPUs (V100) with 32GB GPU
+# memory. Training takes around 2 weeks to finish, but a model
+# at around 100k iterations should already generate reasonable results.
+
+##########################################################
+#                   TTS MODEL SETTING                    #
+##########################################################
+tts: vits
+tts_conf:
+    # generator related
+    generator_type: vits_generator
+    generator_params:
+        hidden_channels: 192
+        spks: -1
+        global_channels: -1
+        segment_size: 32
+        text_encoder_attention_heads: 2
+        text_encoder_ffn_expand: 4
+        text_encoder_blocks: 6
+        text_encoder_positionwise_layer_type: "conv1d"
+        text_encoder_positionwise_conv_kernel_size: 3
+        text_encoder_positional_encoding_layer_type: "rel_pos"
+        text_encoder_self_attention_layer_type: "rel_selfattn"
+        text_encoder_activation_type: "swish"
+        text_encoder_normalize_before: true
+        text_encoder_dropout_rate: 0.1
+        text_encoder_positional_dropout_rate: 0.0
+        text_encoder_attention_dropout_rate: 0.1
+        use_macaron_style_in_text_encoder: true
+        # NOTE(kan-bayashi): Conformer conv requires BatchNorm1d, which causes
+        #   errors when using multiple GPUs with pytorch 1.7.1. Therefore, we
+        #   disable it by default. We need to consider an alternative
+        #   normalization, or a different pytorch version may solve this issue.
+        use_conformer_conv_in_text_encoder: false
+        text_encoder_conformer_kernel_size: -1
+        decoder_kernel_size: 7
+        decoder_channels: 512
+        decoder_upsample_scales: [8, 8, 2, 2]
+        decoder_upsample_kernel_sizes: [16, 16, 4, 4]
+        decoder_resblock_kernel_sizes: [3, 7, 11]
+        decoder_resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+        use_weight_norm_in_decoder: true
+        posterior_encoder_kernel_size: 5
+        posterior_encoder_layers: 16
+        posterior_encoder_stacks: 1
+        posterior_encoder_base_dilation: 1
+        posterior_encoder_dropout_rate: 0.0
+        use_weight_norm_in_posterior_encoder: true
+        flow_flows: 4
+        flow_kernel_size: 5
+        flow_base_dilation: 1
+        flow_layers: 4
+        flow_dropout_rate: 0.0
+        use_weight_norm_in_flow: true
+        use_only_mean_in_flow: true
+        stochastic_duration_predictor_kernel_size: 3
+        stochastic_duration_predictor_dropout_rate: 0.5
+        stochastic_duration_predictor_flows: 4
+        stochastic_duration_predictor_dds_conv_layers: 3
+    # discriminator related
+    discriminator_type: hifigan_multi_scale_multi_period_discriminator
+    discriminator_params:
+        scales: 1
+        scale_downsample_pooling: "AvgPool1d"
+        scale_downsample_pooling_params:
+            kernel_size: 4
+            stride: 2
+            padding: 2
+        scale_discriminator_params:
+            in_channels: 1
+            out_channels: 1
+            kernel_sizes: [15, 41, 5, 3]
+            channels: 128
+            max_downsample_channels: 1024
+            max_groups: 16
+            bias: True
+            downsample_scales: [2, 2, 4, 4, 1]
+            nonlinear_activation: "LeakyReLU"
+            nonlinear_activation_params:
+                negative_slope: 0.1
+            use_weight_norm: True
+            use_spectral_norm: False
+        follow_official_norm: False
+        periods: [2, 3, 5, 7, 11]
+        period_discriminator_params:
+            in_channels: 1
+            out_channels: 1
+            kernel_sizes: [5, 3]
+            channels: 32
+            downsample_scales: [3, 3, 3, 3, 1]
+            max_downsample_channels: 1024
+            bias: True
+            nonlinear_activation: "LeakyReLU"
+            nonlinear_activation_params:
+                negative_slope: 0.1
+            use_weight_norm: True
+            use_spectral_norm: False
+    # loss function related
+    generator_adv_loss_params:
+        average_by_discriminators: false # whether to average loss value by #discriminators
+        loss_type: mse # loss type, "mse" or "hinge"
+    discriminator_adv_loss_params:
+        average_by_discriminators: false # whether to average loss value by #discriminators
+        loss_type: mse # loss type, "mse" or "hinge"
+    feat_match_loss_params:
+        average_by_discriminators: false # whether to average loss value by #discriminators
+        average_by_layers: false # whether to average loss value by #layers of each discriminator
+        include_final_outputs: true # whether to include final outputs for loss calculation
+    mel_loss_params:
+        fs: 22050 # must be the same as the training data
+        n_fft: 1024 # fft points
+        hop_length: 256 # hop size
+        win_length: null # window length
+        window: hann # window type
+        n_mels: 80 # number of Mel basis
+        fmin: 0 # minimum frequency for Mel basis
+        fmax: null # maximum frequency for Mel basis
+        log_base: null # null represents natural log
+    lambda_adv: 1.0 # loss scaling coefficient for adversarial loss
+    lambda_mel: 45.0 # loss scaling coefficient for Mel loss
+    lambda_feat_match: 2.0 # loss scaling coefficient for feat match loss
+    lambda_dur: 1.0 # loss scaling coefficient for duration loss
+    lambda_kl: 1.0 # loss scaling coefficient for KL divergence loss
+    # others
+    sampling_rate: 22050 # needed in the inference for saving wav
+    cache_generator_outputs: true # whether to cache generator outputs in the training
+
+##########################################################
+#             OPTIMIZER & SCHEDULER SETTING              #
+##########################################################
+# optimizer setting for generator
+optim: adamw
+optim_conf:
+    lr: 2.0e-4
+    betas: [0.8, 0.99]
+    eps: 1.0e-9
+    weight_decay: 0.0
+scheduler: exponentiallr
+scheduler_conf:
+    gamma: 0.999875
+# optimizer setting for discriminator
+optim2: adamw
+optim2_conf:
+    lr: 2.0e-4
+    betas: [0.8, 0.99]
+    eps: 1.0e-9
+    weight_decay: 0.0
+scheduler2: exponentiallr
+scheduler2_conf:
+    gamma: 0.999875
+generator_first: false # whether to start updating generator first
+
+##########################################################
+#                 OTHER TRAINING SETTING                 #
+##########################################################
+# num_iters_per_epoch: 1000 # number of iterations per epoch
+max_epoch: 1000 # number of epochs
+accum_grad: 1 # gradient accumulation
+batch_bins: 9000000 # batch bins (feats_type=raw)
+batch_type: numel # how to make batch
+grad_clip: -1 # gradient clipping norm
+grad_noise: false # whether to use gradient noise injection
+sort_in_batch: descending # how to sort data in making batch
+sort_batch: descending # how to sort created batches
+num_workers: 12 # number of workers of data loader
+use_amp: false # whether to use pytorch amp
+log_interval: 50 # log interval in iterations
+keep_nbest_models: 10 # number of models to keep
+num_att_plot: 3 # number of attention figures to be saved in every check
+seed: 3407 # random seed number
+patience: null # patience for early stopping
+unused_parameters: true # needed for multi gpu case
+best_model_criterion: # criterion to save the best models
+-   - train
+    - total_count
+    - max
+cudnn_deterministic: false # setting to false accelerates the training speed but makes it non-deterministic;
+                           # in the case of GAN-TTS training, we strongly recommend setting it to false
+cudnn_benchmark: false     # setting to true might accelerate the training speed but sometimes decreases it;
+                           # therefore, we set it to false by default (we recommend trying both cases)
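
Usage note: as the header comment of this config states, it is meant to be passed to ESPnet2's tts.sh with the "--tts_task gan_tts" option and 22050 Hz training data. Below is a minimal invocation sketch; only --tts_task gan_tts and the 22050 Hz requirement come from the config itself, while the run.sh wrapper, the conf/tuning/train_vits.yaml path, and the remaining flags are assumptions based on the usual egs2 recipe layout and may differ per recipe.

    # minimal sketch, assumed to be run from an ESPnet2 egs2 TTS recipe directory
    # (run.sh forwards these options to tts.sh in the standard recipes)
    ./run.sh \
        --ngpu 4 \
        --fs 22050 \
        --tts_task gan_tts \
        --train_config conf/tuning/train_vits.yaml

With 4 GPUs this mirrors the 4x V100 setup mentioned in the header comment; with fewer GPUs, batch_bins or accum_grad in the config would likely need adjusting.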