yerfor committed on
Commit
87fac49
·
1 Parent(s): f7912f3

add models

Browse files
checkpoints/lj_synta/config.yaml ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ add_word_pos: true
3
+ amp: false
4
+ audio_num_mel_bins: 80
5
+ audio_sample_rate: 22050
6
+ base_config:
7
+ - egs/egs_bases/tts/synta.yaml
8
+ - ./base_text2mel.yaml
9
+ binarization_args:
10
+ min_sil_duration: 0.1
11
+ shuffle: false
12
+ test_range:
13
+ - 0
14
+ - 523
15
+ train_range:
16
+ - 871
17
+ - -1
18
+ trim_eos_bos: false
19
+ valid_range:
20
+ - 523
21
+ - 871
22
+ with_align: true
23
+ with_f0: true
24
+ with_f0cwt: false
25
+ with_linear: false
26
+ with_spk_embed: false
27
+ with_wav: false
28
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
29
+ binary_data_dir: data/binary/ljspeech
30
+ check_val_every_n_epoch: 10
31
+ clip_grad_norm: 1
32
+ clip_grad_value: 0
33
+ conv_use_pos: false
34
+ debug: false
35
+ dec_dilations:
36
+ - 1
37
+ - 1
38
+ - 1
39
+ - 1
40
+ dec_ffn_kernel_size: 9
41
+ dec_inp_add_noise: false
42
+ dec_kernel_size: 5
43
+ dec_layers: 4
44
+ dec_post_net_kernel: 3
45
+ decoder_rnn_dim: 0
46
+ decoder_type: conv
47
+ detach_postflow_input: true
48
+ disc_interval: 1
49
+ disc_lr: 0.0001
50
+ disc_norm: in
51
+ disc_reduction: stack
52
+ disc_start_steps: 0
53
+ disc_win_num: 3
54
+ discriminator_optimizer_params:
55
+ eps: 1.0e-06
56
+ weight_decay: 0.0
57
+ discriminator_scheduler_params:
58
+ gamma: 0.5
59
+ step_size: 40000
60
+ dropout: 0.0
61
+ ds_name: ljspeech
62
+ ds_workers: 2
63
+ dur_level: word
64
+ dur_predictor_kernel: 5
65
+ dur_predictor_layers: 3
66
+ enc_dec_norm: ln
67
+ enc_dilations:
68
+ - 1
69
+ - 1
70
+ - 1
71
+ - 1
72
+ enc_ffn_kernel_size: 5
73
+ enc_kernel_size: 5
74
+ enc_layers: 4
75
+ enc_post_net_kernel: 3
76
+ enc_pre_ln: true
77
+ enc_prenet: true
78
+ encoder_K: 8
79
+ encoder_type: rel_fft
80
+ endless_ds: true
81
+ eval_max_batches: -1
82
+ f0_max: 600
83
+ f0_min: 80
84
+ ffn_act: gelu
85
+ ffn_hidden_size: 768
86
+ fft_size: 1024
87
+ fmax: 7600
88
+ fmin: 80
89
+ frames_multiple: 4
90
+ fvae_dec_n_layers: 4
91
+ fvae_decoder_type: conv
92
+ fvae_enc_dec_hidden: 192
93
+ fvae_enc_n_layers: 8
94
+ fvae_encoder_type: conv
95
+ fvae_kernel_size: 5
96
+ fvae_noise_scale: 1.0
97
+ fvae_strides: 4
98
+ gen_dir_name: ''
99
+ griffin_lim_iters: 30
100
+ hidden_size: 192
101
+ hop_size: 256
102
+ infer: false
103
+ infer_post_glow: true
104
+ kl_min: 0.0
105
+ kl_start_steps: 10000
106
+ lambda_commit: 0.25
107
+ lambda_energy: 0.1
108
+ lambda_f0: 1.0
109
+ lambda_kl: 1.0
110
+ lambda_mel_adv: 0.05
111
+ lambda_ph_dur: 0.1
112
+ lambda_sent_dur: 0.0
113
+ lambda_uv: 1.0
114
+ lambda_word_dur: 1.0
115
+ latent_size: 16
116
+ layers_in_block: 2
117
+ load_ckpt: ''
118
+ loud_norm: false
119
+ lr: 0.0002
120
+ max_epochs: 1000
121
+ max_frames: 1548
122
+ max_input_tokens: 1550
123
+ max_sentences: 80
124
+ max_tokens: 40000
125
+ max_updates: 480000
126
+ max_valid_sentences: 1
127
+ max_valid_tokens: 60000
128
+ mel_disc_hidden_size: 128
129
+ mel_losses: l1:0.5|ssim:0.5
130
+ mel_vmax: 1.5
131
+ mel_vmin: -6
132
+ min_frames: 0
133
+ noise_scale: 0.8
134
+ num_ckpt_keep: 3
135
+ num_heads: 2
136
+ num_sanity_val_steps: 5
137
+ num_spk: 1
138
+ num_valid_plots: 10
139
+ optimizer_adam_beta1: 0.9
140
+ optimizer_adam_beta2: 0.98
141
+ out_wav_norm: false
142
+ pitch_extractor: parselmouth
143
+ pitch_key: pitch
144
+ pitch_type: frame
145
+ post_flow_lr: 0.001
146
+ post_glow_hidden: 192
147
+ post_glow_kernel_size: 3
148
+ post_glow_n_block_layers: 3
149
+ post_glow_n_blocks: 12
150
+ post_glow_training_start: 160000
151
+ post_share_cond_layers: false
152
+ posterior_start_steps: 0
153
+ predictor_dropout: 0.2
154
+ predictor_grad: 0.1
155
+ predictor_hidden: -1
156
+ predictor_kernel: 5
157
+ predictor_layers: 5
158
+ preprocess_args:
159
+ add_eos_bos: true
160
+ mfa_group_shuffle: false
161
+ mfa_offset: 0.02
162
+ nsample_per_mfa_group: 1000
163
+ reset_phone_dict: true
164
+ reset_word_dict: true
165
+ save_sil_mask: true
166
+ txt_processor: en
167
+ use_mfa: true
168
+ vad_max_silence_length: 12
169
+ wav_processors: []
170
+ with_phsep: true
171
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
172
+ print_nan_grads: false
173
+ prior_flow_hidden: 64
174
+ prior_flow_kernel_size: 3
175
+ prior_flow_n_blocks: 4
176
+ processed_data_dir: data/processed/ljspeech
177
+ profile_infer: false
178
+ raw_data_dir: data/raw/LJSpeech-1.1
179
+ ref_norm_layer: bn
180
+ rename_tmux: true
181
+ resume_from_checkpoint: 0
182
+ save_best: false
183
+ save_codes:
184
+ - tasks
185
+ - modules
186
+ - egs
187
+ save_f0: false
188
+ save_gt: true
189
+ scheduler: warmup
190
+ seed: 1234
191
+ share_wn_layers: 4
192
+ sigmoid_scale: false
193
+ sort_by_len: true
194
+ task_cls: tasks.tts.synta.SyntaSpeechTask
195
+ tb_log_interval: 100
196
+ test_ids:
197
+ - 0
198
+ - 1
199
+ - 2
200
+ - 3
201
+ - 4
202
+ - 5
203
+ - 6
204
+ - 7
205
+ - 8
206
+ - 9
207
+ - 10
208
+ - 11
209
+ - 12
210
+ - 13
211
+ - 14
212
+ - 15
213
+ - 16
214
+ - 17
215
+ - 18
216
+ - 19
217
+ - 68
218
+ - 70
219
+ - 74
220
+ - 87
221
+ - 110
222
+ - 172
223
+ - 190
224
+ - 215
225
+ - 231
226
+ - 294
227
+ - 316
228
+ - 324
229
+ - 402
230
+ - 422
231
+ - 485
232
+ - 500
233
+ - 505
234
+ - 508
235
+ - 509
236
+ - 519
237
+ test_input_yaml: ''
238
+ test_num: 100
239
+ test_set_name: test
240
+ text_encoder_postnet: true
241
+ train_set_name: train
242
+ train_sets: ''
243
+ two_stage: true
244
+ use_cond_proj: false
245
+ use_fvae: true
246
+ use_gt_dur: false
247
+ use_gt_f0: false
248
+ use_latent_cond: false
249
+ use_pitch_embed: false
250
+ use_pos_embed: true
251
+ use_post_flow: true
252
+ use_prior_flow: true
253
+ use_spk_embed: false
254
+ use_spk_id: false
255
+ use_txt_cond: true
256
+ use_uv: true
257
+ use_word_encoder: true
258
+ use_word_input: false
259
+ val_check_interval: 2000
260
+ valid_infer_interval: 10000
261
+ valid_monitor_key: val_loss
262
+ valid_monitor_mode: min
263
+ valid_set_name: valid
264
+ vocoder: HifiGAN
265
+ vocoder_ckpt: checkpoints/hifi_lj
266
+ warmup_updates: 8000
267
+ weight_decay: 0
268
+ win_size: 1024
269
+ word_dict_size: 10000
270
+ word_enc_layers: 4
271
+ word_encoder_type: rel_fft
272
+ work_dir: checkpoints/lj_synta
checkpoints/lj_synta/model_ckpt_steps_38000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45f37e6d2315f2af7e8a19df602ab38c0355d721c4e7abcc1fa5791fdb82ff30
3
+ size 349398788