lmzjms commited on
Commit
1736147
1 Parent(s): 9206300

Upload 15 files

Browse files
audio/c00d9240.wav ADDED
Binary file (320 kB). View file
checkpoints/0102_xiaoma_pe/config.yaml ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ audio_num_mel_bins: 80
3
+ audio_sample_rate: 24000
4
+ base_config:
5
+ - configs/tts/lj/fs2.yaml
6
+ binarization_args:
7
+ shuffle: false
8
+ with_align: true
9
+ with_f0: true
10
+ with_f0cwt: true
11
+ with_spk_embed: true
12
+ with_txt: true
13
+ with_wav: false
14
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
15
+ binary_data_dir: data/binary/xiaoma1022_24k_128hop
16
+ check_val_every_n_epoch: 10
17
+ clip_grad_norm: 1
18
+ cwt_add_f0_loss: false
19
+ cwt_hidden_size: 128
20
+ cwt_layers: 2
21
+ cwt_loss: l1
22
+ cwt_std_scale: 0.8
23
+ debug: false
24
+ dec_ffn_kernel_size: 9
25
+ dec_layers: 4
26
+ decoder_type: fft
27
+ dict_dir: ''
28
+ dropout: 0.1
29
+ ds_workers: 4
30
+ dur_enc_hidden_stride_kernel:
31
+ - 0,2,3
32
+ - 0,2,3
33
+ - 0,1,3
34
+ dur_loss: mse
35
+ dur_predictor_kernel: 3
36
+ dur_predictor_layers: 2
37
+ enc_ffn_kernel_size: 9
38
+ enc_layers: 4
39
+ encoder_K: 8
40
+ encoder_type: fft
41
+ endless_ds: true
42
+ ffn_act: gelu
43
+ ffn_padding: SAME
44
+ fft_size: 512
45
+ fmax: 12000
46
+ fmin: 30
47
+ gen_dir_name: ''
48
+ hidden_size: 256
49
+ hop_size: 128
50
+ infer: false
51
+ lambda_commit: 0.25
52
+ lambda_energy: 0.1
53
+ lambda_f0: 1.0
54
+ lambda_ph_dur: 1.0
55
+ lambda_sent_dur: 1.0
56
+ lambda_uv: 1.0
57
+ lambda_word_dur: 1.0
58
+ load_ckpt: ''
59
+ log_interval: 100
60
+ loud_norm: false
61
+ lr: 2.0
62
+ max_epochs: 1000
63
+ max_eval_sentences: 1
64
+ max_eval_tokens: 60000
65
+ max_frames: 5000
66
+ max_input_tokens: 1550
67
+ max_sentences: 100000
68
+ max_tokens: 20000
69
+ max_updates: 60000
70
+ mel_loss: l1
71
+ mel_vmax: 1.5
72
+ mel_vmin: -6
73
+ min_level_db: -120
74
+ norm_type: gn
75
+ num_ckpt_keep: 3
76
+ num_heads: 2
77
+ num_sanity_val_steps: 5
78
+ num_spk: 1
79
+ num_test_samples: 20
80
+ num_valid_plots: 10
81
+ optimizer_adam_beta1: 0.9
82
+ optimizer_adam_beta2: 0.98
83
+ out_wav_norm: false
84
+ pitch_ar: false
85
+ pitch_enc_hidden_stride_kernel:
86
+ - 0,2,5
87
+ - 0,2,5
88
+ - 0,2,5
89
+ pitch_extractor_conv_layers: 2
90
+ pitch_loss: l1
91
+ pitch_norm: log
92
+ pitch_type: frame
93
+ pre_align_args:
94
+ allow_no_txt: false
95
+ denoise: false
96
+ forced_align: mfa
97
+ txt_processor: en
98
+ use_sox: false
99
+ use_tone: true
100
+ pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign
101
+ predictor_dropout: 0.5
102
+ predictor_grad: 0.1
103
+ predictor_hidden: -1
104
+ predictor_kernel: 5
105
+ predictor_layers: 2
106
+ prenet_dropout: 0.5
107
+ prenet_hidden_size: 256
108
+ pretrain_fs_ckpt: ''
109
+ processed_data_dir: data/processed/ljspeech
110
+ profile_infer: false
111
+ raw_data_dir: data/raw/LJSpeech-1.1
112
+ ref_norm_layer: bn
113
+ reset_phone_dict: true
114
+ save_best: false
115
+ save_ckpt: true
116
+ save_codes:
117
+ - configs
118
+ - modules
119
+ - tasks
120
+ - utils
121
+ - usr
122
+ save_f0: false
123
+ save_gt: false
124
+ seed: 1234
125
+ sort_by_len: true
126
+ stop_token_weight: 5.0
127
+ task_cls: tasks.tts.pe.PitchExtractionTask
128
+ test_ids:
129
+ - 68
130
+ - 70
131
+ - 74
132
+ - 87
133
+ - 110
134
+ - 172
135
+ - 190
136
+ - 215
137
+ - 231
138
+ - 294
139
+ - 316
140
+ - 324
141
+ - 402
142
+ - 422
143
+ - 485
144
+ - 500
145
+ - 505
146
+ - 508
147
+ - 509
148
+ - 519
149
+ test_input_dir: ''
150
+ test_num: 523
151
+ test_set_name: test
152
+ train_set_name: train
153
+ use_denoise: false
154
+ use_energy_embed: false
155
+ use_gt_dur: false
156
+ use_gt_f0: false
157
+ use_pitch_embed: true
158
+ use_pos_embed: true
159
+ use_spk_embed: false
160
+ use_spk_id: false
161
+ use_split_spk_id: false
162
+ use_uv: true
163
+ use_var_enc: false
164
+ val_check_interval: 2000
165
+ valid_num: 348
166
+ valid_set_name: valid
167
+ vocoder: pwg
168
+ vocoder_ckpt: ''
169
+ warmup_updates: 2000
170
+ weight_decay: 0
171
+ win_size: 512
172
+ work_dir: checkpoints/0102_xiaoma_pe
checkpoints/0102_xiaoma_pe/model_ckpt_steps_60000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53942abd8cb908b6d161e1ad7ff3d7d0dd6b204d5bf050613c9d00c56b185ceb
3
+ size 13047222
checkpoints/0109_hifigan_bigpopcs_hop128/config.yaml ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ adam_b1: 0.8
3
+ adam_b2: 0.99
4
+ amp: false
5
+ audio_num_mel_bins: 80
6
+ audio_sample_rate: 24000
7
+ aux_context_window: 0
8
+ #base_config:
9
+ #- egs/egs_bases/singing/pwg.yaml
10
+ #- egs/egs_bases/tts/vocoder/hifigan.yaml
11
+ binarization_args:
12
+ reset_phone_dict: true
13
+ reset_word_dict: true
14
+ shuffle: false
15
+ trim_eos_bos: false
16
+ trim_sil: false
17
+ with_align: false
18
+ with_f0: true
19
+ with_f0cwt: false
20
+ with_linear: false
21
+ with_spk_embed: false
22
+ with_spk_id: true
23
+ with_txt: false
24
+ with_wav: true
25
+ with_word: false
26
+ binarizer_cls: data_gen.tts.singing.binarize.SingingBinarizer
27
+ binary_data_dir: data/binary/big_popcs_24k_hop128
28
+ check_val_every_n_epoch: 10
29
+ clip_grad_norm: 1
30
+ clip_grad_value: 0
31
+ datasets: []
32
+ debug: false
33
+ dec_ffn_kernel_size: 9
34
+ dec_layers: 4
35
+ dict_dir: ''
36
+ disc_start_steps: 40000
37
+ discriminator_grad_norm: 1
38
+ discriminator_optimizer_params:
39
+ eps: 1.0e-06
40
+ lr: 0.0002
41
+ weight_decay: 0.0
42
+ discriminator_params:
43
+ bias: true
44
+ conv_channels: 64
45
+ in_channels: 1
46
+ kernel_size: 3
47
+ layers: 10
48
+ nonlinear_activation: LeakyReLU
49
+ nonlinear_activation_params:
50
+ negative_slope: 0.2
51
+ out_channels: 1
52
+ use_weight_norm: true
53
+ discriminator_scheduler_params:
54
+ gamma: 0.999
55
+ step_size: 600
56
+ dropout: 0.1
57
+ ds_workers: 1
58
+ enc_ffn_kernel_size: 9
59
+ enc_layers: 4
60
+ endless_ds: true
61
+ ffn_act: gelu
62
+ ffn_padding: SAME
63
+ fft_size: 512
64
+ fmax: 12000
65
+ fmin: 30
66
+ frames_multiple: 1
67
+ gen_dir_name: ''
68
+ generator_grad_norm: 10
69
+ generator_optimizer_params:
70
+ eps: 1.0e-06
71
+ lr: 0.0002
72
+ weight_decay: 0.0
73
+ generator_params:
74
+ aux_channels: 80
75
+ dropout: 0.0
76
+ gate_channels: 128
77
+ in_channels: 1
78
+ kernel_size: 3
79
+ layers: 30
80
+ out_channels: 1
81
+ residual_channels: 64
82
+ skip_channels: 64
83
+ stacks: 3
84
+ upsample_net: ConvInUpsampleNetwork
85
+ upsample_params:
86
+ upsample_scales:
87
+ - 2
88
+ - 4
89
+ - 4
90
+ - 4
91
+ use_nsf: false
92
+ use_pitch_embed: true
93
+ use_weight_norm: true
94
+ generator_scheduler_params:
95
+ gamma: 0.999
96
+ step_size: 600
97
+ griffin_lim_iters: 60
98
+ hidden_size: 256
99
+ hop_size: 128
100
+ infer: false
101
+ lambda_adv: 1.0
102
+ lambda_cdisc: 4.0
103
+ lambda_energy: 0.0
104
+ lambda_f0: 0.0
105
+ lambda_mel: 5.0
106
+ lambda_mel_adv: 1.0
107
+ lambda_ph_dur: 0.0
108
+ lambda_sent_dur: 0.0
109
+ lambda_uv: 0.0
110
+ lambda_word_dur: 0.0
111
+ load_ckpt: ''
112
+ loud_norm: false
113
+ lr: 2.0
114
+ max_epochs: 1000
115
+ max_frames: 2400
116
+ max_input_tokens: 1550
117
+ max_samples: 8192
118
+ max_sentences: 20
119
+ max_tokens: 24000
120
+ max_updates: 3000000
121
+ max_valid_sentences: 1
122
+ max_valid_tokens: 60000
123
+ mel_loss: ssim:0.5|l1:0.5
124
+ mel_vmax: 1.5
125
+ mel_vmin: -6
126
+ min_frames: 0
127
+ min_level_db: -120
128
+ num_ckpt_keep: 3
129
+ num_heads: 2
130
+ num_mels: 80
131
+ num_sanity_val_steps: 5
132
+ num_spk: 100
133
+ num_test_samples: 0
134
+ num_valid_plots: 10
135
+ optimizer_adam_beta1: 0.9
136
+ optimizer_adam_beta2: 0.98
137
+ out_wav_norm: false
138
+ pitch_extractor: parselmouth
139
+ pitch_type: frame
140
+ pre_align_args:
141
+ allow_no_txt: false
142
+ denoise: false
143
+ sox_resample: true
144
+ sox_to_wav: false
145
+ trim_sil: false
146
+ txt_processor: zh
147
+ use_tone: false
148
+ pre_align_cls: data_gen.tts.singing.pre_align.SingingPreAlign
149
+ predictor_grad: 0.0
150
+ print_nan_grads: false
151
+ processed_data_dir: ''
152
+ profile_infer: false
153
+ raw_data_dir: ''
154
+ ref_level_db: 20
155
+ rename_tmux: true
156
+ rerun_gen: true
157
+ resblock: '1'
158
+ resblock_dilation_sizes:
159
+ - - 1
160
+ - 3
161
+ - 5
162
+ - - 1
163
+ - 3
164
+ - 5
165
+ - - 1
166
+ - 3
167
+ - 5
168
+ resblock_kernel_sizes:
169
+ - 3
170
+ - 7
171
+ - 11
172
+ resume_from_checkpoint: 0
173
+ save_best: true
174
+ save_codes: []
175
+ save_f0: true
176
+ save_gt: true
177
+ scheduler: rsqrt
178
+ seed: 1234
179
+ sort_by_len: true
180
+ stft_loss_params:
181
+ fft_sizes:
182
+ - 1024
183
+ - 2048
184
+ - 512
185
+ hop_sizes:
186
+ - 120
187
+ - 240
188
+ - 50
189
+ win_lengths:
190
+ - 600
191
+ - 1200
192
+ - 240
193
+ window: hann_window
194
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
195
+ tb_log_interval: 100
196
+ test_ids: []
197
+ test_input_dir: ''
198
+ test_num: 50
199
+ test_prefixes: []
200
+ test_set_name: test
201
+ train_set_name: train
202
+ train_sets: ''
203
+ upsample_initial_channel: 512
204
+ upsample_kernel_sizes:
205
+ - 16
206
+ - 16
207
+ - 4
208
+ - 4
209
+ upsample_rates:
210
+ - 8
211
+ - 4
212
+ - 2
213
+ - 2
214
+ use_cdisc: false
215
+ use_cond_disc: false
216
+ use_fm_loss: false
217
+ use_gt_dur: true
218
+ use_gt_f0: true
219
+ use_mel_loss: true
220
+ use_ms_stft: false
221
+ use_pitch_embed: true
222
+ use_ref_enc: true
223
+ use_spec_disc: false
224
+ use_spk_embed: false
225
+ use_spk_id: false
226
+ use_split_spk_id: false
227
+ val_check_interval: 2000
228
+ valid_infer_interval: 10000
229
+ valid_monitor_key: val_loss
230
+ valid_monitor_mode: min
231
+ valid_set_name: valid
232
+ vocoder: pwg
233
+ vocoder_ckpt: ''
234
+ vocoder_denoise_c: 0.0
235
+ warmup_updates: 8000
236
+ weight_decay: 0
237
+ win_length: null
238
+ win_size: 512
239
+ window: hann
240
+ word_size: 3000
241
+ work_dir: checkpoints/0109_hifigan_bigpopcs_hop128
checkpoints/0109_hifigan_bigpopcs_hop128/model_ckpt_steps_1512000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cb68f3ce0c46ba0a8b6d49718f1fffdf5bd7bcab769a986fd2fd129835cc1d1
3
+ size 55827436
checkpoints/0228_opencpop_ds100_rel/config.yaml ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ K_step: 100
2
+ accumulate_grad_batches: 1
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 24000
5
+ base_config:
6
+ - usr/configs/popcs_ds_beta6.yaml
7
+ - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
8
+ binarization_args:
9
+ shuffle: false
10
+ with_align: true
11
+ with_f0: true
12
+ with_f0cwt: true
13
+ with_spk_embed: false
14
+ with_txt: true
15
+ with_wav: true
16
+ binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
17
+ binary_data_dir: data/binary/opencpop-midi-dp
18
+ check_val_every_n_epoch: 10
19
+ clip_grad_norm: 1
20
+ content_cond_steps: []
21
+ cwt_add_f0_loss: false
22
+ cwt_hidden_size: 128
23
+ cwt_layers: 2
24
+ cwt_loss: l1
25
+ cwt_std_scale: 0.8
26
+ datasets:
27
+ - popcs
28
+ debug: false
29
+ dec_ffn_kernel_size: 9
30
+ dec_layers: 4
31
+ decay_steps: 50000
32
+ decoder_type: fft
33
+ dict_dir: ''
34
+ diff_decoder_type: wavenet
35
+ diff_loss_type: l1
36
+ dilation_cycle_length: 4
37
+ dropout: 0.1
38
+ ds_workers: 4
39
+ dur_enc_hidden_stride_kernel:
40
+ - 0,2,3
41
+ - 0,2,3
42
+ - 0,1,3
43
+ dur_loss: mse
44
+ dur_predictor_kernel: 3
45
+ dur_predictor_layers: 5
46
+ enc_ffn_kernel_size: 9
47
+ enc_layers: 4
48
+ encoder_K: 8
49
+ encoder_type: fft
50
+ endless_ds: true
51
+ ffn_act: gelu
52
+ ffn_padding: SAME
53
+ fft_size: 512
54
+ fmax: 12000
55
+ fmin: 30
56
+ fs2_ckpt: ''
57
+ gaussian_start: true
58
+ gen_dir_name: ''
59
+ gen_tgt_spk_id: -1
60
+ hidden_size: 256
61
+ hop_size: 128
62
+ infer: false
63
+ keep_bins: 80
64
+ lambda_commit: 0.25
65
+ lambda_energy: 0.0
66
+ lambda_f0: 0.0
67
+ lambda_ph_dur: 1.0
68
+ lambda_sent_dur: 1.0
69
+ lambda_uv: 0.0
70
+ lambda_word_dur: 1.0
71
+ load_ckpt: ''
72
+ log_interval: 100
73
+ loud_norm: false
74
+ lr: 0.001
75
+ max_beta: 0.06
76
+ max_epochs: 1000
77
+ max_eval_sentences: 1
78
+ max_eval_tokens: 60000
79
+ max_frames: 8000
80
+ max_input_tokens: 1550
81
+ max_sentences: 48
82
+ max_tokens: 40000
83
+ max_updates: 160000
84
+ mel_loss: ssim:0.5|l1:0.5
85
+ mel_vmax: 1.5
86
+ mel_vmin: -6.0
87
+ min_level_db: -120
88
+ norm_type: gn
89
+ num_ckpt_keep: 3
90
+ num_heads: 2
91
+ num_sanity_val_steps: 1
92
+ num_spk: 1
93
+ num_test_samples: 0
94
+ num_valid_plots: 10
95
+ optimizer_adam_beta1: 0.9
96
+ optimizer_adam_beta2: 0.98
97
+ out_wav_norm: false
98
+ pe_ckpt: checkpoints/0102_xiaoma_pe
99
+ pe_enable: true
100
+ pitch_ar: false
101
+ pitch_enc_hidden_stride_kernel:
102
+ - 0,2,5
103
+ - 0,2,5
104
+ - 0,2,5
105
+ pitch_extractor: parselmouth
106
+ pitch_loss: l1
107
+ pitch_norm: log
108
+ pitch_type: frame
109
+ pre_align_args:
110
+ allow_no_txt: false
111
+ denoise: false
112
+ forced_align: mfa
113
+ txt_processor: zh_g2pM
114
+ use_sox: true
115
+ use_tone: false
116
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
117
+ predictor_dropout: 0.5
118
+ predictor_grad: 0.1
119
+ predictor_hidden: -1
120
+ predictor_kernel: 5
121
+ predictor_layers: 5
122
+ prenet_dropout: 0.5
123
+ prenet_hidden_size: 256
124
+ pretrain_fs_ckpt: ''
125
+ processed_data_dir: data/processed/popcs
126
+ profile_infer: false
127
+ raw_data_dir: data/raw/popcs
128
+ ref_norm_layer: bn
129
+ rel_pos: true
130
+ reset_phone_dict: true
131
+ residual_channels: 256
132
+ residual_layers: 20
133
+ save_best: false
134
+ save_ckpt: true
135
+ save_codes:
136
+ - configs
137
+ - modules
138
+ - tasks
139
+ - utils
140
+ - usr
141
+ save_f0: true
142
+ save_gt: false
143
+ schedule_type: linear
144
+ seed: 1234
145
+ sort_by_len: true
146
+ spec_max:
147
+ - -0.79453
148
+ - -0.81116
149
+ - -0.61631
150
+ - -0.30679
151
+ - -0.13863
152
+ - -0.050652
153
+ - -0.11563
154
+ - -0.10679
155
+ - -0.091068
156
+ - -0.062174
157
+ - -0.075302
158
+ - -0.072217
159
+ - -0.063815
160
+ - -0.073299
161
+ - 0.007361
162
+ - -0.072508
163
+ - -0.050234
164
+ - -0.16534
165
+ - -0.26928
166
+ - -0.20782
167
+ - -0.20823
168
+ - -0.11702
169
+ - -0.070128
170
+ - -0.065868
171
+ - -0.012675
172
+ - 0.0015121
173
+ - -0.089902
174
+ - -0.21392
175
+ - -0.23789
176
+ - -0.28922
177
+ - -0.30405
178
+ - -0.23029
179
+ - -0.22088
180
+ - -0.21542
181
+ - -0.29367
182
+ - -0.30137
183
+ - -0.38281
184
+ - -0.4359
185
+ - -0.28681
186
+ - -0.46855
187
+ - -0.57485
188
+ - -0.47022
189
+ - -0.54266
190
+ - -0.44848
191
+ - -0.6412
192
+ - -0.687
193
+ - -0.6486
194
+ - -0.76436
195
+ - -0.49971
196
+ - -0.71068
197
+ - -0.69724
198
+ - -0.61487
199
+ - -0.55843
200
+ - -0.69773
201
+ - -0.57502
202
+ - -0.70919
203
+ - -0.82431
204
+ - -0.84213
205
+ - -0.90431
206
+ - -0.8284
207
+ - -0.77945
208
+ - -0.82758
209
+ - -0.87699
210
+ - -1.0532
211
+ - -1.0766
212
+ - -1.1198
213
+ - -1.0185
214
+ - -0.98983
215
+ - -1.0001
216
+ - -1.0756
217
+ - -1.0024
218
+ - -1.0304
219
+ - -1.0579
220
+ - -1.0188
221
+ - -1.05
222
+ - -1.0842
223
+ - -1.0923
224
+ - -1.1223
225
+ - -1.2381
226
+ - -1.6467
227
+ spec_min:
228
+ - -6.0
229
+ - -6.0
230
+ - -6.0
231
+ - -6.0
232
+ - -6.0
233
+ - -6.0
234
+ - -6.0
235
+ - -6.0
236
+ - -6.0
237
+ - -6.0
238
+ - -6.0
239
+ - -6.0
240
+ - -6.0
241
+ - -6.0
242
+ - -6.0
243
+ - -6.0
244
+ - -6.0
245
+ - -6.0
246
+ - -6.0
247
+ - -6.0
248
+ - -6.0
249
+ - -6.0
250
+ - -6.0
251
+ - -6.0
252
+ - -6.0
253
+ - -6.0
254
+ - -6.0
255
+ - -6.0
256
+ - -6.0
257
+ - -6.0
258
+ - -6.0
259
+ - -6.0
260
+ - -6.0
261
+ - -6.0
262
+ - -6.0
263
+ - -6.0
264
+ - -6.0
265
+ - -6.0
266
+ - -6.0
267
+ - -6.0
268
+ - -6.0
269
+ - -6.0
270
+ - -6.0
271
+ - -6.0
272
+ - -6.0
273
+ - -6.0
274
+ - -6.0
275
+ - -6.0
276
+ - -6.0
277
+ - -6.0
278
+ - -6.0
279
+ - -6.0
280
+ - -6.0
281
+ - -6.0
282
+ - -6.0
283
+ - -6.0
284
+ - -6.0
285
+ - -6.0
286
+ - -6.0
287
+ - -6.0
288
+ - -6.0
289
+ - -6.0
290
+ - -6.0
291
+ - -6.0
292
+ - -6.0
293
+ - -6.0
294
+ - -6.0
295
+ - -6.0
296
+ - -6.0
297
+ - -6.0
298
+ - -6.0
299
+ - -6.0
300
+ - -6.0
301
+ - -6.0
302
+ - -6.0
303
+ - -6.0
304
+ - -6.0
305
+ - -6.0
306
+ - -6.0
307
+ - -6.0
308
+ spk_cond_steps: []
309
+ stop_token_weight: 5.0
310
+ task_cls: usr.diffsinger_task.DiffSingerMIDITask
311
+ test_ids: []
312
+ test_input_dir: ''
313
+ test_num: 0
314
+ test_prefixes:
315
+ - "popcs-\u8BF4\u6563\u5C31\u6563"
316
+ - "popcs-\u9690\u5F62\u7684\u7FC5\u8180"
317
+ test_set_name: test
318
+ timesteps: 100
319
+ train_set_name: train
320
+ use_denoise: false
321
+ use_energy_embed: false
322
+ use_gt_dur: false
323
+ use_gt_f0: false
324
+ use_midi: true
325
+ use_nsf: true
326
+ use_pitch_embed: false
327
+ use_pos_embed: true
328
+ use_spk_embed: false
329
+ use_spk_id: false
330
+ use_split_spk_id: false
331
+ use_uv: true
332
+ use_var_enc: false
333
+ val_check_interval: 2000
334
+ valid_num: 0
335
+ valid_set_name: valid
336
+ vocoder: vocoders.hifigan.HifiGAN
337
+ vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
338
+ warmup_updates: 2000
339
+ wav2spec_eps: 1e-6
340
+ weight_decay: 0
341
+ win_size: 512
342
+ work_dir: checkpoints/0228_opencpop_ds100_rel
checkpoints/0228_opencpop_ds100_rel/model_ckpt_steps_160000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a8261f7415bb39eb80a19d4c27c0ea084f63af2fdf6b82e63fcbd9cd82fc90c
3
+ size 170226367
checkpoints/0831_opencpop_ds1000/config.yaml ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ K_step: 1000
2
+ accumulate_grad_batches: 1
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 24000
5
+ base_config:
6
+ - usr/configs/popcs_ds_beta6.yaml
7
+ - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
8
+ binarization_args:
9
+ shuffle: false
10
+ with_align: true
11
+ with_f0: true
12
+ with_f0cwt: true
13
+ with_spk_embed: false
14
+ with_txt: true
15
+ with_wav: true
16
+ binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
17
+ binary_data_dir: data/binary/opencpop-midi-dp
18
+ check_val_every_n_epoch: 10
19
+ clip_grad_norm: 1
20
+ content_cond_steps: []
21
+ cwt_add_f0_loss: false
22
+ cwt_hidden_size: 128
23
+ cwt_layers: 2
24
+ cwt_loss: l1
25
+ cwt_std_scale: 0.8
26
+ datasets:
27
+ - opencpop
28
+ debug: false
29
+ dec_ffn_kernel_size: 9
30
+ dec_layers: 4
31
+ decay_steps: 50000
32
+ decoder_type: fft
33
+ dict_dir: ''
34
+ diff_decoder_type: wavenet
35
+ diff_loss_type: l1
36
+ dilation_cycle_length: 4
37
+ dropout: 0.1
38
+ ds_workers: 4
39
+ dur_enc_hidden_stride_kernel:
40
+ - 0,2,3
41
+ - 0,2,3
42
+ - 0,1,3
43
+ dur_loss: mse
44
+ dur_predictor_kernel: 3
45
+ dur_predictor_layers: 5
46
+ enc_ffn_kernel_size: 9
47
+ enc_layers: 4
48
+ encoder_K: 8
49
+ encoder_type: fft
50
+ endless_ds: true
51
+ ffn_act: gelu
52
+ ffn_padding: SAME
53
+ fft_size: 512
54
+ fmax: 12000
55
+ fmin: 30
56
+ fs2_ckpt: ''
57
+ gaussian_start: true
58
+ gen_dir_name: ''
59
+ gen_tgt_spk_id: -1
60
+ hidden_size: 256
61
+ hop_size: 128
62
+ infer: false
63
+ keep_bins: 80
64
+ lambda_commit: 0.25
65
+ lambda_energy: 0.0
66
+ lambda_f0: 0.0
67
+ lambda_ph_dur: 1.0
68
+ lambda_sent_dur: 1.0
69
+ lambda_uv: 0.0
70
+ lambda_word_dur: 1.0
71
+ load_ckpt: ''
72
+ log_interval: 100
73
+ loud_norm: false
74
+ lr: 0.001
75
+ max_beta: 0.02
76
+ max_epochs: 1000
77
+ max_eval_sentences: 1
78
+ max_eval_tokens: 60000
79
+ max_frames: 8000
80
+ max_input_tokens: 1550
81
+ max_sentences: 48
82
+ max_tokens: 36000
83
+ max_updates: 320000
84
+ mel_loss: ssim:0.5|l1:0.5
85
+ mel_vmax: 1.5
86
+ mel_vmin: -6.0
87
+ min_level_db: -120
88
+ norm_type: gn
89
+ num_ckpt_keep: 3
90
+ num_heads: 2
91
+ num_sanity_val_steps: 1
92
+ num_spk: 1
93
+ num_test_samples: 0
94
+ num_valid_plots: 10
95
+ optimizer_adam_beta1: 0.9
96
+ optimizer_adam_beta2: 0.98
97
+ out_wav_norm: false
98
+ pe_ckpt: checkpoints/0102_xiaoma_pe
99
+ pe_enable: true
100
+ pitch_ar: false
101
+ pitch_enc_hidden_stride_kernel:
102
+ - 0,2,5
103
+ - 0,2,5
104
+ - 0,2,5
105
+ pitch_extractor: parselmouth
106
+ pitch_loss: l1
107
+ pitch_norm: log
108
+ pitch_type: frame
109
+ pre_align_args:
110
+ allow_no_txt: false
111
+ denoise: false
112
+ forced_align: mfa
113
+ txt_processor: zh_g2pM
114
+ use_sox: true
115
+ use_tone: false
116
+ pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
117
+ predictor_dropout: 0.5
118
+ predictor_grad: 0.1
119
+ predictor_hidden: -1
120
+ predictor_kernel: 5
121
+ predictor_layers: 5
122
+ prenet_dropout: 0.5
123
+ prenet_hidden_size: 256
124
+ pretrain_fs_ckpt: ''
125
+ processed_data_dir: xxx
126
+ profile_infer: false
127
+ raw_data_dir: data/raw/opencpop/segments
128
+ ref_norm_layer: bn
129
+ rel_pos: true
130
+ reset_phone_dict: true
131
+ residual_channels: 256
132
+ residual_layers: 20
133
+ save_best: false
134
+ save_ckpt: true
135
+ save_codes:
136
+ - configs
137
+ - modules
138
+ - tasks
139
+ - utils
140
+ - usr
141
+ save_f0: true
142
+ save_gt: false
143
+ schedule_type: linear
144
+ seed: 1234
145
+ sort_by_len: true
146
+ spec_max:
147
+ - -0.79453
148
+ - -0.81116
149
+ - -0.61631
150
+ - -0.30679
151
+ - -0.13863
152
+ - -0.050652
153
+ - -0.11563
154
+ - -0.10679
155
+ - -0.091068
156
+ - -0.062174
157
+ - -0.075302
158
+ - -0.072217
159
+ - -0.063815
160
+ - -0.073299
161
+ - 0.007361
162
+ - -0.072508
163
+ - -0.050234
164
+ - -0.16534
165
+ - -0.26928
166
+ - -0.20782
167
+ - -0.20823
168
+ - -0.11702
169
+ - -0.070128
170
+ - -0.065868
171
+ - -0.012675
172
+ - 0.0015121
173
+ - -0.089902
174
+ - -0.21392
175
+ - -0.23789
176
+ - -0.28922
177
+ - -0.30405
178
+ - -0.23029
179
+ - -0.22088
180
+ - -0.21542
181
+ - -0.29367
182
+ - -0.30137
183
+ - -0.38281
184
+ - -0.4359
185
+ - -0.28681
186
+ - -0.46855
187
+ - -0.57485
188
+ - -0.47022
189
+ - -0.54266
190
+ - -0.44848
191
+ - -0.6412
192
+ - -0.687
193
+ - -0.6486
194
+ - -0.76436
195
+ - -0.49971
196
+ - -0.71068
197
+ - -0.69724
198
+ - -0.61487
199
+ - -0.55843
200
+ - -0.69773
201
+ - -0.57502
202
+ - -0.70919
203
+ - -0.82431
204
+ - -0.84213
205
+ - -0.90431
206
+ - -0.8284
207
+ - -0.77945
208
+ - -0.82758
209
+ - -0.87699
210
+ - -1.0532
211
+ - -1.0766
212
+ - -1.1198
213
+ - -1.0185
214
+ - -0.98983
215
+ - -1.0001
216
+ - -1.0756
217
+ - -1.0024
218
+ - -1.0304
219
+ - -1.0579
220
+ - -1.0188
221
+ - -1.05
222
+ - -1.0842
223
+ - -1.0923
224
+ - -1.1223
225
+ - -1.2381
226
+ - -1.6467
227
+ spec_min:
228
+ - -6.0
229
+ - -6.0
230
+ - -6.0
231
+ - -6.0
232
+ - -6.0
233
+ - -6.0
234
+ - -6.0
235
+ - -6.0
236
+ - -6.0
237
+ - -6.0
238
+ - -6.0
239
+ - -6.0
240
+ - -6.0
241
+ - -6.0
242
+ - -6.0
243
+ - -6.0
244
+ - -6.0
245
+ - -6.0
246
+ - -6.0
247
+ - -6.0
248
+ - -6.0
249
+ - -6.0
250
+ - -6.0
251
+ - -6.0
252
+ - -6.0
253
+ - -6.0
254
+ - -6.0
255
+ - -6.0
256
+ - -6.0
257
+ - -6.0
258
+ - -6.0
259
+ - -6.0
260
+ - -6.0
261
+ - -6.0
262
+ - -6.0
263
+ - -6.0
264
+ - -6.0
265
+ - -6.0
266
+ - -6.0
267
+ - -6.0
268
+ - -6.0
269
+ - -6.0
270
+ - -6.0
271
+ - -6.0
272
+ - -6.0
273
+ - -6.0
274
+ - -6.0
275
+ - -6.0
276
+ - -6.0
277
+ - -6.0
278
+ - -6.0
279
+ - -6.0
280
+ - -6.0
281
+ - -6.0
282
+ - -6.0
283
+ - -6.0
284
+ - -6.0
285
+ - -6.0
286
+ - -6.0
287
+ - -6.0
288
+ - -6.0
289
+ - -6.0
290
+ - -6.0
291
+ - -6.0
292
+ - -6.0
293
+ - -6.0
294
+ - -6.0
295
+ - -6.0
296
+ - -6.0
297
+ - -6.0
298
+ - -6.0
299
+ - -6.0
300
+ - -6.0
301
+ - -6.0
302
+ - -6.0
303
+ - -6.0
304
+ - -6.0
305
+ - -6.0
306
+ - -6.0
307
+ - -6.0
308
+ spk_cond_steps: []
309
+ stop_token_weight: 5.0
310
+ task_cls: usr.diffsinger_task.DiffSingerMIDITask
311
+ test_ids: []
312
+ test_input_dir: ''
313
+ test_num: 0
314
+ test_prefixes:
315
+ - '2044'
316
+ - '2086'
317
+ - '2092'
318
+ - '2093'
319
+ - '2100'
320
+ test_set_name: test
321
+ timesteps: 1000
322
+ train_set_name: train
323
+ use_denoise: false
324
+ use_energy_embed: false
325
+ use_gt_dur: false
326
+ use_gt_f0: false
327
+ use_midi: true
328
+ use_nsf: true
329
+ use_pitch_embed: false
330
+ use_pos_embed: true
331
+ use_spk_embed: false
332
+ use_spk_id: false
333
+ use_split_spk_id: false
334
+ use_uv: true
335
+ use_var_enc: false
336
+ val_check_interval: 2000
337
+ valid_num: 0
338
+ valid_set_name: valid
339
+ vocoder: vocoders.hifigan.HifiGAN
340
+ vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
341
+ warmup_updates: 2000
342
+ wav2spec_eps: 1e-6
343
+ weight_decay: 0
344
+ win_size: 512
345
+ work_dir: checkpoints/0831_opencpop_ds1000
346
+ pndm_speedup: 10
checkpoints/0831_opencpop_ds1000/model_ckpt_steps_320000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:954a31208ee6afb6240d09454bb204c4fbc63cf70e2586bed0ab29b1dc964c9e
3
+ size 170269591
checkpoints/Emotion_encoder.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9de4930cbd8e5ba51efdef84c326e3728a5482dd7668f82960e4cb0f97cc8e5
3
+ size 17095350
checkpoints/GenerSpeech/config.yaml ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ amp: false
3
+ audio_num_mel_bins: 80
4
+ audio_sample_rate: 16000
5
+ base_config:
6
+ - egs/egs_bases/tts/fs2_adv.yaml
7
+ - egs/datasets/audio/emotion/base_text2mel.yaml
8
+ binarization_args:
9
+ reset_phone_dict: true
10
+ reset_word_dict: true
11
+ shuffle: true
12
+ trim_eos_bos: false
13
+ trim_sil: false
14
+ with_align: true
15
+ with_f0: true
16
+ with_f0cwt: false
17
+ with_linear: false
18
+ with_spk_embed: true
19
+ with_spk_id: true
20
+ with_txt: true
21
+ with_wav: true
22
+ with_word: true
23
+ binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
24
+ binary_data_dir: data/binary/training_set
25
+ check_val_every_n_epoch: 10
26
+ clip_grad_norm: 1
27
+ clip_grad_value: 0
28
+ conv_use_pos: false
29
+ crop: false
30
+ cwt_add_f0_loss: false
31
+ cwt_hidden_size: 128
32
+ cwt_layers: 2
33
+ cwt_loss: l1
34
+ cwt_std_scale: 0.8
35
+ debug: false
36
+ dec_dilations:
37
+ - 1
38
+ - 1
39
+ - 1
40
+ - 1
41
+ dec_ffn_kernel_size: 9
42
+ dec_inp_add_noise: false
43
+ dec_kernel_size: 5
44
+ dec_layers: 4
45
+ dec_num_heads: 2
46
+ decoder_rnn_dim: 0
47
+ decoder_type: fft
48
+ dict_dir: ''
49
+ disc_hidden_size: 128
50
+ disc_interval: 1
51
+ disc_lr: 0.0001
52
+ disc_norm: in
53
+ disc_reduction: stack
54
+ disc_start_steps: 0
55
+ disc_win_num: 3
56
+ discriminator_grad_norm: 1
57
+ discriminator_optimizer_params:
58
+ eps: 1.0e-06
59
+ weight_decay: 0.0
60
+ discriminator_scheduler_params:
61
+ gamma: 0.5
62
+ step_size: 60000
63
+ dropout: 0.05
64
+ ds_workers: 2
65
+ dur_enc_hidden_stride_kernel:
66
+ - 0,2,3
67
+ - 0,2,3
68
+ - 0,1,3
69
+ dur_loss: mse
70
+ dur_predictor_kernel: 3
71
+ dur_predictor_layers: 2
72
+ emotion_encoder_path: checkpoints/Emotion_encoder.pt # set the emotion encoder path
73
+ enc_dec_norm: ln
74
+ enc_dilations:
75
+ - 1
76
+ - 1
77
+ - 1
78
+ - 1
79
+ enc_ffn_kernel_size: 9
80
+ enc_kernel_size: 5
81
+ enc_layers: 4
82
+ encoder_K: 8
83
+ encoder_type: fft
84
+ endless_ds: true
85
+ ffn_act: gelu
86
+ ffn_hidden_size: 1024
87
+ ffn_padding: SAME
88
+ fft_size: 1024
89
+ fmax: 7600
90
+ fmin: 80
91
+ forcing: 20000
92
+ frames_multiple: 1
93
+ gen_dir_name: ''
94
+ generator_grad_norm: 5.0
95
+ griffin_lim_iters: 60
96
+ hidden_size: 256
97
+ hop_size: 256
98
+ infer: false
99
+ lambda_commit: 0.25
100
+ lambda_energy: 0.1
101
+ lambda_f0: 1.0
102
+ lambda_mel_adv: 0.1
103
+ lambda_ph_dur: 0.1
104
+ lambda_sent_dur: 1.0
105
+ lambda_uv: 1.0
106
+ lambda_word_dur: 1.0
107
+ layers_in_block: 2
108
+ load_ckpt: ''
109
+ loud_norm: false
110
+ lr: 1.0
111
+ max_epochs: 1000
112
+ max_frames: 1548
113
+ max_input_tokens: 1550
114
+ max_sentences: 100000
115
+ max_tokens: 30000
116
+ max_updates: 300000
117
+ max_valid_sentences: 1
118
+ max_valid_tokens: 60000
119
+ mel_disc_hidden_size: 128
120
+ mel_gan: true
121
+ mel_hidden_size: 256
122
+ mel_loss: ssim:0.5|l1:0.5
123
+ mel_vmax: 1.5
124
+ mel_vmin: -6
125
+ min_frames: 128
126
+ min_level_db: -100
127
+ nVQ: 128
128
+ noise_scale: 0.8
129
+ num_ckpt_keep: 2
130
+ num_heads: 2
131
+ num_sanity_val_steps: -1
132
+ num_spk: 500
133
+ num_test_samples: 72
134
+ num_valid_plots: 10
135
+ optimizer_adam_beta1: 0.5
136
+ optimizer_adam_beta2: 0.999
137
+ out_wav_norm: false
138
+ pitch_ar: false
139
+ pitch_embed_type: 0
140
+ pitch_enc_hidden_stride_kernel:
141
+ - 0,2,5
142
+ - 0,2,5
143
+ - 0,2,5
144
+ pitch_extractor: parselmouth
145
+ pitch_loss: l1
146
+ pitch_norm: standard
147
+ pitch_ssim_win: 11
148
+ pitch_type: frame
149
+ post_glow_hidden: 128
150
+ post_glow_kernel_size: 3
151
+ post_glow_n_block_layers: 3
152
+ post_glow_n_blocks: 8
153
+ post_share_cond_layers: false
154
+ pre_align_args:
155
+ allow_no_txt: false
156
+ denoise: false
157
+ sox_resample: false
158
+ sox_to_wav: false
159
+ trim_sil: false
160
+ txt_processor: en
161
+ use_tone: true
162
+ pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
163
+ predictor_dropout: 0.5
164
+ predictor_grad: 1.0
165
+ predictor_hidden: -1
166
+ predictor_kernel: 5
167
+ predictor_layers: 2
168
+ preprocess_args:
169
+ add_eos_bos: true
170
+ mfa_group_shuffle: false
171
+ mfa_offset: 0.02
172
+ nsample_per_mfa_group: 1000
173
+ reset_phone_dict: true
174
+ reset_word_dict: true
175
+ save_sil_mask: true
176
+ txt_processor: en
177
+ use_mfa: true
178
+ vad_max_silence_length: 12
179
+ wav_processors: []
180
+ with_phsep: true
181
+ preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
182
+ pretrain_fs_ckpt: ''
183
+ print_nan_grads: false
184
+ processed_data_dir: data/processed/emotion
185
+ profile_infer: false
186
+ raw_data_dir: data/raw/ESD
187
+ ref_audio: ''
188
+ ref_hidden_stride_kernel:
189
+ - 0,3,5
190
+ - 0,3,5
191
+ - 0,2,5
192
+ - 0,2,5
193
+ - 0,2,5
194
+ ref_level_db: 20
195
+ ref_norm_layer: bn
196
+ rename_tmux: true
197
+ rerun_gen: false
198
+ resume_from_checkpoint: 0
199
+ save_best: false
200
+ save_codes: []
201
+ save_f0: false
202
+ save_gt: true
203
+ scheduler: rsqrt
204
+ seed: 1234
205
+ share_wn_layers: 4
206
+ sigmoid_scale: false
207
+ sil_add_noise: false
208
+ sort_by_len: true
209
+ task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask
210
+ tb_log_interval: 100
211
+ test_ids: []
212
+ test_input_dir: ''
213
+ test_num: 200
214
+ test_set_name: test
215
+ text: ''
216
+ train_set_name: train
217
+ train_sets: ''
218
+ use_cond_disc: false
219
+ use_emotion: true
220
+ use_energy_embed: false
221
+ use_gt_dur: false
222
+ use_gt_f0: false
223
+ use_latent_cond: true
224
+ use_pitch_embed: true
225
+ use_pos_embed: true
226
+ use_ref_enc: false
227
+ use_spk_embed: true
228
+ use_spk_id: false
229
+ use_split_spk_id: false
230
+ use_txt_cond: true
231
+ use_uv: true
232
+ use_var_enc: false
233
+ use_word: true
234
+ vae_dropout: 0.0
235
+ val_check_interval: 2000
236
+ valid_infer_interval: 10000
237
+ valid_monitor_key: val_loss
238
+ valid_monitor_mode: min
239
+ valid_set_name: valid
240
+ var_enc_vq_codes: 64
241
+ vocoder: hifigan
242
+ vocoder_ckpt: checkpoints/trainset_hifigan
243
+ vocoder_denoise_c: 0.0
244
+ vq_start: 20500
245
+ warmup_updates: 2000
246
+ weight_decay: 0
247
+ win_size: 1024
248
+ word_size: 30000
249
+ work_dir: checkpoints/GenerSpeech
checkpoints/GenerSpeech/model_ckpt_steps_300000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b872bb686013cee2a98cc610b8b66b788c46ff4c33130682b63af4ac005405ea
3
+ size 619582860
checkpoints/trainset_hifigan/config.yaml ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accumulate_grad_batches: 1
2
+ adam_b1: 0.8
3
+ adam_b2: 0.99
4
+ amp: false
5
+ audio_num_mel_bins: 80
6
+ audio_sample_rate: 16000
7
+ aux_context_window: 0
8
+ base_config:
9
+ - egs/egs_bases/tts/vocoder/hifigan.yaml
10
+ - egs/datasets/audio/emotion/base_text2mel.yaml
11
+ binarization_args:
12
+ reset_phone_dict: true
13
+ reset_word_dict: true
14
+ shuffle: true
15
+ trim_eos_bos: false
16
+ trim_sil: false
17
+ with_align: false
18
+ with_f0: true
19
+ with_f0cwt: false
20
+ with_linear: false
21
+ with_spk_embed: false
22
+ with_spk_id: true
23
+ with_txt: false
24
+ with_wav: true
25
+ with_word: false
26
+ binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
27
+ binary_data_dir: data/binary/training_set
28
+ check_val_every_n_epoch: 10
29
+ clip_grad_norm: 1
30
+ clip_grad_value: 0
31
+ debug: false
32
+ dec_ffn_kernel_size: 9
33
+ dec_layers: 4
34
+ dict_dir: ''
35
+ disc_start_steps: 40000
36
+ discriminator_grad_norm: 1
37
+ discriminator_optimizer_params:
38
+ lr: 0.0002
39
+ discriminator_scheduler_params:
40
+ gamma: 0.999
41
+ step_size: 600
42
+ dropout: 0.1
43
+ ds_workers: 1
44
+ enc_ffn_kernel_size: 9
45
+ enc_layers: 4
46
+ endless_ds: true
47
+ ffn_act: gelu
48
+ ffn_padding: SAME
49
+ fft_size: 1024
50
+ fmax: 7600
51
+ fmin: 80
52
+ frames_multiple: 1
53
+ gen_dir_name: ''
54
+ generator_grad_norm: 10
55
+ generator_optimizer_params:
56
+ lr: 0.0002
57
+ generator_scheduler_params:
58
+ gamma: 0.999
59
+ step_size: 600
60
+ griffin_lim_iters: 60
61
+ hidden_size: 256
62
+ hop_size: 256
63
+ infer: false
64
+ lambda_adv: 1.0
65
+ lambda_cdisc: 4.0
66
+ lambda_mel: 5.0
67
+ lambda_mel_adv: 1.0
68
+ load_ckpt: ''
69
+ loud_norm: false
70
+ lr: 2.0
71
+ max_epochs: 1000
72
+ max_frames: 1548
73
+ max_input_tokens: 1550
74
+ max_samples: 8192
75
+ max_sentences: 24
76
+ max_tokens: 30000
77
+ max_updates: 1000000
78
+ max_valid_sentences: 1
79
+ max_valid_tokens: 60000
80
+ mel_loss: ssim:0.5|l1:0.5
81
+ mel_vmax: 1.5
82
+ mel_vmin: -6
83
+ min_frames: 128
84
+ min_level_db: -100
85
+ num_ckpt_keep: 3
86
+ num_heads: 2
87
+ num_mels: 80
88
+ num_sanity_val_steps: -1
89
+ num_spk: 10
90
+ num_test_samples: 30
91
+ num_valid_plots: 10
92
+ optimizer_adam_beta1: 0.9
93
+ optimizer_adam_beta2: 0.98
94
+ out_wav_norm: false
95
+ pitch_extractor: parselmouth
96
+ pitch_type: frame
97
+ pre_align_args:
98
+ allow_no_txt: false
99
+ denoise: false
100
+ sox_resample: false
101
+ sox_to_wav: false
102
+ trim_sil: false
103
+ txt_processor: en
104
+ use_tone: true
105
+ pre_align_cls: egs.datasets.audio.emotion.pre_align.EmoPreAlign
106
+ print_nan_grads: false
107
+ processed_data_dir: data/processed/emotion,data/processed/LibriTTS
108
+ profile_infer: false
109
+ raw_data_dir: data/raw/ESD
110
+ ref_level_db: 20
111
+ rename_tmux: true
112
+ resblock: '1'
113
+ resblock_dilation_sizes:
114
+ - - 1
115
+ - 3
116
+ - 5
117
+ - - 1
118
+ - 3
119
+ - 5
120
+ - - 1
121
+ - 3
122
+ - 5
123
+ resblock_kernel_sizes:
124
+ - 3
125
+ - 7
126
+ - 11
127
+ resume_from_checkpoint: 0
128
+ save_best: true
129
+ save_codes: []
130
+ save_f0: false
131
+ save_gt: true
132
+ scheduler: rsqrt
133
+ seed: 1234
134
+ sort_by_len: true
135
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
136
+ tb_log_interval: 100
137
+ test_ids: []
138
+ test_input_dir: ''
139
+ test_num: 200
140
+ test_set_name: test
141
+ train_set_name: train
142
+ train_sets: ''
143
+ upsample_initial_channel: 512
144
+ upsample_kernel_sizes:
145
+ - 16
146
+ - 16
147
+ - 4
148
+ - 4
149
+ upsample_rates:
150
+ - 8
151
+ - 8
152
+ - 2
153
+ - 2
154
+ use_cdisc: false
155
+ use_cond_disc: false
156
+ use_emotion: true
157
+ use_fm_loss: false
158
+ use_ms_stft: false
159
+ use_pitch_embed: false
160
+ use_spec_disc: false
161
+ use_spk_embed: false
162
+ use_spk_id: true
163
+ use_split_spk_id: false
164
+ val_check_interval: 2000
165
+ valid_infer_interval: 10000
166
+ valid_monitor_key: val_loss
167
+ valid_monitor_mode: min
168
+ valid_set_name: valid
169
+ vocoder: pwg
170
+ vocoder_ckpt: ''
171
+ vocoder_denoise_c: 0.0
172
+ warmup_updates: 8000
173
+ weight_decay: 0
174
+ win_length: null
175
+ win_size: 1024
176
+ window: hann
177
+ word_size: 30000
178
+ work_dir: checkpoints/trainset_hifigan
checkpoints/trainset_hifigan/model_ckpt_steps_1000000.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a2577919899400a111ef42a2aba65797d282c259d083d2c276539dda9d17870
3
+ size 1016199247
image/violin.png ADDED