RMSnow committed on
Commit
0883aa1
1 Parent(s): df2accb

add backend inference and interface output

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +3 -0
  2. app.py +52 -13
  3. ckpts/svc/vocalist_l1_contentvec+whisper/args.json +2 -1
  4. config/audioldm.json +92 -0
  5. config/autoencoderkl.json +69 -0
  6. config/base.json +220 -0
  7. config/comosvc.json +216 -0
  8. config/diffusion.json +227 -0
  9. config/fs2.json +117 -0
  10. config/transformer.json +180 -0
  11. config/tts.json +23 -0
  12. config/valle.json +52 -0
  13. config/vits.json +101 -0
  14. config/vocoder.json +84 -0
  15. egs/vocoder/README.md +23 -0
  16. egs/vocoder/diffusion/README.md +0 -0
  17. egs/vocoder/diffusion/exp_config_base.json +0 -0
  18. egs/vocoder/gan/README.md +224 -0
  19. egs/vocoder/gan/_template/run.sh +143 -0
  20. egs/vocoder/gan/apnet/exp_config.json +45 -0
  21. egs/vocoder/gan/apnet/run.sh +143 -0
  22. egs/vocoder/gan/bigvgan/exp_config.json +66 -0
  23. egs/vocoder/gan/bigvgan/run.sh +143 -0
  24. egs/vocoder/gan/bigvgan_large/exp_config.json +70 -0
  25. egs/vocoder/gan/bigvgan_large/run.sh +143 -0
  26. egs/vocoder/gan/exp_config_base.json +111 -0
  27. egs/vocoder/gan/hifigan/exp_config.json +59 -0
  28. egs/vocoder/gan/hifigan/run.sh +143 -0
  29. egs/vocoder/gan/melgan/exp_config.json +34 -0
  30. egs/vocoder/gan/melgan/run.sh +143 -0
  31. egs/vocoder/gan/nsfhifigan/exp_config.json +83 -0
  32. egs/vocoder/gan/nsfhifigan/run.sh +143 -0
  33. egs/vocoder/gan/tfr_enhanced_hifigan/README.md +185 -0
  34. egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json +118 -0
  35. egs/vocoder/gan/tfr_enhanced_hifigan/run.sh +145 -0
  36. inference.py +6 -2
  37. modules/__init__.py +0 -0
  38. modules/activation_functions/__init__.py +7 -0
  39. modules/activation_functions/gated_activation_unit.py +61 -0
  40. modules/activation_functions/snake.py +122 -0
  41. modules/anti_aliasing/__init__.py +8 -0
  42. modules/anti_aliasing/act.py +35 -0
  43. modules/anti_aliasing/filter.py +99 -0
  44. modules/anti_aliasing/resample.py +64 -0
  45. modules/base/base_module.py +75 -0
  46. modules/diffusion/__init__.py +7 -0
  47. modules/diffusion/bidilconv/bidilated_conv.py +102 -0
  48. modules/diffusion/bidilconv/residual_block.py +73 -0
  49. modules/diffusion/karras/karras_diffusion.py +979 -0
  50. modules/diffusion/karras/random_utils.py +177 -0
.gitignore CHANGED
@@ -1,6 +1,9 @@
 __pycache__
 flagged
 result
+source_audios
+ckpts/svc/vocalist_l1_contentvec+whisper/data
+!ckpts/svc/vocalist_l1_contentvec+whisper/data/vocalist_l1
 
 # Developing mode
 _*.sh
app.py CHANGED
@@ -1,5 +1,11 @@
-import gradio as gr
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
 
+import gradio as gr
+import os
+import inference
 
 SUPPORTED_TARGET_SINGERS = {
     "Adele": "vocalist_l1_Adele",
@@ -21,33 +27,58 @@ SUPPORTED_TARGET_SINGERS = {
 
 
 def svc_inference(
-    source_audio,
+    source_audio_path,
     target_singer,
-    diffusion_steps=1000,
-    key_shift_mode="auto",
+    key_shift_mode="Auto Shift",
     key_shift_num=0,
+    diffusion_steps=1000,
 ):
-    pass
+    #### Prepare source audio file ####
+    print("source_audio_path: {}".format(source_audio_path))
+    audio_file = source_audio_path.split("/")[-1]
+    audio_name = audio_file.split(".")[0]
+    source_audio_dir = source_audio_path.replace(audio_file, "")
+
+    ### Target Singer ###
+    target_singer = SUPPORTED_TARGET_SINGERS[target_singer]
+
+    ### Inference ###
+    if key_shift_mode == "Auto Shift":
+        key_shift = "autoshift"
+    else:
+        key_shift = key_shift_num
+
+    args_list = ["--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json"]
+    args_list += ["--acoustics_dir", "ckpts/svc/vocalist_l1_contentvec+whisper"]
+    args_list += ["--vocoder_dir", "pretrained/bigvgan"]
+    args_list += ["--target_singer", target_singer]
+    args_list += ["--trans_key", str(key_shift)]
+    args_list += ["--diffusion_inference_steps", str(diffusion_steps)]
+    args_list += ["--source", source_audio_dir]
+    args_list += ["--output_dir", "result"]
+    args_list += ["--log_level", "debug"]
+
+    os.environ["WORK_DIR"] = "./"
+    inference.main(args_list)
+
+    ### Display ###
+    result_file = os.path.join(
+        "result/{}/{}_{}.wav".format(audio_name, audio_name, target_singer)
+    )
+    return result_file
 
 
 demo_inputs = [
     gr.Audio(
         sources=["upload", "microphone"],
         label="Upload (or record) a song you want to listen",
+        type="filepath",
     ),
     gr.Radio(
         choices=list(SUPPORTED_TARGET_SINGERS.keys()),
        label="Target Singer",
        value="Jian Li 李健",
    ),
-    gr.Slider(
-        1,
-        1000,
-        value=1000,
-        step=1,
-        label="Diffusion Inference Steps",
-        info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
-    ),
     gr.Radio(
        choices=["Auto Shift", "Key Shift"],
        value="Auto Shift",
@@ -62,6 +93,14 @@ demo_inputs = [
        label="Key Shift Values",
        info='How many semitones you want to transpose. This parameter will work only if you choose "Key Shift"',
    ),
+    gr.Slider(
+        1,
+        1000,
+        value=1000,
+        step=1,
+        label="Diffusion Inference Steps",
+        info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
+    ),
 ]
 
 demo_outputs = gr.Audio(label="")
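Note: the hunks above define the new `svc_inference` backend and the `demo_inputs`/`demo_outputs` Gradio components, but the code that ties them together lies outside the displayed ranges. A minimal sketch of how such a demo is typically wired up follows; the `gr.Interface` call, its title, and the `launch()` guard are illustrative assumptions, not part of this commit.

```python
import gradio as gr

# Hypothetical wiring (not shown in the hunks above): connect the input
# components and the output component to the svc_inference backend.
demo = gr.Interface(
    fn=svc_inference,      # backend function added in this commit
    inputs=demo_inputs,    # source audio, target singer, key-shift options, diffusion steps
    outputs=demo_outputs,  # converted audio returned as a file path
    title="Amphion Singing Voice Conversion",  # assumed title
)

if __name__ == "__main__":
    demo.launch()
```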
ckpts/svc/vocalist_l1_contentvec+whisper/args.json CHANGED
@@ -1,5 +1,5 @@
 {
-    "base_config": "config/diffusion.json",
+    "task_type": "svc",
     "dataset": [
         "vocalist_l1",
     ],
@@ -195,6 +195,7 @@
         "whisper_frameshift": 0.01,
         "whisper_model": "medium",
         "whisper_model_path": "pretrained/whisper/medium.pt",
+        "whisper_sample_rate": 16000,
         "win_size": 1024,
     },
     "supported_model_type": [
config/audioldm.json ADDED
@@ -0,0 +1,92 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AudioLDM",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false,
20
+ "cond_mask_prob": 0.1
21
+ },
22
+ // model
23
+ "model": {
24
+ "audioldm": {
25
+ "image_size": 32,
26
+ "in_channels": 4,
27
+ "out_channels": 4,
28
+ "model_channels": 256,
29
+ "attention_resolutions": [
30
+ 4,
31
+ 2,
32
+ 1
33
+ ],
34
+ "num_res_blocks": 2,
35
+ "channel_mult": [
36
+ 1,
37
+ 2,
38
+ 4
39
+ ],
40
+ "num_heads": 8,
41
+ "use_spatial_transformer": true,
42
+ "transformer_depth": 1,
43
+ "context_dim": 768,
44
+ "use_checkpoint": true,
45
+ "legacy": false
46
+ },
47
+ "autoencoderkl": {
48
+ "ch": 128,
49
+ "ch_mult": [
50
+ 1,
51
+ 1,
52
+ 2,
53
+ 2,
54
+ 4
55
+ ],
56
+ "num_res_blocks": 2,
57
+ "in_channels": 1,
58
+ "z_channels": 4,
59
+ "out_ch": 1,
60
+ "double_z": true
61
+ },
62
+ "noise_scheduler": {
63
+ "num_train_timesteps": 1000,
64
+ "beta_start": 0.00085,
65
+ "beta_end": 0.012,
66
+ "beta_schedule": "scaled_linear",
67
+ "clip_sample": false,
68
+ "steps_offset": 1,
69
+ "set_alpha_to_one": false,
70
+ "skip_prk_steps": true,
71
+ "prediction_type": "epsilon"
72
+ }
73
+ },
74
+ // train
75
+ "train": {
76
+ "lronPlateau": {
77
+ "factor": 0.9,
78
+ "patience": 100,
79
+ "min_lr": 4.0e-5,
80
+ "verbose": true
81
+ },
82
+ "adam": {
83
+ "lr": 5.0e-5,
84
+ "betas": [
85
+ 0.9,
86
+ 0.999
87
+ ],
88
+ "weight_decay": 1.0e-2,
89
+ "eps": 1.0e-8
90
+ }
91
+ }
92
+ }
config/autoencoderkl.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AutoencoderKL",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false
20
+ },
21
+ // model
22
+ "model": {
23
+ "autoencoderkl": {
24
+ "ch": 128,
25
+ "ch_mult": [
26
+ 1,
27
+ 1,
28
+ 2,
29
+ 2,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "in_channels": 1,
34
+ "z_channels": 4,
35
+ "out_ch": 1,
36
+ "double_z": true
37
+ },
38
+ "loss": {
39
+ "kl_weight": 1e-8,
40
+ "disc_weight": 0.5,
41
+ "disc_factor": 1.0,
42
+ "logvar_init": 0.0,
43
+ "min_adapt_d_weight": 0.0,
44
+ "max_adapt_d_weight": 10.0,
45
+ "disc_start": 50001,
46
+ "disc_in_channels": 1,
47
+ "disc_num_layers": 3,
48
+ "use_actnorm": false
49
+ }
50
+ },
51
+ // train
52
+ "train": {
53
+ "lronPlateau": {
54
+ "factor": 0.9,
55
+ "patience": 100,
56
+ "min_lr": 4.0e-5,
57
+ "verbose": true
58
+ },
59
+ "adam": {
60
+ "lr": 4.0e-4,
61
+ "betas": [
62
+ 0.9,
63
+ 0.999
64
+ ],
65
+ "weight_decay": 1.0e-2,
66
+ "eps": 1.0e-8
67
+ }
68
+ }
69
+ }
config/base.json ADDED
@@ -0,0 +1,220 @@
1
+ {
2
+ "supported_model_type": [
3
+ "GANVocoder",
4
+ "Fastspeech2",
5
+ "DiffSVC",
6
+ "Transformer",
7
+ "EDM",
8
+ "CD"
9
+ ],
10
+ "task_type": "",
11
+ "dataset": [],
12
+ "use_custom_dataset": false,
13
+ "preprocess": {
14
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
15
+ // trim audio silence
16
+ "data_augment": false,
17
+ "trim_silence": false,
18
+ "num_silent_frames": 8,
19
+ "trim_fft_size": 512, // fft size used in trimming
20
+ "trim_hop_size": 128, // hop size used in trimming
21
+ "trim_top_db": 30, // top db used in trimming sensitive to each dataset
22
+ // acoustic features
23
+ "extract_mel": false,
24
+ "mel_extract_mode": "",
25
+ "extract_linear_spec": false,
26
+ "extract_mcep": false,
27
+ "extract_pitch": false,
28
+ "extract_acoustic_token": false,
29
+ "pitch_remove_outlier": false,
30
+ "extract_uv": false,
31
+ "pitch_norm": false,
32
+ "extract_audio": false,
33
+ "extract_label": false,
34
+ "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
35
+ "extract_energy": false,
36
+ "energy_remove_outlier": false,
37
+ "energy_norm": false,
38
+ "energy_extract_mode": "from_mel",
39
+ "extract_duration": false,
40
+ "extract_amplitude_phase": false,
41
+ "mel_min_max_norm": false,
42
+ // lingusitic features
43
+ "extract_phone": false,
44
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
45
+ // content features
46
+ "extract_whisper_feature": false,
47
+ "extract_contentvec_feature": false,
48
+ "extract_mert_feature": false,
49
+ "extract_wenet_feature": false,
50
+ // Settings for data preprocessing
51
+ "n_mel": 80,
52
+ "win_size": 480,
53
+ "hop_size": 120,
54
+ "sample_rate": 24000,
55
+ "n_fft": 1024,
56
+ "fmin": 0,
57
+ "fmax": 12000,
58
+ "min_level_db": -115,
59
+ "ref_level_db": 20,
60
+ "bits": 8,
61
+ // Directory names of processed data or extracted features
62
+ "processed_dir": "processed_data",
63
+ "trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
64
+ "raw_data": "raw_data",
65
+ "phone_dir": "phones",
66
+ "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
67
+ "audio_dir": "audios",
68
+ "log_amplitude_dir": "log_amplitudes",
69
+ "phase_dir": "phases",
70
+ "real_dir": "reals",
71
+ "imaginary_dir": "imaginarys",
72
+ "label_dir": "labels",
73
+ "linear_dir": "linears",
74
+ "mel_dir": "mels", // directory name of extraced mel features
75
+ "mcep_dir": "mcep", // directory name of extraced mcep features
76
+ "dur_dir": "durs",
77
+ "symbols_dict": "symbols.dict",
78
+ "lab_dir": "labs", // directory name of extraced label features
79
+ "wenet_dir": "wenet", // directory name of extraced wenet features
80
+ "contentvec_dir": "contentvec", // directory name of extraced wenet features
81
+ "pitch_dir": "pitches", // directory name of extraced pitch features
82
+ "energy_dir": "energys", // directory name of extracted energy features
83
+ "phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
84
+ "phone_energy_dir": "phone_energys", // directory name of extracted energy features
85
+ "uv_dir": "uvs", // directory name of extracted unvoiced features
86
+ "duration_dir": "duration", // ground-truth duration file
87
+ "phone_seq_file": "phone_seq_file", // phoneme sequence file
88
+ "file_lst": "file.lst",
89
+ "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
90
+ "valid_file": "valid.json", // validattion set
91
+ "spk2id": "spk2id.json", // used for multi-speaker dataset
92
+ "utt2spk": "utt2spk", // used for multi-speaker dataset
93
+ "emo2id": "emo2id.json", // used for multi-emotion dataset
94
+ "utt2emo": "utt2emo", // used for multi-emotion dataset
95
+ // Features used for model training
96
+ "use_text": false,
97
+ "use_phone": false,
98
+ "use_phn_seq": false,
99
+ "use_lab": false,
100
+ "use_linear": false,
101
+ "use_mel": false,
102
+ "use_min_max_norm_mel": false,
103
+ "use_wav": false,
104
+ "use_phone_pitch": false,
105
+ "use_log_scale_pitch": false,
106
+ "use_phone_energy": false,
107
+ "use_phone_duration": false,
108
+ "use_log_scale_energy": false,
109
+ "use_wenet": false,
110
+ "use_dur": false,
111
+ "use_spkid": false, // True: use speaker id for multi-speaker dataset
112
+ "use_emoid": false, // True: use emotion id for multi-emotion dataset
113
+ "use_frame_pitch": false,
114
+ "use_uv": false,
115
+ "use_frame_energy": false,
116
+ "use_frame_duration": false,
117
+ "use_audio": false,
118
+ "use_label": false,
119
+ "use_one_hot": false,
120
+ "use_amplitude_phase": false,
121
+ "data_augment": false,
122
+ "align_mel_duration": false
123
+ },
124
+ "train": {
125
+ "ddp": true,
126
+ "random_seed": 970227,
127
+ "batch_size": 16,
128
+ "max_steps": 1000000,
129
+ // Trackers
130
+ "tracker": [
131
+ "tensorboard"
132
+ // "wandb",
133
+ // "cometml",
134
+ // "mlflow",
135
+ ],
136
+ "max_epoch": -1,
137
+ // -1 means no limit
138
+ "save_checkpoint_stride": [
139
+ 5,
140
+ 20
141
+ ],
142
+ // unit is epoch
143
+ "keep_last": [
144
+ 3,
145
+ -1
146
+ ],
147
+ // -1 means infinite, if one number will broadcast
148
+ "run_eval": [
149
+ false,
150
+ true
151
+ ],
152
+ // if one number will broadcast
153
+ // Fix the random seed
154
+ "random_seed": 10086,
155
+ // Optimizer
156
+ "optimizer": "AdamW",
157
+ "adamw": {
158
+ "lr": 4.0e-4
159
+ // nn model lr
160
+ },
161
+ // LR Scheduler
162
+ "scheduler": "ReduceLROnPlateau",
163
+ "reducelronplateau": {
164
+ "factor": 0.8,
165
+ "patience": 10,
166
+ // unit is epoch
167
+ "min_lr": 1.0e-4
168
+ },
169
+ // Batchsampler
170
+ "sampler": {
171
+ "holistic_shuffle": true,
172
+ "drop_last": true
173
+ },
174
+ // Dataloader
175
+ "dataloader": {
176
+ "num_worker": 32,
177
+ "pin_memory": true
178
+ },
179
+ "gradient_accumulation_step": 1,
180
+ "total_training_steps": 50000,
181
+ "save_summary_steps": 500,
182
+ "save_checkpoints_steps": 10000,
183
+ "valid_interval": 10000,
184
+ "keep_checkpoint_max": 5,
185
+ "multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model;
186
+ "max_epoch": -1,
187
+ // -1 means no limit
188
+ "save_checkpoint_stride": [
189
+ 5,
190
+ 20
191
+ ],
192
+ // unit is epoch
193
+ "keep_last": [
194
+ 3,
195
+ -1
196
+ ],
197
+ // -1 means infinite, if one number will broadcast
198
+ "run_eval": [
199
+ false,
200
+ true
201
+ ],
202
+ // Batchsampler
203
+ "sampler": {
204
+ "holistic_shuffle": true,
205
+ "drop_last": true
206
+ },
207
+ // Dataloader
208
+ "dataloader": {
209
+ "num_worker": 32,
210
+ "pin_memory": true
211
+ },
212
+ // Trackers
213
+ "tracker": [
214
+ "tensorboard"
215
+ // "wandb",
216
+ // "cometml",
217
+ // "mlflow",
218
+ ],
219
+ },
220
+ }
config/comosvc.json ADDED
@@ -0,0 +1,216 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "DiffComoSVC",
4
+ "task_type": "svc",
5
+ "use_custom_dataset": false,
6
+ "preprocess": {
7
+ // data augmentations
8
+ "use_pitch_shift": false,
9
+ "use_formant_shift": false,
10
+ "use_time_stretch": false,
11
+ "use_equalizer": false,
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_frame_energy": true,
79
+ "use_log_scale_pitch": false,
80
+ "use_log_scale_energy": false,
81
+ "use_spkid": true,
82
+ // Meta file
83
+ "train_file": "train.json",
84
+ "valid_file": "test.json",
85
+ "spk2id": "singers.json",
86
+ "utt2spk": "utt2singer"
87
+ },
88
+ "model": {
89
+ "teacher_model_path": "[Your Teacher Model Path].bin",
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ "input_melody_dim": 1,
93
+ "use_log_f0": true,
94
+ "n_bins_melody": 256,
95
+ //# Quantization (0 for not quantization)
96
+ "output_melody_dim": 384,
97
+ "input_loudness_dim": 1,
98
+ "use_log_loudness": true,
99
+ "n_bins_loudness": 256,
100
+ "output_loudness_dim": 384,
101
+ "use_whisper": false,
102
+ "use_contentvec": false,
103
+ "use_wenet": false,
104
+ "use_mert": false,
105
+ "whisper_dim": 1024,
106
+ "contentvec_dim": 256,
107
+ "mert_dim": 256,
108
+ "wenet_dim": 512,
109
+ "content_encoder_dim": 384,
110
+ "output_singer_dim": 384,
111
+ "singer_table_size": 512,
112
+ "output_content_dim": 384,
113
+ "use_spkid": true
114
+ },
115
+ "comosvc": {
116
+ "distill": false,
117
+ // conformer encoder
118
+ "input_dim": 384,
119
+ "output_dim": 100,
120
+ "n_heads": 2,
121
+ "n_layers": 6,
122
+ "filter_channels": 512,
123
+ "dropout": 0.1,
124
+ // karras diffusion
125
+ "P_mean": -1.2,
126
+ "P_std": 1.2,
127
+ "sigma_data": 0.5,
128
+ "sigma_min": 0.002,
129
+ "sigma_max": 80,
130
+ "rho": 7,
131
+ "n_timesteps": 40,
132
+ },
133
+ "diffusion": {
134
+ // Diffusion steps encoder
135
+ "step_encoder": {
136
+ "dim_raw_embedding": 128,
137
+ "dim_hidden_layer": 512,
138
+ "activation": "SiLU",
139
+ "num_layer": 2,
140
+ "max_period": 10000
141
+ },
142
+ // Diffusion decoder
143
+ "model_type": "bidilconv",
144
+ // bidilconv, unet2d, TODO: unet1d
145
+ "bidilconv": {
146
+ "base_channel": 384,
147
+ "n_res_block": 20,
148
+ "conv_kernel_size": 3,
149
+ "dilation_cycle_length": 4,
150
+ // specially, 1 means no dilation
151
+ "conditioner_size": 100
152
+ }
153
+ },
154
+ },
155
+ "train": {
156
+ // Basic settings
157
+ "fast_steps": 0,
158
+ "batch_size": 32,
159
+ "gradient_accumulation_step": 1,
160
+ "max_epoch": -1,
161
+ // -1 means no limit
162
+ "save_checkpoint_stride": [
163
+ 10,
164
+ 100
165
+ ],
166
+ // unit is epoch
167
+ "keep_last": [
168
+ 3,
169
+ -1
170
+ ],
171
+ // -1 means infinite, if one number will broadcast
172
+ "run_eval": [
173
+ false,
174
+ true
175
+ ],
176
+ // if one number will broadcast
177
+ // Fix the random seed
178
+ "random_seed": 10086,
179
+ // Batchsampler
180
+ "sampler": {
181
+ "holistic_shuffle": true,
182
+ "drop_last": true
183
+ },
184
+ // Dataloader
185
+ "dataloader": {
186
+ "num_worker": 32,
187
+ "pin_memory": true
188
+ },
189
+ // Trackers
190
+ "tracker": [
191
+ "tensorboard"
192
+ // "wandb",
193
+ // "cometml",
194
+ // "mlflow",
195
+ ],
196
+ // Optimizer
197
+ "optimizer": "AdamW",
198
+ "adamw": {
199
+ "lr": 4.0e-4
200
+ // nn model lr
201
+ },
202
+ // LR Scheduler
203
+ "scheduler": "ReduceLROnPlateau",
204
+ "reducelronplateau": {
205
+ "factor": 0.8,
206
+ "patience": 10,
207
+ // unit is epoch
208
+ "min_lr": 1.0e-4
209
+ }
210
+ },
211
+ "inference": {
212
+ "comosvc": {
213
+ "inference_steps": 40
214
+ }
215
+ }
216
+ }
config/diffusion.json ADDED
@@ -0,0 +1,227 @@
1
+ {
2
+ // FIXME: THESE ARE LEGACY
3
+ "base_config": "config/base.json",
4
+ "model_type": "diffusion",
5
+ "task_type": "svc",
6
+ "use_custom_dataset": false,
7
+ "preprocess": {
8
+ // data augmentations
9
+ "use_pitch_shift": false,
10
+ "use_formant_shift": false,
11
+ "use_time_stretch": false,
12
+ "use_equalizer": false,
13
+ // acoustic features
14
+ "extract_mel": true,
15
+ "mel_min_max_norm": true,
16
+ "extract_pitch": true,
17
+ "pitch_extractor": "parselmouth",
18
+ "extract_uv": true,
19
+ "extract_energy": true,
20
+ // content features
21
+ "extract_whisper_feature": false,
22
+ "whisper_sample_rate": 16000,
23
+ "extract_contentvec_feature": false,
24
+ "contentvec_sample_rate": 16000,
25
+ "extract_wenet_feature": false,
26
+ "wenet_sample_rate": 16000,
27
+ "extract_mert_feature": false,
28
+ "mert_sample_rate": 16000,
29
+ // Default config for whisper
30
+ "whisper_frameshift": 0.01,
31
+ "whisper_downsample_rate": 2,
32
+ // Default config for content vector
33
+ "contentvec_frameshift": 0.02,
34
+ // Default config for mert
35
+ "mert_model": "m-a-p/MERT-v1-330M",
36
+ "mert_feature_layer": -1,
37
+ "mert_hop_size": 320,
38
+ // 24k
39
+ "mert_frameshit": 0.01333,
40
+ // 10ms
41
+ "wenet_frameshift": 0.01,
42
+ // wenetspeech is 4, gigaspeech is 6
43
+ "wenet_downsample_rate": 4,
44
+ // Default config
45
+ "n_mel": 100,
46
+ "win_size": 1024,
47
+ // todo
48
+ "hop_size": 256,
49
+ "sample_rate": 24000,
50
+ "n_fft": 1024,
51
+ // todo
52
+ "fmin": 0,
53
+ "fmax": 12000,
54
+ // todo
55
+ "f0_min": 50,
56
+ // ~C2
57
+ "f0_max": 1100,
58
+ //1100, // ~C6(1100), ~G5(800)
59
+ "pitch_bin": 256,
60
+ "pitch_max": 1100.0,
61
+ "pitch_min": 50.0,
62
+ "is_label": true,
63
+ "is_mu_law": true,
64
+ "bits": 8,
65
+ "mel_min_max_stats_dir": "mel_min_max_stats",
66
+ "whisper_dir": "whisper",
67
+ "contentvec_dir": "contentvec",
68
+ "wenet_dir": "wenet",
69
+ "mert_dir": "mert",
70
+ // Extract content features using dataloader
71
+ "pin_memory": true,
72
+ "num_workers": 8,
73
+ "content_feature_batch_size": 16,
74
+ // Features used for model training
75
+ "use_mel": true,
76
+ "use_min_max_norm_mel": true,
77
+ "use_frame_pitch": true,
78
+ "use_uv": true,
79
+ "use_frame_energy": true,
80
+ "use_log_scale_pitch": false,
81
+ "use_log_scale_energy": false,
82
+ "use_spkid": true,
83
+ // Meta file
84
+ "train_file": "train.json",
85
+ "valid_file": "test.json",
86
+ "spk2id": "singers.json",
87
+ "utt2spk": "utt2singer"
88
+ },
89
+ "model": {
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ "input_melody_dim": 1,
93
+ "use_log_f0": true,
94
+ "n_bins_melody": 256,
95
+ //# Quantization (0 for not quantization)
96
+ "output_melody_dim": 384,
97
+ "input_loudness_dim": 1,
98
+ "use_log_loudness": true,
99
+ "n_bins_loudness": 256,
100
+ "output_loudness_dim": 384,
101
+ "use_whisper": false,
102
+ "use_contentvec": false,
103
+ "use_wenet": false,
104
+ "use_mert": false,
105
+ "whisper_dim": 1024,
106
+ "contentvec_dim": 256,
107
+ "mert_dim": 256,
108
+ "wenet_dim": 512,
109
+ "content_encoder_dim": 384,
110
+ "output_singer_dim": 384,
111
+ "singer_table_size": 512,
112
+ "output_content_dim": 384,
113
+ "use_spkid": true
114
+ },
115
+ // FIXME: FOLLOWING ARE NEW!!
116
+ "diffusion": {
117
+ "scheduler": "ddpm",
118
+ "scheduler_settings": {
119
+ "num_train_timesteps": 1000,
120
+ "beta_start": 1.0e-4,
121
+ "beta_end": 0.02,
122
+ "beta_schedule": "linear"
123
+ },
124
+ // Diffusion steps encoder
125
+ "step_encoder": {
126
+ "dim_raw_embedding": 128,
127
+ "dim_hidden_layer": 512,
128
+ "activation": "SiLU",
129
+ "num_layer": 2,
130
+ "max_period": 10000
131
+ },
132
+ // Diffusion decoder
133
+ "model_type": "bidilconv",
134
+ // bidilconv, unet2d, TODO: unet1d
135
+ "bidilconv": {
136
+ "base_channel": 384,
137
+ "n_res_block": 20,
138
+ "conv_kernel_size": 3,
139
+ "dilation_cycle_length": 4,
140
+ // specially, 1 means no dilation
141
+ "conditioner_size": 384
142
+ },
143
+ "unet2d": {
144
+ "in_channels": 1,
145
+ "out_channels": 1,
146
+ "down_block_types": [
147
+ "CrossAttnDownBlock2D",
148
+ "CrossAttnDownBlock2D",
149
+ "CrossAttnDownBlock2D",
150
+ "DownBlock2D"
151
+ ],
152
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
153
+ "up_block_types": [
154
+ "UpBlock2D",
155
+ "CrossAttnUpBlock2D",
156
+ "CrossAttnUpBlock2D",
157
+ "CrossAttnUpBlock2D"
158
+ ],
159
+ "only_cross_attention": false
160
+ }
161
+ }
162
+ },
163
+ // FIXME: FOLLOWING ARE NEW!!
164
+ "train": {
165
+ // Basic settings
166
+ "batch_size": 64,
167
+ "gradient_accumulation_step": 1,
168
+ "max_epoch": -1,
169
+ // -1 means no limit
170
+ "save_checkpoint_stride": [
171
+ 5,
172
+ 20
173
+ ],
174
+ // unit is epoch
175
+ "keep_last": [
176
+ 3,
177
+ -1
178
+ ],
179
+ // -1 means infinite, if one number will broadcast
180
+ "run_eval": [
181
+ false,
182
+ true
183
+ ],
184
+ // if one number will broadcast
185
+ // Fix the random seed
186
+ "random_seed": 10086,
187
+ // Batchsampler
188
+ "sampler": {
189
+ "holistic_shuffle": true,
190
+ "drop_last": true
191
+ },
192
+ // Dataloader
193
+ "dataloader": {
194
+ "num_worker": 32,
195
+ "pin_memory": true
196
+ },
197
+ // Trackers
198
+ "tracker": [
199
+ "tensorboard"
200
+ // "wandb",
201
+ // "cometml",
202
+ // "mlflow",
203
+ ],
204
+ // Optimizer
205
+ "optimizer": "AdamW",
206
+ "adamw": {
207
+ "lr": 4.0e-4
208
+ // nn model lr
209
+ },
210
+ // LR Scheduler
211
+ "scheduler": "ReduceLROnPlateau",
212
+ "reducelronplateau": {
213
+ "factor": 0.8,
214
+ "patience": 10,
215
+ // unit is epoch
216
+ "min_lr": 1.0e-4
217
+ }
218
+ },
219
+ "inference": {
220
+ "diffusion": {
221
+ "scheduler": "pndm",
222
+ "scheduler_settings": {
223
+ "num_inference_timesteps": 1000
224
+ }
225
+ }
226
+ }
227
+ }
config/fs2.json ADDED
@@ -0,0 +1,117 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "FastSpeech2",
4
+ "task_type": "tts",
5
+ "dataset": ["LJSpeech"],
6
+ "preprocess": {
7
+ // acoustic features
8
+ "extract_audio": true,
9
+ "extract_mel": true,
10
+ "mel_extract_mode": "taco",
11
+ "mel_min_max_norm": false,
12
+ "extract_pitch": true,
13
+ "extract_uv": false,
14
+ "pitch_extractor": "dio",
15
+ "extract_energy": true,
16
+ "energy_extract_mode": "from_tacotron_stft",
17
+ "extract_duration": true,
18
+ "use_phone": true,
19
+ "pitch_norm": true,
20
+ "energy_norm": true,
21
+ "pitch_remove_outlier": true,
22
+ "energy_remove_outlier": true,
23
+
24
+ // Default config
25
+ "n_mel": 80,
26
+ "win_size": 1024, // todo
27
+ "hop_size": 256,
28
+ "sample_rate": 22050,
29
+ "n_fft": 1024, // todo
30
+ "fmin": 0,
31
+ "fmax": 8000, // todo
32
+ "raw_data": "raw_data",
33
+ "text_cleaners": ["english_cleaners"],
34
+ "f0_min": 71, // ~C2
35
+ "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
36
+ "pitch_bin": 256,
37
+ "pitch_max": 1100.0,
38
+ "pitch_min": 50.0,
39
+ "is_label": true,
40
+ "is_mu_law": true,
41
+ "bits": 8,
42
+
43
+ "mel_min_max_stats_dir": "mel_min_max_stats",
44
+ "whisper_dir": "whisper",
45
+ "content_vector_dir": "content_vector",
46
+ "wenet_dir": "wenet",
47
+ "mert_dir": "mert",
48
+ "spk2id":"spk2id.json",
49
+ "utt2spk":"utt2spk",
50
+
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_min_max_norm_mel": false,
54
+ "use_frame_pitch": false,
55
+ "use_frame_energy": false,
56
+ "use_phone_pitch": true,
57
+ "use_phone_energy": true,
58
+ "use_log_scale_pitch": false,
59
+ "use_log_scale_energy": false,
60
+ "use_spkid": false,
61
+ "align_mel_duration": true,
62
+ "text_cleaners": ["english_cleaners"]
63
+ },
64
+ "model": {
65
+ // Settings for transformer
66
+ "transformer": {
67
+ "encoder_layer": 4,
68
+ "encoder_head": 2,
69
+ "encoder_hidden": 256,
70
+ "decoder_layer": 6,
71
+ "decoder_head": 2,
72
+ "decoder_hidden": 256,
73
+ "conv_filter_size": 1024,
74
+ "conv_kernel_size": [9, 1],
75
+ "encoder_dropout": 0.2,
76
+ "decoder_dropout": 0.2
77
+ },
78
+
79
+ // Settings for variance_predictor
80
+ "variance_predictor":{
81
+ "filter_size": 256,
82
+ "kernel_size": 3,
83
+ "dropout": 0.5
84
+ },
85
+ "variance_embedding":{
86
+ "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
87
+ "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
88
+ "n_bins": 256
89
+ },
90
+ "max_seq_len": 1000
91
+ },
92
+ "train":{
93
+ "batch_size": 16,
94
+ "sort_sample": true,
95
+ "drop_last": true,
96
+ "group_size": 4,
97
+ "grad_clip_thresh": 1.0,
98
+ "dataloader": {
99
+ "num_worker": 8,
100
+ "pin_memory": true
101
+ },
102
+ "lr_scheduler":{
103
+ "num_warmup": 4000
104
+ },
105
+ // LR Scheduler
106
+ "scheduler": "NoamLR",
107
+ // Optimizer
108
+ "optimizer": "Adam",
109
+ "adam": {
110
+ "lr": 0.0625,
111
+ "betas": [0.9, 0.98],
112
+ "eps": 0.000000001,
113
+ "weight_decay": 0.0
114
+ },
115
+ }
116
+
117
+ }
config/transformer.json ADDED
@@ -0,0 +1,180 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "Transformer",
4
+ "task_type": "svc",
5
+ "use_custom_dataset": false,
6
+ "preprocess": {
7
+ // data augmentations
8
+ "use_pitch_shift": false,
9
+ "use_formant_shift": false,
10
+ "use_time_stretch": false,
11
+ "use_equalizer": false,
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_frame_energy": true,
79
+ "use_log_scale_pitch": false,
80
+ "use_log_scale_energy": false,
81
+ "use_spkid": true,
82
+ // Meta file
83
+ "train_file": "train.json",
84
+ "valid_file": "test.json",
85
+ "spk2id": "singers.json",
86
+ "utt2spk": "utt2singer"
87
+ },
88
+ "model": {
89
+ "condition_encoder": {
90
+ "merge_mode": "add",
91
+ "input_melody_dim": 1,
92
+ "use_log_f0": true,
93
+ "n_bins_melody": 256,
94
+ //# Quantization (0 for not quantization)
95
+ "output_melody_dim": 384,
96
+ "input_loudness_dim": 1,
97
+ "use_log_loudness": true,
98
+ "n_bins_loudness": 256,
99
+ "output_loudness_dim": 384,
100
+ "use_whisper": false,
101
+ "use_contentvec": true,
102
+ "use_wenet": false,
103
+ "use_mert": false,
104
+ "whisper_dim": 1024,
105
+ "contentvec_dim": 256,
106
+ "mert_dim": 256,
107
+ "wenet_dim": 512,
108
+ "content_encoder_dim": 384,
109
+ "output_singer_dim": 384,
110
+ "singer_table_size": 512,
111
+ "output_content_dim": 384,
112
+ "use_spkid": true
113
+ },
114
+ "transformer": {
115
+ "type": "conformer",
116
+ // 'conformer' or 'transformer'
117
+ "input_dim": 384,
118
+ "output_dim": 100,
119
+ "n_heads": 2,
120
+ "n_layers": 6,
121
+ "filter_channels": 512,
122
+ "dropout": 0.1,
123
+ }
124
+ },
125
+ "train": {
126
+ // Basic settings
127
+ "batch_size": 64,
128
+ "gradient_accumulation_step": 1,
129
+ "max_epoch": -1,
130
+ // -1 means no limit
131
+ "save_checkpoint_stride": [
132
+ 10,
133
+ 100
134
+ ],
135
+ // unit is epoch
136
+ "keep_last": [
137
+ 3,
138
+ -1
139
+ ],
140
+ // -1 means infinite, if one number will broadcast
141
+ "run_eval": [
142
+ false,
143
+ true
144
+ ],
145
+ // if one number will broadcast
146
+ // Fix the random seed
147
+ "random_seed": 10086,
148
+ // Batchsampler
149
+ "sampler": {
150
+ "holistic_shuffle": true,
151
+ "drop_last": true
152
+ },
153
+ // Dataloader
154
+ "dataloader": {
155
+ "num_worker": 32,
156
+ "pin_memory": true
157
+ },
158
+ // Trackers
159
+ "tracker": [
160
+ "tensorboard"
161
+ // "wandb",
162
+ // "cometml",
163
+ // "mlflow",
164
+ ],
165
+ // Optimizer
166
+ "optimizer": "AdamW",
167
+ "adamw": {
168
+ "lr": 4.0e-4
169
+ // nn model lr
170
+ },
171
+ // LR Scheduler
172
+ "scheduler": "ReduceLROnPlateau",
173
+ "reducelronplateau": {
174
+ "factor": 0.8,
175
+ "patience": 10,
176
+ // unit is epoch
177
+ "min_lr": 1.0e-4
178
+ }
179
+ }
180
+ }
config/tts.json ADDED
@@ -0,0 +1,23 @@
+{
+    "base_config": "config/base.json",
+    "supported_model_type": [
+        "Fastspeech2",
+        "VITS",
+        "VALLE",
+    ],
+    "task_type": "tts",
+    "preprocess": {
+        "language": "en-us",
+        // linguistic features
+        "extract_phone": true,
+        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
+        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+        // Directory names of processed data or extracted features
+        "phone_dir": "phones",
+        "use_phone": true,
+    },
+    "model": {
+        "text_token_num": 512,
+    }
+
+}
config/valle.json ADDED
@@ -0,0 +1,52 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VALLE",
4
+ "task_type": "tts",
5
+ "dataset": [
6
+ "libritts"
7
+ ],
8
+ "preprocess": {
9
+ "extract_phone": true,
10
+ "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
11
+ "extract_acoustic_token": true,
12
+ "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
13
+ "acoustic_token_dir": "acoutic_tokens",
14
+ "use_text": false,
15
+ "use_phone": true,
16
+ "use_acoustic_token": true,
17
+ "symbols_dict": "symbols.dict",
18
+ "min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
19
+ "max_duration": 14, // the duration uperbound to filter the audio with duration > max_duration.
20
+ "sampling_rate": 24000,
21
+ },
22
+ "model": {
23
+ "text_token_num": 512,
24
+ "audio_token_num": 1024,
25
+ "decoder_dim": 1024, // embedding dimension of the decoder model
26
+ "nhead": 16, // number of attention heads in the decoder layers
27
+ "num_decoder_layers": 12, // number of decoder layers
28
+ "norm_first": true, // pre or post Normalization.
29
+ "add_prenet": false, // whether add PreNet after Inputs
30
+ "prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
31
+ "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
32
+ "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
33
+ "prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
34
+ "num_quantizers": 8, // numbert of the audio quantization layers
35
+ // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
36
+ },
37
+ "train": {
38
+ "ddp": false,
39
+ "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
40
+ "max_epoch": 20,
41
+ "optimizer": "ScaledAdam",
42
+ "scheduler": "Eden",
43
+ "warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
44
+ "base_lr": 0.05, // base learning rate."
45
+ "valid_interval": 1000,
46
+ "log_epoch_step": 1000,
47
+ "save_checkpoint_stride": [
48
+ 1,
49
+ 1
50
+ ]
51
+ }
52
+ }
config/vits.json ADDED
@@ -0,0 +1,101 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VITS",
4
+ "task_type": "tts",
5
+ "preprocess": {
6
+ "extract_phone": true,
7
+ "extract_mel": true,
8
+ "n_mel": 80,
9
+ "fmin": 0,
10
+ "fmax": null,
11
+ "extract_linear_spec": true,
12
+ "extract_audio": true,
13
+ "use_linear": true,
14
+ "use_mel": true,
15
+ "use_audio": true,
16
+ "use_text": false,
17
+ "use_phone": true,
18
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
19
+ "n_fft": 1024,
20
+ "win_size": 1024,
21
+ "hop_size": 256,
22
+ "segment_size": 8192,
23
+ "text_cleaners": [
24
+ "english_cleaners"
25
+ ]
26
+ },
27
+ "model": {
28
+ "text_token_num": 512,
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0.1,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [
38
+ 3,
39
+ 7,
40
+ 11
41
+ ],
42
+ "resblock_dilation_sizes": [
43
+ [
44
+ 1,
45
+ 3,
46
+ 5
47
+ ],
48
+ [
49
+ 1,
50
+ 3,
51
+ 5
52
+ ],
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ]
58
+ ],
59
+ "upsample_rates": [
60
+ 8,
61
+ 8,
62
+ 2,
63
+ 2
64
+ ],
65
+ "upsample_initial_channel": 512,
66
+ "upsample_kernel_sizes": [
67
+ 16,
68
+ 16,
69
+ 4,
70
+ 4
71
+ ],
72
+ "n_layers_q": 3,
73
+ "use_spectral_norm": false,
74
+ "n_speakers": 0, // number of speakers, while be automatically set if n_speakers is 0 and multi_speaker_training is true
75
+ "gin_channels": 256,
76
+ "use_sdp": true
77
+ },
78
+ "train": {
79
+ "fp16_run": true,
80
+ "learning_rate": 2e-4,
81
+ "betas": [
82
+ 0.8,
83
+ 0.99
84
+ ],
85
+ "eps": 1e-9,
86
+ "batch_size": 16,
87
+ "lr_decay": 0.999875,
88
+ // "segment_size": 8192,
89
+ "init_lr_ratio": 1,
90
+ "warmup_epochs": 0,
91
+ "c_mel": 45,
92
+ "c_kl": 1.0,
93
+ "AdamW": {
94
+ "betas": [
95
+ 0.8,
96
+ 0.99
97
+ ],
98
+ "eps": 1e-9,
99
+ }
100
+ }
101
+ }
config/vocoder.json ADDED
@@ -0,0 +1,84 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "dataset": [
4
+ "LJSpeech",
5
+ "LibriTTS",
6
+ "opencpop",
7
+ "m4singer",
8
+ "svcc",
9
+ "svcceval",
10
+ "pjs",
11
+ "opensinger",
12
+ "popbutfy",
13
+ "nus48e",
14
+ "popcs",
15
+ "kising",
16
+ "csd",
17
+ "opera",
18
+ "vctk",
19
+ "lijian",
20
+ "cdmusiceval"
21
+ ],
22
+ "task_type": "vocoder",
23
+ "preprocess": {
24
+ // acoustic features
25
+ "extract_mel": true,
26
+ "extract_pitch": false,
27
+ "extract_uv": false,
28
+ "extract_audio": true,
29
+ "extract_label": false,
30
+ "extract_one_hot": false,
31
+ "extract_amplitude_phase": false,
32
+ "pitch_extractor": "parselmouth",
33
+ // Settings for data preprocessing
34
+ "n_mel": 100,
35
+ "win_size": 1024,
36
+ "hop_size": 256,
37
+ "sample_rate": 24000,
38
+ "n_fft": 1024,
39
+ "fmin": 0,
40
+ "fmax": 12000,
41
+ "f0_min": 50,
42
+ "f0_max": 1100,
43
+ "pitch_bin": 256,
44
+ "pitch_max": 1100.0,
45
+ "pitch_min": 50.0,
46
+ "is_mu_law": false,
47
+ "bits": 8,
48
+ "cut_mel_frame": 32,
49
+ // Directory names of processed data or extracted features
50
+ "spk2id": "singers.json",
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_frame_pitch": false,
54
+ "use_uv": false,
55
+ "use_audio": true,
56
+ "use_label": false,
57
+ "use_one_hot": false,
58
+ "train_file": "train.json",
59
+ "valid_file": "test.json"
60
+ },
61
+ "train": {
62
+ "random_seed": 114514,
63
+ "batch_size": 64,
64
+ "gradient_accumulation_step": 1,
65
+ "max_epoch": 1000000,
66
+ "save_checkpoint_stride": [
67
+ 20
68
+ ],
69
+ "run_eval": [
70
+ true
71
+ ],
72
+ "sampler": {
73
+ "holistic_shuffle": true,
74
+ "drop_last": true
75
+ },
76
+ "dataloader": {
77
+ "num_worker": 4,
78
+ "pin_memory": true
79
+ },
80
+ "tracker": [
81
+ "tensorboard"
82
+ ],
83
+ }
84
+ }
egs/vocoder/README.md ADDED
@@ -0,0 +1,23 @@
+# Amphion Vocoder Recipe
+
+## Quick Start
+
+We provide a [**beginner recipe**](gan/tfr_enhanced_hifigan/README.md) to demonstrate how to train a high quality HiFi-GAN speech vocoder. Specially, it is also an official implementation of our paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". Some demos can be seen [here](https://vocodexelysium.github.io/MS-SB-CQTD/).
+
+## Supported Models
+
+Neural vocoder generates audible waveforms from acoustic representations, which is one of the key parts for current audio generation systems. Until now, Amphion has supported various widely-used vocoders according to different vocoder types, including:
+
+- **GAN-based vocoders**, which we have provided [**a unified recipe**](gan/README.md) :
+  - [MelGAN](https://arxiv.org/abs/1910.06711)
+  - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
+  - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
+  - [BigVGAN](https://arxiv.org/abs/2206.04658)
+  - [APNet](https://arxiv.org/abs/2305.07952)
+- **Flow-based vocoders** (👨‍💻 developing):
+  - [WaveGlow](https://arxiv.org/abs/1811.00002)
+- **Diffusion-based vocoders** (👨‍💻 developing):
+  - [Diffwave](https://arxiv.org/abs/2009.09761)
+- **Auto-regressive based vocoders** (👨‍💻 developing):
+  - [WaveNet](https://arxiv.org/abs/1609.03499)
+  - [WaveRNN](https://arxiv.org/abs/1802.08435v1)
egs/vocoder/diffusion/README.md ADDED
File without changes
egs/vocoder/diffusion/exp_config_base.json ADDED
File without changes
egs/vocoder/gan/README.md ADDED
@@ -0,0 +1,224 @@
1
+ # Amphion GAN-based Vocoder Recipe
2
+
3
+ ## Supported Model Architectures
4
+
5
+ GAN-based Vocoder consists of a generator and multiple discriminators, as illustrated below:
6
+
7
+ <br>
8
+ <div align="center">
9
+ <img src="../../../imgs/vocoder/gan/pipeline.png" width="40%">
10
+ </div>
11
+ <br>
12
+
13
+ Until now, Amphion GAN-based Vocoder has supported the following generators and discriminators.
14
+
15
+ - **Generators**
16
+ - [MelGAN](https://arxiv.org/abs/1910.06711)
17
+ - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
18
+ - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
19
+ - [BigVGAN](https://arxiv.org/abs/2206.04658)
20
+ - [APNet](https://arxiv.org/abs/2305.07952)
21
+ - **Discriminators**
22
+ - [Multi-Scale Discriminator](https://arxiv.org/abs/2010.05646)
23
+ - [Multi-Period Discriminator](https://arxiv.org/abs/2010.05646)
24
+ - [Multi-Resolution Discriminator](https://arxiv.org/abs/2011.09631)
25
+ - [Multi-Scale Short-Time Fourier Transform Discriminator](https://arxiv.org/abs/2210.13438)
26
+ - [**Multi-Scale Constant-Q Transfrom Discriminator (ours)**](https://arxiv.org/abs/2311.14957)
27
+
28
+ You can use any vocoder architecture with any dataset you want. There are four steps in total:
29
+
30
+ 1. Data preparation
31
+ 2. Feature extraction
32
+ 3. Training
33
+ 4. Inference
34
+
35
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
36
+ > ```bash
37
+ > cd Amphion
38
+ > ```
39
+
40
+ ## 1. Data Preparation
41
+
42
+ You can train the vocoder with any datasets. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md).
43
+
44
+ ### Configuration
45
+
46
+ Specify the dataset path in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets.
47
+
48
+ ```json
49
+ "dataset": [
50
+ "csd",
51
+ "kising",
52
+ "m4singer",
53
+ "nus48e",
54
+ "opencpop",
55
+ "opensinger",
56
+ "opera",
57
+ "pjs",
58
+ "popbutfy",
59
+ "popcs",
60
+ "ljspeech",
61
+ "vctk",
62
+ "libritts",
63
+ ],
64
+ "dataset_path": {
65
+ // TODO: Fill in your dataset path
66
+ "csd": "[dataset path]",
67
+ "kising": "[dataset path]",
68
+ "m4singer": "[dataset path]",
69
+ "nus48e": "[dataset path]",
70
+ "opencpop": "[dataset path]",
71
+ "opensinger": "[dataset path]",
72
+ "opera": "[dataset path]",
73
+ "pjs": "[dataset path]",
74
+ "popbutfy": "[dataset path]",
75
+ "popcs": "[dataset path]",
76
+ "ljspeech": "[dataset path]",
77
+ "vctk": "[dataset path]",
78
+ "libritts": "[dataset path]",
79
+ },
80
+ ```
81
+
82
+ ### 2. Feature Extraction
83
+
84
+ The needed features are speficied in the individual vocoder direction so it doesn't require any modification.
85
+
86
+ ### Configuration
87
+
88
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_base.json`:
89
+
90
+ ```json
91
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
92
+ "log_dir": "ckpts/vocoder",
93
+ "preprocess": {
94
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
95
+ "processed_dir": "data",
96
+ ...
97
+ },
98
+ ```
99
+
100
+ ### Run
101
+
102
+ Run the `run.sh` as the preproces stage (set `--stage 1`).
103
+
104
+ ```bash
105
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1
106
+ ```
107
+
108
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
109
+
110
+ ## 3. Training
111
+
112
+ ### Configuration
113
+
114
+ We provide the default hyparameters in the `exp_config_base.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines.
115
+
116
+ ```json
117
+ "train": {
118
+ "batch_size": 16,
119
+ "max_epoch": 1000000,
120
+ "save_checkpoint_stride": [20],
121
+ "adamw": {
122
+ "lr": 2.0e-4,
123
+ "adam_b1": 0.8,
124
+ "adam_b2": 0.99
125
+ },
126
+ "exponential_lr": {
127
+ "lr_decay": 0.999
128
+ },
129
+ }
130
+ ```
131
+
132
+ You can also choose any amount of prefered discriminators for training in the `exp_config_base.json`.
133
+
134
+ ```json
135
+ "discriminators": [
136
+ "msd",
137
+ "mpd",
138
+ "msstftd",
139
+ "mssbcqtd",
140
+ ],
141
+ ```
142
+
143
+ ### Run
144
+
145
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
146
+
147
+ ```bash
148
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName]
149
+ ```
150
+
151
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`.
152
+
153
+
154
+ ## 4. Inference
155
+
156
+ ### Run
157
+
158
+ Run the `run.sh` as the training stage (set `--stage 3`), we provide three different inference modes, including `infer_from_dataset`, `infer_from_feature`, `and infer_from_audio`.
159
+
160
+ ```bash
161
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
162
+ --infer_mode [Your chosen inference mode] \
163
+ --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \
164
+ --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
165
+ --infer_audio_dir [Your path to your audio files, needed when infer_form_audio] \
166
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
167
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
168
+ ```
169
+
170
+ #### a. Inference from Dataset
171
+
172
+ Run the `run.sh` with specified datasets, here is an example.
173
+
174
+ ```bash
175
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
176
+ --infer_mode infer_from_dataset \
177
+ --infer_datasets "libritts vctk ljspeech" \
178
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
179
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
180
+ ```
181
+
182
+ #### b. Inference from Features
183
+
184
+ If you want to inference from your generated acoustic features, you should first prepare your acoustic features into the following structure:
185
+
186
+ ```plaintext
187
+ ┣ {infer_feature_dir}
188
+ ┃ ┣ mels
189
+ ┃ ┃ ┣ sample1.npy
190
+ ┃ ┃ ┣ sample2.npy
191
+ ┃ ┣ f0s (required if you use NSF-HiFiGAN)
192
+ ┃ ┃ ┣ sample1.npy
193
+ ┃ ┃ ┣ sample2.npy
194
+ ```
195
+
196
+ Then run the `run.sh` with specificed folder direction, here is an example.
197
+
198
+ ```bash
199
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
200
+ --infer_mode infer_from_feature \
201
+ --infer_feature_dir [Your path to your predicted acoustic features] \
202
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
203
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
204
+ ```
205
+
206
+ #### c. Inference from Audios
207
+
208
+ If you want to inference from audios for quick analysis synthesis, you should first prepare your audios into the following structure:
209
+
210
+ ```plaintext
211
+ ┣ audios
212
+ ┃ ┣ sample1.wav
213
+ ┃ ┣ sample2.wav
214
+ ```
215
+
216
+ Then run the `run.sh` with specificed folder direction, here is an example.
217
+
218
+ ```bash
219
+ sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
220
+ --infer_mode infer_from_audio \
221
+ --infer_audio_dir [Your path to your audio files] \
222
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
223
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
224
+ ```
egs/vocoder/gan/_template/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1" exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/apnet/exp_config.json ADDED
@@ -0,0 +1,45 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+ "extract_amplitude_phase": true,
8
+
9
+ // Features used for model training
10
+ "use_mel": true,
11
+ "use_audio": true,
12
+ "use_amplitude_phase": true
13
+ },
14
+ "model": {
15
+ "generator": "apnet",
16
+ "apnet": {
17
+ "ASP_channel": 512,
18
+ "ASP_resblock_kernel_sizes": [3,7,11],
19
+ "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
20
+ "ASP_input_conv_kernel_size": 7,
21
+ "ASP_output_conv_kernel_size": 7,
22
+
23
+ "PSP_channel": 512,
24
+ "PSP_resblock_kernel_sizes": [3,7,11],
25
+ "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
+ "PSP_input_conv_kernel_size": 7,
27
+ "PSP_output_R_conv_kernel_size": 7,
28
+ "PSP_output_I_conv_kernel_size": 7,
29
+ }
30
+ },
31
+ "train": {
32
+ "criterions": [
33
+ "feature",
34
+ "discriminator",
35
+ "generator",
36
+ "mel",
37
+ "phase",
38
+ "amplitude",
39
+ "consistency"
40
+ ]
41
+ },
42
+ "inference": {
43
+ "batch_size": 1,
44
+ }
45
+ }
egs/vocoder/gan/apnet/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1" exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/bigvgan/exp_config.json ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "bigvgan",
14
+ "bigvgan": {
15
+ "resblock": "1",
16
+ "activation": "snakebeta",
17
+ "snake_logscale": true,
18
+ "upsample_rates": [
19
+ 8,
20
+ 8,
21
+ 2,
22
+ 2,
23
+ ],
24
+ "upsample_kernel_sizes": [
25
+ 16,
26
+ 16,
27
+ 4,
28
+ 4
29
+ ],
30
+ "upsample_initial_channel": 512,
31
+ "resblock_kernel_sizes": [
32
+ 3,
33
+ 7,
34
+ 11
35
+ ],
36
+ "resblock_dilation_sizes": [
37
+ [
38
+ 1,
39
+ 3,
40
+ 5
41
+ ],
42
+ [
43
+ 1,
44
+ 3,
45
+ 5
46
+ ],
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ]
52
+ ]
53
+ }
54
+ },
55
+ "train": {
56
+ "criterions": [
57
+ "feature",
58
+ "discriminator",
59
+ "generator",
60
+ "mel",
61
+ ]
62
+ },
63
+ "inference": {
64
+ "batch_size": 1,
65
+ }
66
+ }
egs/vocoder/gan/bigvgan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1" exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/bigvgan_large/exp_config.json ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "bigvgan",
14
+ "bigvgan": {
15
+ "resblock": "1",
16
+ "activation": "snakebeta",
17
+ "snake_logscale": true,
18
+ "upsample_rates": [
19
+ 4,
20
+ 4,
21
+ 2,
22
+ 2,
23
+ 2,
24
+ 2
25
+ ],
26
+ "upsample_kernel_sizes": [
27
+ 8,
28
+ 8,
29
+ 4,
30
+ 4,
31
+ 4,
32
+ 4
33
+ ],
34
+ "upsample_initial_channel": 1536,
35
+ "resblock_kernel_sizes": [
36
+ 3,
37
+ 7,
38
+ 11
39
+ ],
40
+ "resblock_dilation_sizes": [
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ],
46
+ [
47
+ 1,
48
+ 3,
49
+ 5
50
+ ],
51
+ [
52
+ 1,
53
+ 3,
54
+ 5
55
+ ]
56
+ ]
57
+ },
58
+ },
59
+ "train": {
60
+ "criterions": [
61
+ "feature",
62
+ "discriminator",
63
+ "generator",
64
+ "mel",
65
+ ]
66
+ },
67
+ "inference": {
68
+ "batch_size": 1,
69
+ }
70
+ }
egs/vocoder/gan/bigvgan_large/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1" exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/exp_config_base.json ADDED
@@ -0,0 +1,111 @@
1
+ {
2
+ "base_config": "config/vocoder.json",
3
+ "model_type": "GANVocoder",
4
+ // TODO: Choose your needed datasets
5
+ "dataset": [
6
+ "csd",
7
+ "kising",
8
+ "m4singer",
9
+ "nus48e",
10
+ "opencpop",
11
+ "opensinger",
12
+ "opera",
13
+ "pjs",
14
+ "popbutfy",
15
+ "popcs",
16
+ "ljspeech",
17
+ "vctk",
18
+ "libritts",
19
+ ],
20
+ "dataset_path": {
21
+ // TODO: Fill in your dataset path
22
+ "csd": "[dataset path]",
23
+ "kising": "[dataset path]",
24
+ "m4singer": "[dataset path]",
25
+ "nus48e": "[dataset path]",
26
+ "opencpop": "[dataset path]",
27
+ "opensinger": "[dataset path]",
28
+ "opera": "[dataset path]",
29
+ "pjs": "[dataset path]",
30
+ "popbutfy": "[dataset path]",
31
+ "popcs": "[dataset path]",
32
+ "ljspeech": "[dataset path]",
33
+ "vctk": "[dataset path]",
34
+ "libritts": "[dataset path]",
35
+ },
36
+ // TODO: Fill in the output log path
37
+ "log_dir": "ckpts/vocoder",
38
+ "preprocess": {
39
+ // Acoustic features
40
+ "extract_mel": true,
41
+ "extract_audio": true,
42
+ "extract_pitch": false,
43
+ "extract_uv": false,
44
+ "pitch_extractor": "parselmouth",
45
+
46
+ // Features used for model training
47
+ "use_mel": true,
48
+ "use_frame_pitch": false,
49
+ "use_uv": false,
50
+ "use_audio": true,
51
+
52
+ // TODO: Fill in the output data path
53
+ "processed_dir": "data/",
54
+ "n_mel": 100,
55
+ "sample_rate": 24000
56
+ },
57
+ "model": {
58
+ // TODO: Choose your needed discriminators
59
+ "discriminators": [
60
+ "msd",
61
+ "mpd",
62
+ "msstftd",
63
+ "mssbcqtd",
64
+ ],
65
+ "mpd": {
66
+ "mpd_reshapes": [
67
+ 2,
68
+ 3,
69
+ 5,
70
+ 7,
71
+ 11
72
+ ],
73
+ "use_spectral_norm": false,
74
+ "discriminator_channel_mult_factor": 1
75
+ },
76
+ "mrd": {
77
+ "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
78
+ "use_spectral_norm": false,
79
+ "discriminator_channel_mult_factor": 1,
80
+ "mrd_override": false
81
+ },
82
+ "msstftd": {
83
+ "filters": 32
84
+ },
85
+ "mssbcqtd": {
86
+ hop_lengths: [512, 256, 256],
87
+ filters: 32,
88
+ max_filters: 1024,
89
+ filters_scale: 1,
90
+ dilations: [1, 2, 4],
91
+ in_channels: 1,
92
+ out_channels: 1,
93
+ n_octaves: [9, 9, 9],
94
+ bins_per_octaves: [24, 36, 48]
95
+ },
96
+ },
97
+ "train": {
98
+ // TODO: Choose a suitable batch size, training epoch, and save stride
99
+ "batch_size": 32,
100
+ "max_epoch": 1000000,
101
+ "save_checkpoint_stride": [20],
102
+ "adamw": {
103
+ "lr": 2.0e-4,
104
+ "adam_b1": 0.8,
105
+ "adam_b2": 0.99
106
+ },
107
+ "exponential_lr": {
108
+ "lr_decay": 0.999
109
+ },
110
+ }
111
+ }
egs/vocoder/gan/hifigan/exp_config.json ADDED
@@ -0,0 +1,59 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "hifigan",
14
+ "hifigan": {
15
+ "resblock": "2",
16
+ "upsample_rates": [
17
+ 8,
18
+ 8,
19
+ 4
20
+ ],
21
+ "upsample_kernel_sizes": [
22
+ 16,
23
+ 16,
24
+ 8
25
+ ],
26
+ "upsample_initial_channel": 256,
27
+ "resblock_kernel_sizes": [
28
+ 3,
29
+ 5,
30
+ 7
31
+ ],
32
+ "resblock_dilation_sizes": [
33
+ [
34
+ 1,
35
+ 2
36
+ ],
37
+ [
38
+ 2,
39
+ 6
40
+ ],
41
+ [
42
+ 3,
43
+ 12
44
+ ]
45
+ ]
46
+ }
47
+ },
48
+ "train": {
49
+ "criterions": [
50
+ "feature",
51
+ "discriminator",
52
+ "generator",
53
+ "mel",
54
+ ]
55
+ },
56
+ "inference": {
57
+ "batch_size": 1,
58
+ }
59
+ }
egs/vocoder/gan/hifigan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1" exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/melgan/exp_config.json ADDED
@@ -0,0 +1,34 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+
8
+ // Features used for model training
9
+ "use_mel": true,
10
+ "use_audio": true
11
+ },
12
+ "model": {
13
+ "generator": "melgan",
14
+ "melgan": {
15
+ "ratios": [8, 8, 2, 2],
16
+ "ngf": 32,
17
+ "n_residual_layers": 3,
18
+ "num_D": 3,
19
+ "ndf": 16,
20
+ "n_layers": 4,
21
+ "downsampling_factor": 4
22
+ },
23
+ },
24
+ "train": {
25
+ "criterions": [
26
+ "feature",
27
+ "discriminator",
28
+ "generator",
29
+ ]
30
+ },
31
+ "inference": {
32
+ "batch_size": 1,
33
+ }
34
+ }
egs/vocoder/gan/melgan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1" exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/nsfhifigan/exp_config.json ADDED
@@ -0,0 +1,83 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "preprocess": {
4
+ // acoustic features
5
+ "extract_mel": true,
6
+ "extract_audio": true,
7
+ "extract_pitch": true,
8
+
9
+ // Features used for model training
10
+ "use_mel": true,
11
+ "use_audio": true,
12
+ "use_frame_pitch": true
13
+ },
14
+ "model": {
15
+ "generator": "nsfhifigan",
16
+ "nsfhifigan": {
17
+ "resblock": "1",
18
+ "harmonic_num": 8,
19
+ "upsample_rates": [
20
+ 8,
21
+ 4,
22
+ 2,
23
+ 2,
24
+ 2
25
+ ],
26
+ "upsample_kernel_sizes": [
27
+ 16,
28
+ 8,
29
+ 4,
30
+ 4,
31
+ 4
32
+ ],
33
+ "upsample_initial_channel": 768,
34
+ "resblock_kernel_sizes": [
35
+ 3,
36
+ 7,
37
+ 11
38
+ ],
39
+ "resblock_dilation_sizes": [
40
+ [
41
+ 1,
42
+ 3,
43
+ 5
44
+ ],
45
+ [
46
+ 1,
47
+ 3,
48
+ 5
49
+ ],
50
+ [
51
+ 1,
52
+ 3,
53
+ 5
54
+ ]
55
+ ]
56
+ },
57
+ "mpd": {
58
+ "mpd_reshapes": [
59
+ 2,
60
+ 3,
61
+ 5,
62
+ 7,
63
+ 11,
64
+ 17,
65
+ 23,
66
+ 37
67
+ ],
68
+ "use_spectral_norm": false,
69
+ "discriminator_channel_multi": 1
70
+ }
71
+ },
72
+ "train": {
73
+ "criterions": [
74
+ "feature",
75
+ "discriminator",
76
+ "generator",
77
+ "mel",
78
+ ]
79
+ },
80
+ "inference": {
81
+ "batch_size": 1,
82
+ }
83
+ }
egs/vocoder/gan/nsfhifigan/run.sh ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1" exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ if [ $infer_mode = "infer_from_dataset" ]; then
114
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
+ --config $exp_config \
116
+ --infer_mode $infer_mode \
117
+ --infer_datasets $infer_datasets \
118
+ --vocoder_dir $infer_expt_dir \
119
+ --output_dir $infer_output_dir \
120
+ --log_level debug
121
+ fi
122
+
123
+ if [ $infer_mode = "infer_from_feature" ]; then
124
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
+ --config $exp_config \
126
+ --infer_mode $infer_mode \
127
+ --feature_folder $infer_feature_dir \
128
+ --vocoder_dir $infer_expt_dir \
129
+ --output_dir $infer_output_dir \
130
+ --log_level debug
131
+ fi
132
+
133
+ if [ $infer_mode = "infer_from_audio" ]; then
134
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
+ --config $exp_config \
136
+ --infer_mode $infer_mode \
137
+ --audio_folder $infer_audio_dir \
138
+ --vocoder_dir $infer_expt_dir \
139
+ --output_dir $infer_output_dir \
140
+ --log_level debug
141
+ fi
142
+
143
+ fi
egs/vocoder/gan/tfr_enhanced_hifigan/README.md ADDED
@@ -0,0 +1,185 @@
1
+ # Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2311.14957)
4
+ [![demo](https://img.shields.io/badge/Vocoder-Demo-red)](https://vocodexelysium.github.io/MS-SB-CQTD/)
5
+
6
+ <br>
7
+ <div align="center">
8
+ <img src="../../../../imgs/vocoder/gan/MSSBCQTD.png" width="80%">
9
+ </div>
10
+ <br>
11
+
12
+ This is the official implementation of the paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". In this recipe, we will illustrate how to train a high-quality HiFi-GAN on LibriTTS, VCTK, and LJSpeech by utilizing multiple Time-Frequency-Representation-based discriminators.
13
+
14
+ There are four stages in total:
15
+
16
+ 1. Data preparation
17
+ 2. Feature extraction
18
+ 3. Training
19
+ 4. Inference
20
+
21
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
22
+ > ```bash
23
+ > cd Amphion
24
+ > ```
25
+
26
+ ## 1. Data Preparation
27
+
28
+ ### Dataset Download
29
+
30
+ By default, we use three datasets for training: LibriTTS, VCTK, and LJSpeech. How to download them is detailed [here](../../../datasets/README.md).
31
+
32
+ ### Configuration
33
+
34
+ Specify the dataset path in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
35
+
36
+ ```json
37
+ "dataset": [
38
+ "ljspeech",
39
+ "vctk",
40
+ "libritts",
41
+ ],
42
+ "dataset_path": {
43
+ // TODO: Fill in your dataset path
44
+ "ljspeech": "[LJSpeech dataset path]",
45
+ "vctk": "[VCTK dataset path]",
46
+ "libritts": "[LibriTTS dataset path]",
47
+ },
48
+ ```
49
+
50
+ ## 2. Features Extraction
51
+
52
+ For HiFiGAN, only the mel-spectrogram and the target audio are needed for training.
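+
+ As a rough illustration of what such a mel-spectrogram feature looks like, the sketch below computes a 100-bin mel at 24 kHz with `librosa`. This is only for intuition: Amphion's preprocessing uses its own STFT/mel settings from the config, and the FFT and hop sizes below are assumptions rather than the recipe's exact values.
+
+ ```python
+ import librosa
+ import numpy as np
+
+ # Hypothetical input utterance, loaded at the sample rate used by these configs.
+ y, sr = librosa.load("path/to/utterance.wav", sr=24000, mono=True)
+
+ # Assumed analysis parameters, for illustration only.
+ mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=100)
+ log_mel = np.log(np.clip(mel, 1e-5, None))
+
+ print(log_mel.shape)  # (n_mels, n_frames), e.g. (100, ...)
+ ```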
53
+
54
+ ### Configuration
55
+
56
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
57
+
58
+ ```json
59
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
60
+ "log_dir": "ckpts/vocoder",
61
+ "preprocess": {
62
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
63
+ "processed_dir": "data",
64
+ ...
65
+ },
66
+ ```
67
+
68
+ ### Run
69
+
70
+ Run `run.sh` as the preprocessing stage (set `--stage 1`).
71
+
72
+ ```bash
73
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1
74
+ ```
75
+
76
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., `--gpu "1"`.
77
+
78
+ ## 3. Training
79
+
80
+ ### Configuration
81
+
82
+ We provide the default hyperparameters in `exp_config.json`. They work on a single NVIDIA 24 GB GPU. You can adjust them based on your GPU machines.
83
+
84
+ ```json
85
+ "train": {
86
+ "batch_size": 32,
87
+ ...
88
+ }
89
+ ```
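+
+ Note that this recipe's `exp_config.json` only overrides a subset of the shared `egs/vocoder/gan/exp_config_base.json`, which it references through its `base_config` field (and which in turn builds on `config/vocoder.json`). The sketch below shows how such an override chain could be resolved; it assumes a plain recursive dictionary merge over comment-free JSON, whereas Amphion's actual loader also handles the `//` comments used in these files.
+
+ ```python
+ import json
+
+
+ def merge(base: dict, override: dict) -> dict:
+     """Recursively overlay `override` on top of `base`."""
+     out = dict(base)
+     for key, value in override.items():
+         if isinstance(value, dict) and isinstance(out.get(key), dict):
+             out[key] = merge(out[key], value)
+         else:
+             out[key] = value
+     return out
+
+
+ def load_config(path: str) -> dict:
+     # Assumes plain JSON; the real config files also contain // comments.
+     with open(path) as f:
+         cfg = json.load(f)
+     base_path = cfg.pop("base_config", None)
+     return merge(load_config(base_path), cfg) if base_path else cfg
+
+
+ # cfg = load_config("egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json")
+ # print(cfg["train"]["batch_size"])  # 16 here, overriding the base value of 32
+ ```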
90
+
91
+ ### Run
92
+
93
+ Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
94
+
95
+ ```bash
96
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName]
97
+ ```
98
+
99
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., `--gpu "0,1,2,3"`.
100
+
101
+ ## 4. Inference
102
+
103
+ ### Pretrained Vocoder Download
104
+
105
+ We trained a HiFiGAN checkpoint on around 685 hours of speech data. The final pretrained checkpoint is released [here](../../../../pretrained/hifigan/README.md).
106
+
107
+ ### Run
108
+
109
+ Run `run.sh` as in the previous stages, but set `--stage 3`. We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
110
+
111
+ ```bash
112
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
113
+ --infer_mode [Your chosen inference mode] \
114
+ --infer_datasets [Datasets you want to infer from, needed when infer_from_dataset] \
115
+ --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
116
+ --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
117
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
118
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
119
+ ```
120
+
121
+ #### a. Inference from Dataset
122
+
123
+ Run `run.sh` with the specified datasets. Here is an example:
124
+
125
+ ```bash
126
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
127
+ --infer_mode infer_from_dataset \
128
+ --infer_datasets "libritts vctk ljspeech" \
129
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
130
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
131
+ ```
132
+
133
+ #### b. Inference from Features
134
+
135
+ If you want to run inference from your own predicted acoustic features, first prepare them in the following structure:
136
+
137
+ ```plaintext
138
+ ┣ {infer_feature_dir}
139
+ ┃ ┣ mels
140
+ ┃ ┃ ┣ sample1.npy
141
+ ┃ ┃ ┣ sample2.npy
142
+ ```
143
+
144
+ Then run `run.sh` pointing to that folder. Here is an example:
145
+
146
+ ```bash
147
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
148
+ --infer_mode infer_from_feature \
149
+ --infer_feature_dir [Your path to your predicted acoustic features] \
150
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
151
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
152
+ ```
153
+
154
+ #### c. Inference from Audios
155
+
156
+ If you want to run inference directly from audio files (for a quick analysis-synthesis check), first prepare your audios in the following structure:
157
+
158
+ ```plaintext
159
+ ┣ audios
160
+ ┃ ┣ sample1.wav
161
+ ┃ ┣ sample2.wav
162
+ ```
163
+
164
+ Then run `run.sh` pointing to that folder. Here is an example:
165
+
166
+ ```bash
167
+ sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
168
+ --infer_mode infer_from_audio \
169
+ --infer_audio_dir [Your path to your audio files] \
170
+ --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
171
+ --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
172
+ ```
173
+
174
+ ## Citations
175
+
176
+ ```bibtex
177
+ @misc{gu2023cqt,
178
+ title={Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder},
179
+ author={Yicheng Gu and Xueyao Zhang and Liumeng Xue and Zhizheng Wu},
180
+ year={2023},
181
+ eprint={2311.14957},
182
+ archivePrefix={arXiv},
183
+ primaryClass={cs.SD}
184
+ }
185
+ ```
egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "base_config": "egs/vocoder/gan/exp_config_base.json",
3
+ "model_type": "GANVocoder",
4
+ "dataset": [
5
+ "ljspeech",
6
+ "vctk",
7
+ "libritts",
8
+ ],
9
+ "dataset_path": {
10
+ // TODO: Fill in your dataset path
11
+ "ljspeech": "[dataset path]",
12
+ "vctk": "[dataset path]",
13
+ "libritts": "[dataset path]",
14
+ },
15
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
16
+ "log_dir": "ckpts/vocoder",
17
+ "preprocess": {
18
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
19
+ "processed_dir": "data",
20
+ // acoustic features
21
+ "extract_mel": true,
22
+ "extract_audio": true,
23
+ "extract_pitch": false,
24
+ "extract_uv": false,
25
+ "extract_amplitude_phase": false,
26
+ "pitch_extractor": "parselmouth",
27
+ // Features used for model training
28
+ "use_mel": true,
29
+ "use_frame_pitch": false,
30
+ "use_uv": false,
31
+ "use_audio": true,
32
+ "n_mel": 100,
33
+ "sample_rate": 24000
34
+ },
35
+ "model": {
36
+ "generator": "hifigan",
37
+ "discriminators": [
38
+ "msd",
39
+ "mpd",
40
+ "mssbcqtd",
41
+ "msstftd",
42
+ ],
43
+ "hifigan": {
44
+ "resblock": "1",
45
+ "upsample_rates": [
46
+ 8,
47
+ 4,
48
+ 2,
49
+ 2,
50
+ 2
51
+ ],
52
+ "upsample_kernel_sizes": [
53
+ 16,
54
+ 8,
55
+ 4,
56
+ 4,
57
+ 4
58
+ ],
59
+ "upsample_initial_channel": 768,
60
+ "resblock_kernel_sizes": [
61
+ 3,
62
+ 5,
63
+ 7
64
+ ],
65
+ "resblock_dilation_sizes": [
66
+ [
67
+ 1,
68
+ 3,
69
+ 5
70
+ ],
71
+ [
72
+ 1,
73
+ 3,
74
+ 5
75
+ ],
76
+ [
77
+ 1,
78
+ 3,
79
+ 5
80
+ ]
81
+ ]
82
+ },
83
+ "mpd": {
84
+ "mpd_reshapes": [
85
+ 2,
86
+ 3,
87
+ 5,
88
+ 7,
89
+ 11,
90
+ 17,
91
+ 23,
92
+ 37
93
+ ],
94
+ "use_spectral_norm": false,
95
+ "discriminator_channel_multi": 1
96
+ }
97
+ },
98
+ "train": {
99
+ "batch_size": 16,
100
+ "adamw": {
101
+ "lr": 2.0e-4,
102
+ "adam_b1": 0.8,
103
+ "adam_b2": 0.99
104
+ },
105
+ "exponential_lr": {
106
+ "lr_decay": 0.999
107
+ },
108
+ "criterions": [
109
+ "feature",
110
+ "discriminator",
111
+ "generator",
112
+ "mel",
113
+ ]
114
+ },
115
+ "inference": {
116
+ "batch_size": 1,
117
+ }
118
+ }
egs/vocoder/gan/tfr_enhanced_hifigan/run.sh ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command Line ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --checkpoint) shift; checkpoint=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The inference mode
37
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
38
+ # [Only for Inference] The inferenced datasets
39
+ --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
+ # [Only for Inference] The feature dir for inference
41
+ --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
+ # [Only for Inference] The audio dir for inference
43
+ --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
47
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
+
49
+ --) shift ; break ;;
50
+ *) echo "Invalid option: $1"; exit 1 ;;
51
+ esac
52
+ done
53
+
54
+
55
+ ### Value check ###
56
+ if [ -z "$running_stage" ]; then
57
+ echo "[Error] Please specify the running stage"
58
+ exit 1
59
+ fi
60
+
61
+ if [ -z "$exp_config" ]; then
62
+ exp_config="${exp_dir}"/exp_config.json
63
+ fi
64
+ echo "Exprimental Configuration File: $exp_config"
65
+
66
+ if [ -z "$gpu" ]; then
67
+ gpu="0"
68
+ fi
69
+
70
+ ######## Features Extraction ###########
71
+ if [ $running_stage -eq 1 ]; then
72
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
+ --config $exp_config \
74
+ --num_workers 8
75
+ fi
76
+
77
+ ######## Training ###########
78
+ if [ $running_stage -eq 2 ]; then
79
+ if [ -z "$exp_name" ]; then
80
+ echo "[Error] Please specify the experiments name"
81
+ exit 1
82
+ fi
83
+ echo "Exprimental Name: $exp_name"
84
+
85
+ if [ "$resume" = true ]; then
86
+ echo "Automatically resume from the experimental dir..."
87
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
+ --config "$exp_config" \
89
+ --exp_name "$exp_name" \
90
+ --log_level info \
91
+ --resume
92
+ else
93
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
+ --config "$exp_config" \
95
+ --exp_name "$exp_name" \
96
+ --log_level info \
97
+ --checkpoint "$checkpoint" \
98
+ --resume_type "$resume_type"
99
+ fi
100
+ fi
101
+
102
+ ######## Inference/Conversion ###########
103
+ if [ $running_stage -eq 3 ]; then
104
+ if [ -z "$infer_expt_dir" ]; then
105
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
+ exit 1
107
+ fi
108
+
109
+ if [ -z "$infer_output_dir" ]; then
110
+ infer_output_dir="$infer_expt_dir/result"
111
+ fi
112
+
113
+ echo $infer_datasets
114
+
115
+ if [ $infer_mode = "infer_from_dataset" ]; then
116
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
117
+ --config $exp_config \
118
+ --infer_mode $infer_mode \
119
+ --infer_datasets $infer_datasets \
120
+ --vocoder_dir $infer_expt_dir \
121
+ --output_dir $infer_output_dir \
122
+ --log_level debug
123
+ fi
124
+
125
+ if [ $infer_mode = "infer_from_feature" ]; then
126
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
127
+ --config $exp_config \
128
+ --infer_mode $infer_mode \
129
+ --feature_folder $infer_feature_dir \
130
+ --vocoder_dir $infer_expt_dir \
131
+ --output_dir $infer_output_dir \
132
+ --log_level debug
133
+ fi
134
+
135
+ if [ $infer_mode = "infer_from_audio" ]; then
136
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
137
+ --config $exp_config \
138
+ --infer_mode $infer_mode \
139
+ --audio_folder $infer_audio_dir \
140
+ --vocoder_dir $infer_expt_dir \
141
+ --output_dir $infer_output_dir \
142
+ --log_level debug
143
+ fi
144
+
145
+ fi
inference.py CHANGED
@@ -208,9 +208,9 @@ def build_parser():
208
  return parser
209
 
210
 
211
- def main():
212
  ### Parse arguments and config
213
- args = build_parser().parse_args()
214
  cfg = load_config(args.config)
215
 
216
  # CUDA settings
@@ -256,3 +256,7 @@ def main():
256
  else:
257
  ### Infer from dataset
258
  infer(args, cfg, infer_type="from_dataset")
 
 
 
 
 
208
  return parser
209
 
210
 
211
+ def main(args_list=None):
212
  ### Parse arguments and config
213
+ args = build_parser().parse_args(args_list)
214
  cfg = load_config(args.config)
215
 
216
  # CUDA settings
 
256
  else:
257
  ### Infer from dataset
258
  infer(args, cfg, infer_type="from_dataset")
259
+
260
+
261
+ if __name__ == "__main__":
262
+ main()
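Note that `main()` now accepts an explicit argument list (defaulting to the command line), so the Gradio backend (e.g. `app.py`) can call the inference entry point in-process rather than shelling out. Below is a minimal, illustrative sketch of such a call; the flag names and config path are placeholders, since the real options are whatever `build_parser()` defines.

```python
# Sketch only: invoking inference programmatically with an explicit argument list.
# Flag names and the config path are illustrative; use the options that
# build_parser() actually declares.
from inference import main

args_list = [
    "--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json",
    # ... other flags accepted by build_parser() ...
]
main(args_list)
```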
modules/__init__.py ADDED
File without changes
modules/activation_functions/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .gated_activation_unit import GaU
7
+ from .snake import Snake, SnakeBeta
modules/activation_functions/gated_activation_unit.py ADDED
@@ -0,0 +1,61 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+ from modules.general.utils import Conv1d
10
+
11
+
12
+ class GaU(nn.Module):
13
+ r"""Gated Activation Unit (GaU) proposed in `Gated Activation Units for Neural
14
+ Networks <https://arxiv.org/pdf/1606.05328.pdf>`_.
15
+
16
+ Args:
17
+ channels: number of input channels.
18
+ kernel_size: kernel size of the convolution.
19
+ dilation: dilation rate of the convolution.
20
+ d_context: dimension of context tensor, None if don't use context.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ channels: int,
26
+ kernel_size: int = 3,
27
+ dilation: int = 1,
28
+ d_context: int = None,
29
+ ):
30
+ super().__init__()
31
+
32
+ self.context = d_context
33
+
34
+ self.conv = Conv1d(
35
+ channels,
36
+ channels * 2,
37
+ kernel_size,
38
+ dilation=dilation,
39
+ padding=dilation * (kernel_size - 1) // 2,
40
+ )
41
+
42
+ if self.context:
43
+ self.context_proj = Conv1d(d_context, channels * 2, 1)
44
+
45
+ def forward(self, x: torch.Tensor, context: torch.Tensor = None):
46
+ r"""Calculate forward propagation.
47
+
48
+ Args:
49
+ x: input tensor with shape [B, C, T].
50
+ context: context tensor with shape [B, ``d_context``, T], default to None.
51
+ """
52
+
53
+ h = self.conv(x)
54
+
55
+ if self.context:
56
+ h = h + self.context_proj(context)
57
+
58
+ h1, h2 = h.chunk(2, 1)
59
+ h = torch.tanh(h1) * torch.sigmoid(h2)
60
+
61
+ return h
modules/activation_functions/snake.py ADDED
@@ -0,0 +1,122 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from torch import nn, pow, sin
8
+ from torch.nn import Parameter
9
+
10
+
11
+ class Snake(nn.Module):
12
+ r"""Implementation of a sine-based periodic activation function.
13
+ Alpha is initialized to 1 by default; higher values mean higher frequency.
14
+ It will be trained along with the rest of your model.
15
+
16
+ Args:
17
+ in_features: shape of the input
18
+ alpha: trainable parameter
19
+
20
+ Shape:
21
+ - Input: (B, C, T)
22
+ - Output: (B, C, T), same shape as the input
23
+
24
+ References:
25
+ This activation function is from this paper by Liu Ziyin, Tilman Hartwig,
26
+ Masahito Ueda: https://arxiv.org/abs/2006.08195
27
+
28
+ Examples:
29
+ >>> a1 = Snake(256)
30
+ >>> x = torch.randn(256)
31
+ >>> x = a1(x)
32
+ """
33
+
34
+ def __init__(
35
+ self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
36
+ ):
37
+ super(Snake, self).__init__()
38
+ self.in_features = in_features
39
+
40
+ # initialize alpha
41
+ self.alpha_logscale = alpha_logscale
42
+ if self.alpha_logscale: # log scale alphas initialized to zeros
43
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
44
+ else: # linear scale alphas initialized to ones
45
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
46
+
47
+ self.alpha.requires_grad = alpha_trainable
48
+
49
+ self.no_div_by_zero = 0.000000001
50
+
51
+ def forward(self, x):
52
+ r"""Forward pass of the function. Applies the function to the input elementwise.
53
+ Snake ∶= x + 1/a * sin^2 (ax)
54
+ """
55
+
56
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
57
+ if self.alpha_logscale:
58
+ alpha = torch.exp(alpha)
59
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
60
+
61
+ return x
62
+
63
+
64
+ class SnakeBeta(nn.Module):
65
+ r"""A modified Snake function which uses separate parameters for the magnitude
66
+ of the periodic components. Alpha is initialized to 1 by default,
67
+ higher values mean higher frequency. Beta is initialized to 1 by default,
68
+ higher values mean higher magnitude. Both will be trained along with the
69
+ rest of your model.
70
+
71
+ Args:
72
+ in_features: shape of the input
73
+ alpha: trainable parameter that controls frequency
74
+ beta: trainable parameter that controls magnitude
75
+
76
+ Shape:
77
+ - Input: (B, C, T)
78
+ - Output: (B, C, T), same shape as the input
79
+
80
+ References:
81
+ This activation function is a modified version based on this paper by Liu Ziyin,
82
+ Tilman Hartwig, Masahito Ueda: https://arxiv.org/abs/2006.08195
83
+
84
+ Examples:
85
+ >>> a1 = SnakeBeta(256)
86
+ >>> x = torch.randn(256)
87
+ >>> x = a1(x)
88
+ """
89
+
90
+ def __init__(
91
+ self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
92
+ ):
93
+ super(SnakeBeta, self).__init__()
94
+ self.in_features = in_features
95
+
96
+ # initialize alpha
97
+ self.alpha_logscale = alpha_logscale
98
+ if self.alpha_logscale: # log scale alphas initialized to zeros
99
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
100
+ self.beta = Parameter(torch.zeros(in_features) * alpha)
101
+ else: # linear scale alphas initialized to ones
102
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
103
+ self.beta = Parameter(torch.ones(in_features) * alpha)
104
+
105
+ self.alpha.requires_grad = alpha_trainable
106
+ self.beta.requires_grad = alpha_trainable
107
+
108
+ self.no_div_by_zero = 0.000000001
109
+
110
+ def forward(self, x):
111
+ r"""Forward pass of the function. Applies the function to the input elementwise.
112
+ SnakeBeta ∶= x + 1/b * sin^2 (xa)
113
+ """
114
+
115
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
116
+ beta = self.beta.unsqueeze(0).unsqueeze(-1)
117
+ if self.alpha_logscale:
118
+ alpha = torch.exp(alpha)
119
+ beta = torch.exp(beta)
120
+ x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
121
+
122
+ return x
modules/anti_aliasing/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .act import *
7
+ from .filter import *
8
+ from .resample import *
modules/anti_aliasing/act.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch.nn as nn
7
+
8
+ from .resample import *
9
+
10
+ # This code is adopted from BigVGAN under the MIT License
11
+ # https://github.com/NVIDIA/BigVGAN
12
+
13
+ class Activation1d(nn.Module):
14
+ def __init__(
15
+ self,
16
+ activation,
17
+ up_ratio: int = 2,
18
+ down_ratio: int = 2,
19
+ up_kernel_size: int = 12,
20
+ down_kernel_size: int = 12,
21
+ ):
22
+ super().__init__()
23
+ self.up_ratio = up_ratio
24
+ self.down_ratio = down_ratio
25
+ self.act = activation
26
+ self.upsample = UpSample1d(up_ratio, up_kernel_size)
27
+ self.downsample = DownSample1d(down_ratio, down_kernel_size)
28
+
29
+ # x: [B,C,T]
30
+ def forward(self, x):
31
+ x = self.upsample(x)
32
+ x = self.act(x)
33
+ x = self.downsample(x)
34
+
35
+ return x
modules/anti_aliasing/filter.py ADDED
@@ -0,0 +1,99 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import math
10
+
11
+ if "sinc" in dir(torch):
12
+ sinc = torch.sinc
13
+ else:
14
+ # This code is adopted from adefossez's julius.core.sinc under the MIT License
15
+ # https://adefossez.github.io/julius/julius/core.html
16
+ def sinc(x: torch.Tensor):
17
+ """
18
+ Implementation of sinc, i.e. sin(pi * x) / (pi * x)
19
+ __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
20
+ """
21
+ return torch.where(
22
+ x == 0,
23
+ torch.tensor(1.0, device=x.device, dtype=x.dtype),
24
+ torch.sin(math.pi * x) / math.pi / x,
25
+ )
26
+
27
+
28
+ # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
29
+ # https://adefossez.github.io/julius/julius/lowpass.html
30
+ def kaiser_sinc_filter1d(
31
+ cutoff, half_width, kernel_size
32
+ ): # return filter [1,1,kernel_size]
33
+ even = kernel_size % 2 == 0
34
+ half_size = kernel_size // 2
35
+
36
+ # For kaiser window
37
+ delta_f = 4 * half_width
38
+ A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
39
+ if A > 50.0:
40
+ beta = 0.1102 * (A - 8.7)
41
+ elif A >= 21.0:
42
+ beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
43
+ else:
44
+ beta = 0.0
45
+ window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
46
+
47
+ # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
48
+ if even:
49
+ time = torch.arange(-half_size, half_size) + 0.5
50
+ else:
51
+ time = torch.arange(kernel_size) - half_size
52
+ if cutoff == 0:
53
+ filter_ = torch.zeros_like(time)
54
+ else:
55
+ filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
56
+ # Normalize filter to have sum = 1, otherwise we will have a small leakage
57
+ # of the constant component in the input signal.
58
+ filter_ /= filter_.sum()
59
+ filter = filter_.view(1, 1, kernel_size)
60
+
61
+ return filter
62
+
63
+
64
+ class LowPassFilter1d(nn.Module):
65
+ def __init__(
66
+ self,
67
+ cutoff=0.5,
68
+ half_width=0.6,
69
+ stride: int = 1,
70
+ padding: bool = True,
71
+ padding_mode: str = "replicate",
72
+ kernel_size: int = 12,
73
+ ):
74
+ # kernel_size should be even number for stylegan3 setup,
75
+ # in this implementation, odd number is also possible.
76
+ super().__init__()
77
+ if cutoff < -0.0:
78
+ raise ValueError("Minimum cutoff must be larger than zero.")
79
+ if cutoff > 0.5:
80
+ raise ValueError("A cutoff above 0.5 does not make sense.")
81
+ self.kernel_size = kernel_size
82
+ self.even = kernel_size % 2 == 0
83
+ self.pad_left = kernel_size // 2 - int(self.even)
84
+ self.pad_right = kernel_size // 2
85
+ self.stride = stride
86
+ self.padding = padding
87
+ self.padding_mode = padding_mode
88
+ filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
89
+ self.register_buffer("filter", filter)
90
+
91
+ # input [B, C, T]
92
+ def forward(self, x):
93
+ _, C, _ = x.shape
94
+
95
+ if self.padding:
96
+ x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
97
+ out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
98
+
99
+ return out
modules/anti_aliasing/resample.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ #################### Anti-aliasing ####################
7
+
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+ from .filter import *
12
+
13
+ # This code is adopted from BigVGAN under the MIT License
14
+ # https://github.com/NVIDIA/BigVGAN
15
+
16
+ class UpSample1d(nn.Module):
17
+ def __init__(self, ratio=2, kernel_size=None):
18
+ super().__init__()
19
+ self.ratio = ratio
20
+ self.kernel_size = (
21
+ int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
22
+ )
23
+ self.stride = ratio
24
+ self.pad = self.kernel_size // ratio - 1
25
+ self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
26
+ self.pad_right = (
27
+ self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
28
+ )
29
+ filter = kaiser_sinc_filter1d(
30
+ cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size
31
+ )
32
+ self.register_buffer("filter", filter)
33
+
34
+ # x: [B, C, T]
35
+ def forward(self, x):
36
+ _, C, _ = x.shape
37
+
38
+ x = F.pad(x, (self.pad, self.pad), mode="replicate")
39
+ x = self.ratio * F.conv_transpose1d(
40
+ x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
41
+ )
42
+ x = x[..., self.pad_left : -self.pad_right]
43
+
44
+ return x
45
+
46
+
47
+ class DownSample1d(nn.Module):
48
+ def __init__(self, ratio=2, kernel_size=None):
49
+ super().__init__()
50
+ self.ratio = ratio
51
+ self.kernel_size = (
52
+ int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
53
+ )
54
+ self.lowpass = LowPassFilter1d(
55
+ cutoff=0.5 / ratio,
56
+ half_width=0.6 / ratio,
57
+ stride=ratio,
58
+ kernel_size=self.kernel_size,
59
+ )
60
+
61
+ def forward(self, x):
62
+ xx = self.lowpass(x)
63
+
64
+ return xx
modules/base/base_module.py ADDED
@@ -0,0 +1,75 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+
10
+
11
+ class LayerNorm(nn.Module):
12
+ def __init__(self, channels, eps=1e-5):
13
+ super().__init__()
14
+ self.channels = channels
15
+ self.eps = eps
16
+
17
+ self.gamma = nn.Parameter(torch.ones(channels))
18
+ self.beta = nn.Parameter(torch.zeros(channels))
19
+
20
+ def forward(self, x):
21
+ x = x.transpose(1, -1)
22
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
23
+ return x.transpose(1, -1)
24
+
25
+
26
+ class ConvReluNorm(nn.Module):
27
+ def __init__(
28
+ self,
29
+ in_channels,
30
+ hidden_channels,
31
+ out_channels,
32
+ kernel_size,
33
+ n_layers,
34
+ p_dropout,
35
+ ):
36
+ super().__init__()
37
+ self.in_channels = in_channels
38
+ self.hidden_channels = hidden_channels
39
+ self.out_channels = out_channels
40
+ self.kernel_size = kernel_size
41
+ self.n_layers = n_layers
42
+ self.p_dropout = p_dropout
43
+ assert n_layers > 1, "Number of layers should be larger than 1."
44
+
45
+ self.conv_layers = nn.ModuleList()
46
+ self.norm_layers = nn.ModuleList()
47
+ self.conv_layers.append(
48
+ nn.Conv1d(
49
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
50
+ )
51
+ )
52
+ self.norm_layers.append(LayerNorm(hidden_channels))
53
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
54
+ for _ in range(n_layers - 1):
55
+ self.conv_layers.append(
56
+ nn.Conv1d(
57
+ hidden_channels,
58
+ hidden_channels,
59
+ kernel_size,
60
+ padding=kernel_size // 2,
61
+ )
62
+ )
63
+ self.norm_layers.append(LayerNorm(hidden_channels))
64
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
65
+ self.proj.weight.data.zero_()
66
+ self.proj.bias.data.zero_()
67
+
68
+ def forward(self, x, x_mask):
69
+ x_org = x
70
+ for i in range(self.n_layers):
71
+ x = self.conv_layers[i](x * x_mask)
72
+ x = self.norm_layers[i](x)
73
+ x = self.relu_drop(x)
74
+ x = x_org + self.proj(x)
75
+ return x * x_mask
modules/diffusion/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .bidilconv.bidilated_conv import BiDilConv
7
+ from .unet.unet import UNet
modules/diffusion/bidilconv/bidilated_conv.py ADDED
@@ -0,0 +1,102 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch.nn as nn
9
+
10
+ from modules.general.utils import Conv1d, zero_module
11
+ from .residual_block import ResidualBlock
12
+
13
+
14
+ class BiDilConv(nn.Module):
15
+ r"""Dilated CNN architecture with residual connections, default diffusion decoder.
16
+
17
+ Args:
18
+ input_channel: The number of input channels.
19
+ base_channel: The number of base channels.
20
+ n_res_block: The number of residual blocks.
21
+ conv_kernel_size: The kernel size of convolutional layers.
22
+ dilation_cycle_length: The cycle length of dilation.
23
+ conditioner_size: The size of conditioner.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ input_channel,
29
+ base_channel,
30
+ n_res_block,
31
+ conv_kernel_size,
32
+ dilation_cycle_length,
33
+ conditioner_size,
34
+ output_channel: int = -1,
35
+ ):
36
+ super().__init__()
37
+
38
+ self.input_channel = input_channel
39
+ self.base_channel = base_channel
40
+ self.n_res_block = n_res_block
41
+ self.conv_kernel_size = conv_kernel_size
42
+ self.dilation_cycle_length = dilation_cycle_length
43
+ self.conditioner_size = conditioner_size
44
+ self.output_channel = output_channel if output_channel > 0 else input_channel
45
+
46
+ self.input = nn.Sequential(
47
+ Conv1d(
48
+ input_channel,
49
+ base_channel,
50
+ 1,
51
+ ),
52
+ nn.ReLU(),
53
+ )
54
+
55
+ self.residual_blocks = nn.ModuleList(
56
+ [
57
+ ResidualBlock(
58
+ channels=base_channel,
59
+ kernel_size=conv_kernel_size,
60
+ dilation=2 ** (i % dilation_cycle_length),
61
+ d_context=conditioner_size,
62
+ )
63
+ for i in range(n_res_block)
64
+ ]
65
+ )
66
+
67
+ self.out_proj = nn.Sequential(
68
+ Conv1d(
69
+ base_channel,
70
+ base_channel,
71
+ 1,
72
+ ),
73
+ nn.ReLU(),
74
+ zero_module(
75
+ Conv1d(
76
+ base_channel,
77
+ self.output_channel,
78
+ 1,
79
+ ),
80
+ ),
81
+ )
82
+
83
+ def forward(self, x, y, context=None):
84
+ """
85
+ Args:
86
+ x: Noisy mel-spectrogram [B x ``n_mel`` x L]
87
+ y: FILM embeddings with the shape of (B, ``base_channel``)
88
+ context: Context with the shape of [B x ``d_context`` x L], default to None.
89
+ """
90
+
91
+ h = self.input(x)
92
+
93
+ skip = None
94
+ for i in range(self.n_res_block):
95
+ h, skip_connection = self.residual_blocks[i](h, y, context)
96
+ skip = skip_connection if skip is None else skip_connection + skip
97
+
98
+ out = skip / math.sqrt(self.n_res_block)
99
+
100
+ out = self.out_proj(out)
101
+
102
+ return out
modules/diffusion/bidilconv/residual_block.py ADDED
@@ -0,0 +1,73 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from modules.activation_functions import GaU
12
+ from modules.general.utils import Conv1d
13
+
14
+
15
+ class ResidualBlock(nn.Module):
16
+ r"""Residual block with dilated convolution, main portion of ``BiDilConv``.
17
+
18
+ Args:
19
+ channels: The number of channels of input and output.
20
+ kernel_size: The kernel size of dilated convolution.
21
+ dilation: The dilation rate of dilated convolution.
22
+ d_context: The dimension of content encoder output, None if don't use context.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ channels: int = 256,
28
+ kernel_size: int = 3,
29
+ dilation: int = 1,
30
+ d_context: int = None,
31
+ ):
32
+ super().__init__()
33
+
34
+ self.context = d_context
35
+
36
+ self.gau = GaU(
37
+ channels,
38
+ kernel_size,
39
+ dilation,
40
+ d_context,
41
+ )
42
+
43
+ self.out_proj = Conv1d(
44
+ channels,
45
+ channels * 2,
46
+ 1,
47
+ )
48
+
49
+ def forward(
50
+ self,
51
+ x: torch.Tensor,
52
+ y_emb: torch.Tensor,
53
+ context: torch.Tensor = None,
54
+ ):
55
+ """
56
+ Args:
57
+ x: Latent representation inherited from previous residual block
58
+ with the shape of [B x C x T].
59
+ y_emb: Embeddings with the shape of [B x C], which will be FILM on the x.
60
+ context: Context with the shape of [B x ``d_context`` x T], default to None.
61
+ """
62
+
63
+ h = x + y_emb[..., None]
64
+
65
+ if self.context:
66
+ h = self.gau(h, context)
67
+ else:
68
+ h = self.gau(h)
69
+
70
+ h = self.out_proj(h)
71
+ res, skip = h.chunk(2, 1)
72
+
73
+ return (res + x) / math.sqrt(2.0), skip
modules/diffusion/karras/karras_diffusion.py ADDED
@@ -0,0 +1,979 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ Based on: https://github.com/crowsonkb/k-diffusion
8
+ """
9
+ import random
10
+
11
+ import numpy as np
12
+ import torch as th
13
+ import torch.nn as nn
14
+ import torch.nn.functional as F
15
+
16
+ # from piq import LPIPS
17
+ from utils.ssim import SSIM
18
+
19
+ from modules.diffusion.karras.random_utils import get_generator
20
+
21
+
22
+ def mean_flat(tensor):
23
+ """
24
+ Take the mean over all non-batch dimensions.
25
+ """
26
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
27
+
28
+
29
+ def append_dims(x, target_dims):
30
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
31
+ dims_to_append = target_dims - x.ndim
32
+ if dims_to_append < 0:
33
+ raise ValueError(
34
+ f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
35
+ )
36
+ return x[(...,) + (None,) * dims_to_append]
37
+
38
+
39
+ def append_zero(x):
40
+ return th.cat([x, x.new_zeros([1])])
41
+
42
+
43
+ def get_weightings(weight_schedule, snrs, sigma_data):
44
+ if weight_schedule == "snr":
45
+ weightings = snrs
46
+ elif weight_schedule == "snr+1":
47
+ weightings = snrs + 1
48
+ elif weight_schedule == "karras":
49
+ weightings = snrs + 1.0 / sigma_data**2
50
+ elif weight_schedule == "truncated-snr":
51
+ weightings = th.clamp(snrs, min=1.0)
52
+ elif weight_schedule == "uniform":
53
+ weightings = th.ones_like(snrs)
54
+ else:
55
+ raise NotImplementedError()
56
+ return weightings
57
+
58
+
59
+ class KarrasDenoiser:
60
+ def __init__(
61
+ self,
62
+ sigma_data: float = 0.5,
63
+ sigma_max=80.0,
64
+ sigma_min=0.002,
65
+ rho=7.0,
66
+ weight_schedule="karras",
67
+ distillation=False,
68
+ loss_norm="l2",
69
+ ):
70
+ self.sigma_data = sigma_data
71
+ self.sigma_max = sigma_max
72
+ self.sigma_min = sigma_min
73
+ self.weight_schedule = weight_schedule
74
+ self.distillation = distillation
75
+ self.loss_norm = loss_norm
76
+ # if loss_norm == "lpips":
77
+ # self.lpips_loss = LPIPS(replace_pooling=True, reduction="none")
78
+ if loss_norm == "ssim":
79
+ self.ssim_loss = SSIM()
80
+ self.rho = rho
81
+ self.num_timesteps = 40
82
+
83
+ def get_snr(self, sigmas):
84
+ return sigmas**-2
85
+
86
+ def get_sigmas(self, sigmas):
87
+ return sigmas
88
+
89
+ def get_scalings(self, sigma):
90
+ c_skip = self.sigma_data**2 / (sigma**2 + self.sigma_data**2)
91
+ c_out = sigma * self.sigma_data / (sigma**2 + self.sigma_data**2) ** 0.5
92
+ c_in = 1 / (sigma**2 + self.sigma_data**2) ** 0.5
93
+ return c_skip, c_out, c_in
94
+
95
+ def get_scalings_for_boundary_condition(self, sigma):
96
+ c_skip = self.sigma_data**2 / (
97
+ (sigma - self.sigma_min) ** 2 + self.sigma_data**2
98
+ )
99
+ c_out = (
100
+ (sigma - self.sigma_min)
101
+ * self.sigma_data
102
+ / (sigma**2 + self.sigma_data**2) ** 0.5
103
+ )
104
+ c_in = 1 / (sigma**2 + self.sigma_data**2) ** 0.5
105
+ return c_skip, c_out, c_in
106
+
107
+ def training_losses(self, model, x_start, sigmas, condition=None, noise=None):
108
+ if noise is None:
109
+ noise = th.randn_like(x_start)
110
+
111
+ terms = {}
112
+
113
+ dims = x_start.ndim
114
+ x_t = x_start + noise * append_dims(sigmas, dims)
115
+ model_output, denoised = self.denoise(model, x_t, sigmas, condition)
116
+
117
+ snrs = self.get_snr(sigmas)
118
+ weights = append_dims(
119
+ get_weightings(self.weight_schedule, snrs, self.sigma_data), dims
120
+ )
121
+ # terms["xs_mse"] = mean_flat((denoised - x_start) ** 2)
122
+ terms["mse"] = mean_flat(weights * (denoised - x_start) ** 2)
123
+ # terms["mae"] = mean_flat(weights * th.abs(denoised - x_start))
124
+ # terms["mse"] = nn.MSELoss(reduction="none")(denoised, x_start)
125
+
126
+ # if "vb" in terms:
127
+ # terms["loss"] = terms["mse"] + terms["vb"]
128
+ # else:
129
+ terms["loss"] = terms["mse"]
130
+
131
+ return terms
132
+
133
+ def consistency_losses(
134
+ self,
135
+ model,
136
+ x_start,
137
+ num_scales,
138
+ # model_kwargs=None,
139
+ condition=None,
140
+ target_model=None,
141
+ teacher_model=None,
142
+ teacher_diffusion=None,
143
+ noise=None,
144
+ ):
145
+ if noise is None:
146
+ noise = th.randn_like(x_start)
147
+
148
+ dims = x_start.ndim
149
+
150
+ def denoise_fn(x, t):
151
+ return self.denoise(model, x, t, condition)[1]
152
+
153
+ if target_model:
154
+
155
+ @th.no_grad()
156
+ def target_denoise_fn(x, t):
157
+ return self.denoise(target_model, x, t, condition)[1]
158
+
159
+ else:
160
+ raise NotImplementedError("Must have a target model")
161
+
162
+ if teacher_model:
163
+
164
+ @th.no_grad()
165
+ def teacher_denoise_fn(x, t):
166
+ return teacher_diffusion.denoise(teacher_model, x, t, condition)[1]
167
+
168
+ @th.no_grad()
169
+ def heun_solver(samples, t, next_t, x0):
170
+ x = samples
171
+ if teacher_model is None:
172
+ denoiser = x0
173
+ else:
174
+ denoiser = teacher_denoise_fn(x, t)
175
+
176
+ d = (x - denoiser) / append_dims(t, dims)
177
+ samples = x + d * append_dims(next_t - t, dims)
178
+ if teacher_model is None:
179
+ denoiser = x0
180
+ else:
181
+ denoiser = teacher_denoise_fn(samples, next_t)
182
+
183
+ next_d = (samples - denoiser) / append_dims(next_t, dims)
184
+ samples = x + (d + next_d) * append_dims((next_t - t) / 2, dims)
185
+
186
+ return samples
187
+
188
+ @th.no_grad()
189
+ def euler_solver(samples, t, next_t, x0):
190
+ x = samples
191
+ if teacher_model is None:
192
+ denoiser = x0
193
+ else:
194
+ denoiser = teacher_denoise_fn(x, t)
195
+ d = (x - denoiser) / append_dims(t, dims)
196
+ samples = x + d * append_dims(next_t - t, dims)
197
+
198
+ return samples
199
+
200
+ indices = th.randint(
201
+ 0, num_scales - 1, (x_start.shape[0],), device=x_start.device
202
+ )
203
+
204
+ t = self.sigma_max ** (1 / self.rho) + indices / (num_scales - 1) * (
205
+ self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
206
+ )
207
+ t = t**self.rho
208
+
209
+ t2 = self.sigma_max ** (1 / self.rho) + (indices + 1) / (num_scales - 1) * (
210
+ self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
211
+ )
212
+ t2 = t2**self.rho
213
+
214
+ x_t = x_start + noise * append_dims(t, dims)
215
+
216
+ dropout_state = th.get_rng_state()
217
+ distiller = denoise_fn(x_t, t)
218
+
219
+ if teacher_model is None:
220
+ x_t2 = euler_solver(x_t, t, t2, x_start).detach()
221
+ else:
222
+ x_t2 = heun_solver(x_t, t, t2, x_start).detach()
223
+
224
+ th.set_rng_state(dropout_state)
225
+ distiller_target = target_denoise_fn(x_t2, t2)
226
+ distiller_target = distiller_target.detach()
227
+
228
+ snrs = self.get_snr(t)
229
+ weights = get_weightings(self.weight_schedule, snrs, self.sigma_data)
230
+ if self.loss_norm == "l1":
231
+ diffs = th.abs(distiller - distiller_target)
232
+ loss = mean_flat(diffs) * weights
233
+ elif self.loss_norm == "l2":
234
+ # diffs = (distiller - distiller_target) ** 2
235
+ loss = F.mse_loss(distiller, distiller_target)
236
+ # loss = mean_flat(diffs) * weights
237
+ elif self.loss_norm == "ssim":
238
+ loss = self.ssim_loss(distiller, distiller_target) * weights
239
+ # elif self.loss_norm == "l2-32":
240
+ # distiller = F.interpolate(distiller, size=32, mode="bilinear")
241
+ # distiller_target = F.interpolate(
242
+ # distiller_target,
243
+ # size=32,
244
+ # mode="bilinear",
245
+ # )
246
+ # diffs = (distiller - distiller_target) ** 2
247
+ # loss = mean_flat(diffs) * weights
248
+ # elif self.loss_norm == "lpips":
249
+ # if x_start.shape[-1] < 256:
250
+ # distiller = F.interpolate(distiller, size=224, mode="bilinear")
251
+ # distiller_target = F.interpolate(
252
+ # distiller_target, size=224, mode="bilinear"
253
+ # )
254
+
255
+ # loss = (
256
+ # self.lpips_loss(
257
+ # (distiller + 1) / 2.0,
258
+ # (distiller_target + 1) / 2.0,
259
+ # )
260
+ # * weights
261
+ # )
262
+ else:
263
+ raise ValueError(f"Unknown loss norm {self.loss_norm}")
264
+
265
+ terms = {}
266
+ terms["loss"] = loss
267
+
268
+ return terms
269
+
270
+ # def progdist_losses(
271
+ # self,
272
+ # model,
273
+ # x_start,
274
+ # num_scales,
275
+ # model_kwargs=None,
276
+ # teacher_model=None,
277
+ # teacher_diffusion=None,
278
+ # noise=None,
279
+ # ):
280
+ # if model_kwargs is None:
281
+ # model_kwargs = {}
282
+ # if noise is None:
283
+ # noise = th.randn_like(x_start)
284
+
285
+ # dims = x_start.ndim
286
+
287
+ # def denoise_fn(x, t):
288
+ # return self.denoise(model, x, t, **model_kwargs)[1]
289
+
290
+ # @th.no_grad()
291
+ # def teacher_denoise_fn(x, t):
292
+ # return teacher_diffusion.denoise(teacher_model, x, t, **model_kwargs)[1]
293
+
294
+ # @th.no_grad()
295
+ # def euler_solver(samples, t, next_t):
296
+ # x = samples
297
+ # denoiser = teacher_denoise_fn(x, t)
298
+ # d = (x - denoiser) / append_dims(t, dims)
299
+ # samples = x + d * append_dims(next_t - t, dims)
300
+
301
+ # return samples
302
+
303
+ # @th.no_grad()
304
+ # def euler_to_denoiser(x_t, t, x_next_t, next_t):
305
+ # denoiser = x_t - append_dims(t, dims) * (x_next_t - x_t) / append_dims(
306
+ # next_t - t, dims
307
+ # )
308
+ # return denoiser
309
+
310
+ # indices = th.randint(0, num_scales, (x_start.shape[0],), device=x_start.device)
311
+
312
+ # t = self.sigma_max ** (1 / self.rho) + indices / num_scales * (
313
+ # self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
314
+ # )
315
+ # t = t**self.rho
316
+
317
+ # t2 = self.sigma_max ** (1 / self.rho) + (indices + 0.5) / num_scales * (
318
+ # self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
319
+ # )
320
+ # t2 = t2**self.rho
321
+
322
+ # t3 = self.sigma_max ** (1 / self.rho) + (indices + 1) / num_scales * (
323
+ # self.sigma_min ** (1 / self.rho) - self.sigma_max ** (1 / self.rho)
324
+ # )
325
+ # t3 = t3**self.rho
326
+
327
+ # x_t = x_start + noise * append_dims(t, dims)
328
+
329
+ # denoised_x = denoise_fn(x_t, t)
330
+
331
+ # x_t2 = euler_solver(x_t, t, t2).detach()
332
+ # x_t3 = euler_solver(x_t2, t2, t3).detach()
333
+
334
+ # target_x = euler_to_denoiser(x_t, t, x_t3, t3).detach()
335
+
336
+ # snrs = self.get_snr(t)
337
+ # weights = get_weightings(self.weight_schedule, snrs, self.sigma_data)
338
+ # if self.loss_norm == "l1":
339
+ # diffs = th.abs(denoised_x - target_x)
340
+ # loss = mean_flat(diffs) * weights
341
+ # elif self.loss_norm == "l2":
342
+ # diffs = (denoised_x - target_x) ** 2
343
+ # loss = mean_flat(diffs) * weights
344
+ # elif self.loss_norm == "lpips":
345
+ # if x_start.shape[-1] < 256:
346
+ # denoised_x = F.interpolate(denoised_x, size=224, mode="bilinear")
347
+ # target_x = F.interpolate(target_x, size=224, mode="bilinear")
348
+ # loss = (
349
+ # self.lpips_loss(
350
+ # (denoised_x + 1) / 2.0,
351
+ # (target_x + 1) / 2.0,
352
+ # )
353
+ # * weights
354
+ # )
355
+ # else:
356
+ # raise ValueError(f"Unknown loss norm {self.loss_norm}")
357
+
358
+ # terms = {}
359
+ # terms["loss"] = loss
360
+
361
+ # return terms
362
+
363
+ def denoise(self, model, x_t, sigmas, condition):
364
+ if not self.distillation:
365
+ c_skip, c_out, c_in = [
366
+ append_dims(x, x_t.ndim) for x in self.get_scalings(sigmas)
367
+ ]
368
+ else:
369
+ c_skip, c_out, c_in = [
370
+ append_dims(x, x_t.ndim)
371
+ for x in self.get_scalings_for_boundary_condition(sigmas)
372
+ ]
373
+ rescaled_t = 1000 * 0.25 * th.log(sigmas + 1e-44)
374
+ # rescaled_t = rescaled_t[:, None]
375
+ model_output = model(c_in * x_t, rescaled_t, condition)
376
+ denoised = c_out * model_output + c_skip * x_t
377
+ return model_output, denoised
378
+
379
+
380
+ def karras_sample(
381
+ diffusion,
382
+ model,
383
+ shape,
384
+ steps,
385
+ clip_denoised=True,
386
+ progress=True,
387
+ callback=None,
388
+ # model_kwargs=None,
389
+ condition=None,
390
+ device=None,
391
+ sigma_min=0.002,
392
+ sigma_max=80, # higher for highres?
393
+ rho=7.0,
394
+ sampler="heun",
395
+ s_churn=0.0,
396
+ s_tmin=0.0,
397
+ s_tmax=float("inf"),
398
+ s_noise=1.0,
399
+ generator=None,
400
+ ts=None,
401
+ ):
402
+ if generator is None:
403
+ generator = get_generator("dummy")
404
+
405
+ if sampler == "progdist":
406
+ sigmas = get_sigmas_karras(steps + 1, sigma_min, sigma_max, rho, device=device)
407
+ else:
408
+ sigmas = get_sigmas_karras(steps, sigma_min, sigma_max, rho, device=device)
409
+ th.manual_seed(42)
410
+ x_T = generator.randn(*shape, device=device) * sigma_max
411
+ sigmas = sigmas.unsqueeze(-1)
412
+ sample_fn = {
413
+ "heun": sample_heun,
414
+ "dpm": sample_dpm,
415
+ "ancestral": sample_euler_ancestral,
416
+ "onestep": sample_onestep,
417
+ "progdist": sample_progdist,
418
+ "euler": sample_euler,
419
+ "multistep": stochastic_iterative_sampler,
420
+ }[sampler]
421
+
422
+ if sampler in ["heun", "dpm"]:
423
+ sampler_args = dict(
424
+ s_churn=s_churn, s_tmin=s_tmin, s_tmax=s_tmax, s_noise=s_noise
425
+ )
426
+ elif sampler == "multistep":
427
+ sampler_args = dict(
428
+ ts=ts, t_min=sigma_min, t_max=sigma_max, rho=diffusion.rho, steps=steps
429
+ )
430
+ else:
431
+ sampler_args = {}
432
+
433
+ def denoiser(x_t, sigma):
434
+ _, denoised = diffusion.denoise(model, x_t, sigma, condition)
435
+ if clip_denoised:
436
+ denoised = denoised.clamp(-1, 1)
437
+ return denoised
438
+
439
+ x_0 = sample_fn(
440
+ denoiser,
441
+ x_T,
442
+ sigmas,
443
+ generator,
444
+ progress=progress,
445
+ callback=callback,
446
+ **sampler_args,
447
+ )
448
+ return x_0.clamp(-1, 1)
449
+
450
+
451
+ def get_sigmas_karras(n, sigma_min, sigma_max, rho=7.0, device="cpu"):
452
+ """Constructs the noise schedule of Karras et al. (2022)."""
453
+ ramp = th.linspace(0, 1, n)
454
+ min_inv_rho = sigma_min ** (1 / rho)
455
+ max_inv_rho = sigma_max ** (1 / rho)
456
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
457
+ return append_zero(sigmas).to(device)
458
+
459
+
460
+ def to_d(x, sigma, denoised):
461
+ """Converts a denoiser output to a Karras ODE derivative."""
462
+ return (x - denoised) / append_dims(sigma, x.ndim)
463
+
464
+
465
+ def get_ancestral_step(sigma_from, sigma_to):
466
+ """Calculates the noise level (sigma_down) to step down to and the amount
467
+ of noise to add (sigma_up) when doing an ancestral sampling step."""
468
+ sigma_up = (
469
+ sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2
470
+ ) ** 0.5
471
+ sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
472
+ return sigma_down, sigma_up
473
+
474
+
475
+ @th.no_grad()
476
+ def sample_euler_ancestral(model, x, sigmas, generator, progress=False, callback=None):
477
+ """Ancestral sampling with Euler method steps."""
478
+ s_in = x.new_ones([x.shape[0]])
479
+ indices = range(len(sigmas) - 1)
480
+ if progress:
481
+ from tqdm.auto import tqdm
482
+
483
+ indices = tqdm(indices)
484
+
485
+ for i in indices:
486
+ denoised = model(x, sigmas[i] * s_in)
487
+ sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1])
488
+ if callback is not None:
489
+ callback(
490
+ {
491
+ "x": x,
492
+ "i": i,
493
+ "sigma": sigmas[i],
494
+ "sigma_hat": sigmas[i],
495
+ "denoised": denoised,
496
+ }
497
+ )
498
+ d = to_d(x, sigmas[i], denoised)
499
+ # Euler method
500
+ dt = sigma_down - sigmas[i]
501
+ x = x + d * dt
502
+ x = x + generator.randn_like(x) * sigma_up
503
+ return x
504
+
505
+
506
+ @th.no_grad()
507
+ def sample_midpoint_ancestral(model, x, ts, generator, progress=False, callback=None):
508
+ """Ancestral sampling with midpoint method steps."""
509
+ s_in = x.new_ones([x.shape[0]])
510
+ step_size = 1 / len(ts)
511
+ if progress:
512
+ from tqdm.auto import tqdm
513
+
514
+ ts = tqdm(ts)
515
+
516
+ for tn in ts:
517
+ dn = model(x, tn * s_in)
518
+ dn_2 = model(x + (step_size / 2) * dn, (tn + step_size / 2) * s_in)
519
+ x = x + step_size * dn_2
520
+ if callback is not None:
521
+ callback({"x": x, "tn": tn, "dn": dn, "dn_2": dn_2})
522
+ return x
523
+
524
+
525
+ @th.no_grad()
526
+ def sample_heun(
527
+ denoiser,
528
+ x,
529
+ sigmas,
530
+ generator,
531
+ progress=False,
532
+ callback=None,
533
+ s_churn=0.0,
534
+ s_tmin=0.0,
535
+ s_tmax=float("inf"),
536
+ s_noise=1.0,
537
+ ):
538
+ """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
539
+ s_in = x.new_ones([x.shape[0]])
540
+ indices = range(len(sigmas) - 1)
541
+ if progress:
542
+ from tqdm.auto import tqdm
543
+
544
+ indices = tqdm(indices)
545
+
546
+ for i in indices:
547
+ gamma = (
548
+ min(s_churn / (len(sigmas) - 1), 2**0.5 - 1)
549
+ if s_tmin <= sigmas[i] <= s_tmax
550
+ else 0.0
551
+ )
552
+ eps = generator.randn_like(x) * s_noise
553
+ sigma_hat = sigmas[i] * (gamma + 1)
554
+ if gamma > 0:
555
+ x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
556
+ denoised = denoiser(x, sigma_hat * s_in)
557
+ d = to_d(x, sigma_hat, denoised)
558
+ if callback is not None:
559
+ callback(
560
+ {
561
+ "x": x,
562
+ "i": i,
563
+ "sigma": sigmas[i],
564
+ "sigma_hat": sigma_hat,
565
+ "denoised": denoised,
566
+ }
567
+ )
568
+ dt = sigmas[i + 1] - sigma_hat
569
+ if sigmas[i + 1] == 0:
570
+ # Euler method
571
+ x = x + d * dt
572
+ else:
573
+ # Heun's method
574
+ x_2 = x + d * dt
575
+ denoised_2 = denoiser(x_2, sigmas[i + 1] * s_in)
576
+ d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
577
+ d_prime = (d + d_2) / 2
578
+ x = x + d_prime * dt
579
+ return x
580
+
581
+
582
+ @th.no_grad()
583
+ def sample_euler(
584
+ denoiser,
585
+ x,
586
+ sigmas,
587
+ generator,
588
+ progress=False,
589
+ callback=None,
590
+ ):
591
+ """Implements Algorithm 2 (Heun steps) from Karras et al. (2022)."""
592
+ s_in = x.new_ones([x.shape[0]])
593
+ indices = range(len(sigmas) - 1)
594
+ if progress:
595
+ from tqdm.auto import tqdm
596
+
597
+ indices = tqdm(indices)
598
+
599
+ for i in indices:
600
+ sigma = sigmas[i]
601
+ denoised = denoiser(x, sigma * s_in)
602
+ d = to_d(x, sigma, denoised)
603
+ if callback is not None:
604
+ callback(
605
+ {
606
+ "x": x,
607
+ "i": i,
608
+ "sigma": sigmas[i],
609
+ "denoised": denoised,
610
+ }
611
+ )
612
+ dt = sigmas[i + 1] - sigma
613
+ x = x + d * dt
614
+ return x
615
+
616
+
617
+ @th.no_grad()
618
+ def sample_dpm(
619
+ denoiser,
620
+ x,
621
+ sigmas,
622
+ generator,
623
+ progress=False,
624
+ callback=None,
625
+ s_churn=0.0,
626
+ s_tmin=0.0,
627
+ s_tmax=float("inf"),
628
+ s_noise=1.0,
629
+ ):
630
+ """A sampler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022)."""
631
+ s_in = x.new_ones([x.shape[0]])
632
+ indices = range(len(sigmas) - 1)
633
+ if progress:
634
+ from tqdm.auto import tqdm
635
+
636
+ indices = tqdm(indices)
637
+
638
+ for i in indices:
639
+ gamma = (
640
+ min(s_churn / (len(sigmas) - 1), 2**0.5 - 1)
641
+ if s_tmin <= sigmas[i] <= s_tmax
642
+ else 0.0
643
+ )
644
+ eps = generator.randn_like(x) * s_noise
645
+ sigma_hat = sigmas[i] * (gamma + 1)
646
+ if gamma > 0:
647
+ x = x + eps * (sigma_hat**2 - sigmas[i] ** 2) ** 0.5
648
+ denoised = denoiser(x, sigma_hat * s_in)
649
+ d = to_d(x, sigma_hat, denoised)
650
+ if callback is not None:
651
+ callback(
652
+ {
653
+ "x": x,
654
+ "i": i,
655
+ "sigma": sigmas[i],
656
+ "sigma_hat": sigma_hat,
657
+ "denoised": denoised,
658
+ }
659
+ )
660
+ # Midpoint method, where the midpoint is chosen according to a rho=3 Karras schedule
661
+ sigma_mid = ((sigma_hat ** (1 / 3) + sigmas[i + 1] ** (1 / 3)) / 2) ** 3
662
+ dt_1 = sigma_mid - sigma_hat
663
+ dt_2 = sigmas[i + 1] - sigma_hat
664
+ x_2 = x + d * dt_1
665
+ denoised_2 = denoiser(x_2, sigma_mid * s_in)
666
+ d_2 = to_d(x_2, sigma_mid, denoised_2)
667
+ x = x + d_2 * dt_2
668
+ return x
669
+
670
+
671
+ @th.no_grad()
672
+ def sample_onestep(
673
+ distiller,
674
+ x,
675
+ sigmas,
676
+ generator=None,
677
+ progress=False,
678
+ callback=None,
679
+ ):
680
+ """Single-step generation from a distilled model."""
681
+ s_in = x.new_ones([x.shape[0]])
682
+ return distiller(x, sigmas[0] * s_in)
683
+
684
+
685
+ @th.no_grad()
686
+ def stochastic_iterative_sampler(
687
+ distiller,
688
+ x,
689
+ sigmas,
690
+ generator,
691
+ ts,
692
+ progress=False,
693
+ callback=None,
694
+ t_min=0.002,
695
+ t_max=80.0,
696
+ rho=7.0,
697
+ steps=40,
698
+ ):
699
+ t_max_rho = t_max ** (1 / rho)
700
+ t_min_rho = t_min ** (1 / rho)
701
+ s_in = x.new_ones([x.shape[0]])
702
+
703
+ for i in range(len(ts) - 1):
704
+ t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
705
+ x0 = distiller(x, t * s_in)
706
+ next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
707
+ next_t = np.clip(next_t, t_min, t_max)
708
+ x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
709
+
710
+ return x
711
+
712
+
713
+ @th.no_grad()
714
+ def sample_progdist(
715
+ denoiser,
716
+ x,
717
+ sigmas,
718
+ generator=None,
719
+ progress=False,
720
+ callback=None,
721
+ ):
722
+ s_in = x.new_ones([x.shape[0]])
723
+ sigmas = sigmas[:-1] # skip the zero sigma
724
+
725
+ indices = range(len(sigmas) - 1)
726
+ if progress:
727
+ from tqdm.auto import tqdm
728
+
729
+ indices = tqdm(indices)
730
+
731
+ for i in indices:
732
+ sigma = sigmas[i]
733
+ denoised = denoiser(x, sigma * s_in)
734
+ d = to_d(x, sigma, denoised)
735
+ if callback is not None:
736
+ callback(
737
+ {
738
+ "x": x,
739
+ "i": i,
740
+ "sigma": sigma,
741
+ "denoised": denoised,
742
+ }
743
+ )
744
+ dt = sigmas[i + 1] - sigma
745
+ x = x + d * dt
746
+
747
+ return x
748
+
749
+
750
+ # @th.no_grad()
751
+ # def iterative_colorization(
752
+ # distiller,
753
+ # images,
754
+ # x,
755
+ # ts,
756
+ # t_min=0.002,
757
+ # t_max=80.0,
758
+ # rho=7.0,
759
+ # steps=40,
760
+ # generator=None,
761
+ # ):
762
+ # def obtain_orthogonal_matrix():
763
+ # vector = np.asarray([0.2989, 0.5870, 0.1140])
764
+ # vector = vector / np.linalg.norm(vector)
765
+ # matrix = np.eye(3)
766
+ # matrix[:, 0] = vector
767
+ # matrix = np.linalg.qr(matrix)[0]
768
+ # if np.sum(matrix[:, 0]) < 0:
769
+ # matrix = -matrix
770
+ # return matrix
771
+
772
+ # Q = th.from_numpy(obtain_orthogonal_matrix()).to(dist_util.dev()).to(th.float32)
773
+ # mask = th.zeros(*x.shape[1:], device=dist_util.dev())
774
+ # mask[0, ...] = 1.0
775
+
776
+ # def replacement(x0, x1):
777
+ # x0 = th.einsum("bchw,cd->bdhw", x0, Q)
778
+ # x1 = th.einsum("bchw,cd->bdhw", x1, Q)
779
+
780
+ # x_mix = x0 * mask + x1 * (1.0 - mask)
781
+ # x_mix = th.einsum("bdhw,cd->bchw", x_mix, Q)
782
+ # return x_mix
783
+
784
+ # t_max_rho = t_max ** (1 / rho)
785
+ # t_min_rho = t_min ** (1 / rho)
786
+ # s_in = x.new_ones([x.shape[0]])
787
+ # images = replacement(images, th.zeros_like(images))
788
+
789
+ # for i in range(len(ts) - 1):
790
+ # t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
791
+ # x0 = distiller(x, t * s_in)
792
+ # x0 = th.clamp(x0, -1.0, 1.0)
793
+ # x0 = replacement(images, x0)
794
+ # next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
795
+ # next_t = np.clip(next_t, t_min, t_max)
796
+ # x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
797
+
798
+ # return x, images
799
+
800
+
801
+ # @th.no_grad()
802
+ # def iterative_inpainting(
803
+ # distiller,
804
+ # images,
805
+ # x,
806
+ # ts,
807
+ # t_min=0.002,
808
+ # t_max=80.0,
809
+ # rho=7.0,
810
+ # steps=40,
811
+ # generator=None,
812
+ # ):
813
+ # from PIL import Image, ImageDraw, ImageFont
814
+
815
+ # image_size = x.shape[-1]
816
+
817
+ # # create a blank image with a white background
818
+ # img = Image.new("RGB", (image_size, image_size), color="white")
819
+
820
+ # # get a drawing context for the image
821
+ # draw = ImageDraw.Draw(img)
822
+
823
+ # # load a font
824
+ # font = ImageFont.truetype("arial.ttf", 250)
825
+
826
+ # # draw the letter "C" in black
827
+ # draw.text((50, 0), "S", font=font, fill=(0, 0, 0))
828
+
829
+ # # convert the image to a numpy array
830
+ # img_np = np.array(img)
831
+ # img_np = img_np.transpose(2, 0, 1)
832
+ # img_th = th.from_numpy(img_np).to(dist_util.dev())
833
+
834
+ # mask = th.zeros(*x.shape, device=dist_util.dev())
835
+ # mask = mask.reshape(-1, 7, 3, image_size, image_size)
836
+
837
+ # mask[::2, :, img_th > 0.5] = 1.0
838
+ # mask[1::2, :, img_th < 0.5] = 1.0
839
+ # mask = mask.reshape(-1, 3, image_size, image_size)
840
+
841
+ # def replacement(x0, x1):
842
+ # x_mix = x0 * mask + x1 * (1 - mask)
843
+ # return x_mix
844
+
845
+ # t_max_rho = t_max ** (1 / rho)
846
+ # t_min_rho = t_min ** (1 / rho)
847
+ # s_in = x.new_ones([x.shape[0]])
848
+ # images = replacement(images, -th.ones_like(images))
849
+
850
+ # for i in range(len(ts) - 1):
851
+ # t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
852
+ # x0 = distiller(x, t * s_in)
853
+ # x0 = th.clamp(x0, -1.0, 1.0)
854
+ # x0 = replacement(images, x0)
855
+ # next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
856
+ # next_t = np.clip(next_t, t_min, t_max)
857
+ # x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
858
+
859
+ # return x, images
860
+
861
+
862
+ # @th.no_grad()
863
+ # def iterative_superres(
864
+ # distiller,
865
+ # images,
866
+ # x,
867
+ # ts,
868
+ # t_min=0.002,
869
+ # t_max=80.0,
870
+ # rho=7.0,
871
+ # steps=40,
872
+ # generator=None,
873
+ # ):
874
+ # patch_size = 8
875
+
876
+ # def obtain_orthogonal_matrix():
877
+ # vector = np.asarray([1] * patch_size**2)
878
+ # vector = vector / np.linalg.norm(vector)
879
+ # matrix = np.eye(patch_size**2)
880
+ # matrix[:, 0] = vector
881
+ # matrix = np.linalg.qr(matrix)[0]
882
+ # if np.sum(matrix[:, 0]) < 0:
883
+ # matrix = -matrix
884
+ # return matrix
885
+
886
+ # Q = th.from_numpy(obtain_orthogonal_matrix()).to(dist_util.dev()).to(th.float32)
887
+
888
+ # image_size = x.shape[-1]
889
+
890
+ # def replacement(x0, x1):
891
+ # x0_flatten = (
892
+ # x0.reshape(-1, 3, image_size, image_size)
893
+ # .reshape(
894
+ # -1,
895
+ # 3,
896
+ # image_size // patch_size,
897
+ # patch_size,
898
+ # image_size // patch_size,
899
+ # patch_size,
900
+ # )
901
+ # .permute(0, 1, 2, 4, 3, 5)
902
+ # .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
903
+ # )
904
+ # x1_flatten = (
905
+ # x1.reshape(-1, 3, image_size, image_size)
906
+ # .reshape(
907
+ # -1,
908
+ # 3,
909
+ # image_size // patch_size,
910
+ # patch_size,
911
+ # image_size // patch_size,
912
+ # patch_size,
913
+ # )
914
+ # .permute(0, 1, 2, 4, 3, 5)
915
+ # .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
916
+ # )
917
+ # x0 = th.einsum("bcnd,de->bcne", x0_flatten, Q)
918
+ # x1 = th.einsum("bcnd,de->bcne", x1_flatten, Q)
919
+ # x_mix = x0.new_zeros(x0.shape)
920
+ # x_mix[..., 0] = x0[..., 0]
921
+ # x_mix[..., 1:] = x1[..., 1:]
922
+ # x_mix = th.einsum("bcne,de->bcnd", x_mix, Q)
923
+ # x_mix = (
924
+ # x_mix.reshape(
925
+ # -1,
926
+ # 3,
927
+ # image_size // patch_size,
928
+ # image_size // patch_size,
929
+ # patch_size,
930
+ # patch_size,
931
+ # )
932
+ # .permute(0, 1, 2, 4, 3, 5)
933
+ # .reshape(-1, 3, image_size, image_size)
934
+ # )
935
+ # return x_mix
936
+
937
+ # def average_image_patches(x):
938
+ # x_flatten = (
939
+ # x.reshape(-1, 3, image_size, image_size)
940
+ # .reshape(
941
+ # -1,
942
+ # 3,
943
+ # image_size // patch_size,
944
+ # patch_size,
945
+ # image_size // patch_size,
946
+ # patch_size,
947
+ # )
948
+ # .permute(0, 1, 2, 4, 3, 5)
949
+ # .reshape(-1, 3, image_size**2 // patch_size**2, patch_size**2)
950
+ # )
951
+ # x_flatten[..., :] = x_flatten.mean(dim=-1, keepdim=True)
952
+ # return (
953
+ # x_flatten.reshape(
954
+ # -1,
955
+ # 3,
956
+ # image_size // patch_size,
957
+ # image_size // patch_size,
958
+ # patch_size,
959
+ # patch_size,
960
+ # )
961
+ # .permute(0, 1, 2, 4, 3, 5)
962
+ # .reshape(-1, 3, image_size, image_size)
963
+ # )
964
+
965
+ # t_max_rho = t_max ** (1 / rho)
966
+ # t_min_rho = t_min ** (1 / rho)
967
+ # s_in = x.new_ones([x.shape[0]])
968
+ # images = average_image_patches(images)
969
+
970
+ # for i in range(len(ts) - 1):
971
+ # t = (t_max_rho + ts[i] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
972
+ # x0 = distiller(x, t * s_in)
973
+ # x0 = th.clamp(x0, -1.0, 1.0)
974
+ # x0 = replacement(images, x0)
975
+ # next_t = (t_max_rho + ts[i + 1] / (steps - 1) * (t_min_rho - t_max_rho)) ** rho
976
+ # next_t = np.clip(next_t, t_min, t_max)
977
+ # x = x0 + generator.randn_like(x) * np.sqrt(next_t**2 - t_min**2)
978
+
979
+ # return x, images
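All three commented-out helpers (colorization, inpainting, super-resolution) share the same outer loop: map the integer schedule `ts` onto continuous noise levels with the Karras rho-interpolation, run the distiller once at that level, clamp and constrain the estimate, then re-noise up to the next level. A minimal sketch of that loop with `distiller` and `constrain` as stand-in callables (hypothetical names; the originals draw noise from a seeded `generator` rather than `th.randn_like`):

```python
import numpy as np
import torch as th

def karras_sigma(step_idx: int, steps: int = 40, t_min: float = 0.002,
                 t_max: float = 80.0, rho: float = 7.0) -> float:
    # Interpolate in t**(1/rho) space between t_max (index 0) and t_min (index steps-1).
    t_max_rho, t_min_rho = t_max ** (1 / rho), t_min ** (1 / rho)
    return (t_max_rho + step_idx / (steps - 1) * (t_min_rho - t_max_rho)) ** rho

def guided_sampler(distiller, constrain, x, ts, steps=40,
                   t_min=0.002, t_max=80.0, rho=7.0):
    s_in = x.new_ones([x.shape[0]])
    for i in range(len(ts) - 1):
        t = karras_sigma(ts[i], steps, t_min, t_max, rho)
        x0 = th.clamp(distiller(x, t * s_in), -1.0, 1.0)   # one-step denoise
        x0 = constrain(x0)                                   # e.g. a replacement() as above
        next_t = np.clip(karras_sigma(ts[i + 1], steps, t_min, t_max, rho), t_min, t_max)
        x = x0 + th.randn_like(x) * np.sqrt(next_t ** 2 - t_min ** 2)  # re-noise
    return x
```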
modules/diffusion/karras/random_utils.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch as th
7
+
8
+
9
+ def get_generator(generator, num_samples=0, seed=0):
10
+ if generator == "dummy":
11
+ return DummyGenerator()
12
+ elif generator == "determ":
13
+ return DeterministicGenerator(num_samples, seed)
14
+ elif generator == "determ-indiv":
15
+ return DeterministicIndividualGenerator(num_samples, seed)
16
+ else:
17
+ raise NotImplementedError
18
+
19
+
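The new `random_utils.py` exposes a small `get_generator` factory that the Karras sampler can use to draw noise reproducibly. A quick CPU-only usage sketch (assuming the Amphion repo root is on `PYTHONPATH`; note that on a CUDA machine the deterministic generators' constructors call `dist_util.dev()`, which is not imported in this file, so that branch would need the helper to be available):

```python
from modules.diffusion.karras.random_utils import get_generator

# Reserve noise for 8 samples in total; draw them in two batches of 4.
gen = get_generator("determ", num_samples=8, seed=0)

batch_a = gen.randn(4, 3, 32, 32)   # noise for samples 0..3
gen.set_done_samples(4)             # advance the cursor (and re-seed) before the next batch
batch_b = gen.randn(4, 3, 32, 32)   # noise for samples 4..7

# The "dummy" generator simply forwards to torch's global RNG.
dummy = get_generator("dummy")
eps = dummy.randn_like(batch_a)
```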
20
+ class DummyGenerator:
21
+ def randn(self, *args, **kwargs):
22
+ return th.randn(*args, **kwargs)
23
+
24
+ def randint(self, *args, **kwargs):
25
+ return th.randint(*args, **kwargs)
26
+
27
+ def randn_like(self, *args, **kwargs):
28
+ return th.randn_like(*args, **kwargs)
29
+
30
+
31
+ class DeterministicGenerator:
32
+ """
33
+ RNG that deterministically draws num_samples samples, independent of batch_size or the number of MPI machines.
34
+ Uses a single rng: each draw generates num_samples-sized randomness and subsamples the entries for the current indices.
35
+ """
36
+
37
+ def __init__(self, num_samples, seed=0):
38
+ print("Warning: Distributed not initialised, using single rank")
39
+ self.rank = 0
40
+ self.world_size = 1
41
+ self.num_samples = num_samples
42
+ self.done_samples = 0
43
+ self.seed = seed
44
+ self.rng_cpu = th.Generator()
45
+ if th.cuda.is_available():
46
+ self.rng_cuda = th.Generator(dist_util.dev())
47
+ self.set_seed(seed)
48
+
49
+ def get_global_size_and_indices(self, size):
50
+ global_size = (self.num_samples, *size[1:])
51
+ indices = th.arange(
52
+ self.done_samples + self.rank,
53
+ self.done_samples + self.world_size * int(size[0]),
54
+ self.world_size,
55
+ )
56
+ indices = th.clamp(indices, 0, self.num_samples - 1)
57
+ assert (
58
+ len(indices) == size[0]
59
+ ), f"rank={self.rank}, ws={self.world_size}, l={len(indices)}, bs={size[0]}"
60
+ return global_size, indices
61
+
62
+ def get_generator(self, device):
63
+ return self.rng_cpu if th.device(device).type == "cpu" else self.rng_cuda
64
+
65
+ def randn(self, *size, dtype=th.float, device="cpu"):
66
+ global_size, indices = self.get_global_size_and_indices(size)
67
+ generator = self.get_generator(device)
68
+ return th.randn(*global_size, generator=generator, dtype=dtype, device=device)[
69
+ indices
70
+ ]
71
+
72
+ def randint(self, low, high, size, dtype=th.long, device="cpu"):
73
+ global_size, indices = self.get_global_size_and_indices(size)
74
+ generator = self.get_generator(device)
75
+ return th.randint(
76
+ low, high, generator=generator, size=global_size, dtype=dtype, device=device
77
+ )[indices]
78
+
79
+ def randn_like(self, tensor):
80
+ size, dtype, device = tensor.size(), tensor.dtype, tensor.device
81
+ return self.randn(*size, dtype=dtype, device=device)
82
+
83
+ def set_done_samples(self, done_samples):
84
+ self.done_samples = done_samples
85
+ self.set_seed(self.seed)
86
+
87
+ def get_seed(self):
88
+ return self.seed
89
+
90
+ def set_seed(self, seed):
91
+ self.rng_cpu.manual_seed(seed)
92
+ if th.cuda.is_available():
93
+ self.rng_cuda.manual_seed(seed)
94
+
95
+
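The point of `DeterministicGenerator` is that the noise assigned to sample `i` depends only on `(seed, num_samples, i)`, not on how the samples are batched: after `set_done_samples` re-seeds the RNG, the next draw regenerates the full `num_samples`-sized tensor and indexes out the rows for the current batch. A standalone sketch of that idea (illustrative helper, not part of the module):

```python
import torch as th

def noise_for_samples(indices, num_samples, shape, seed=0):
    # Draw the full pool of noise deterministically, then pick out the requested rows.
    rng = th.Generator().manual_seed(seed)
    pool = th.randn(num_samples, *shape, generator=rng)
    return pool[list(indices)]

# Two batches of 2 vs. one batch of 4: the per-sample noise is identical.
a = th.cat([noise_for_samples(range(0, 2), 8, (3,)),
            noise_for_samples(range(2, 4), 8, (3,))])
b = noise_for_samples(range(0, 4), 8, (3,))
assert th.equal(a, b)
```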
96
+ class DeterministicIndividualGenerator:
97
+ """
98
+ RNG that deterministically draws num_samples samples, independent of batch_size or the number of MPI machines.
99
+ Uses a separate rng for each sample to reduce memory usage.
100
+ """
101
+
102
+ def __init__(self, num_samples, seed=0):
103
+ print("Warning: Distributed not initialised, using single rank")
104
+ self.rank = 0
105
+ self.world_size = 1
106
+ self.num_samples = num_samples
107
+ self.done_samples = 0
108
+ self.seed = seed
109
+ self.rng_cpu = [th.Generator() for _ in range(num_samples)]
110
+ if th.cuda.is_available():
111
+ self.rng_cuda = [th.Generator(dist_util.dev()) for _ in range(num_samples)]
112
+ self.set_seed(seed)
113
+
114
+ def get_size_and_indices(self, size):
115
+ indices = th.arange(
116
+ self.done_samples + self.rank,
117
+ self.done_samples + self.world_size * int(size[0]),
118
+ self.world_size,
119
+ )
120
+ indices = th.clamp(indices, 0, self.num_samples - 1)
121
+ assert (
122
+ len(indices) == size[0]
123
+ ), f"rank={self.rank}, ws={self.world_size}, l={len(indices)}, bs={size[0]}"
124
+ return (1, *size[1:]), indices
125
+
126
+ def get_generator(self, device):
127
+ return self.rng_cpu if th.device(device).type == "cpu" else self.rng_cuda
128
+
129
+ def randn(self, *size, dtype=th.float, device="cpu"):
130
+ size, indices = self.get_size_and_indices(size)
131
+ generator = self.get_generator(device)
132
+ return th.cat(
133
+ [
134
+ th.randn(*size, generator=generator[i], dtype=dtype, device=device)
135
+ for i in indices
136
+ ],
137
+ dim=0,
138
+ )
139
+
140
+ def randint(self, low, high, size, dtype=th.long, device="cpu"):
141
+ size, indices = self.get_size_and_indices(size)
142
+ generator = self.get_generator(device)
143
+ return th.cat(
144
+ [
145
+ th.randint(
146
+ low,
147
+ high,
148
+ generator=generator[i],
149
+ size=size,
150
+ dtype=dtype,
151
+ device=device,
152
+ )
153
+ for i in indices
154
+ ],
155
+ dim=0,
156
+ )
157
+
158
+ def randn_like(self, tensor):
159
+ size, dtype, device = tensor.size(), tensor.dtype, tensor.device
160
+ return self.randn(*size, dtype=dtype, device=device)
161
+
162
+ def set_done_samples(self, done_samples):
163
+ self.done_samples = done_samples
164
+
165
+ def get_seed(self):
166
+ return self.seed
167
+
168
+ def set_seed(self, seed):
169
+ [
170
+ rng_cpu.manual_seed(i + self.num_samples * seed)
171
+ for i, rng_cpu in enumerate(self.rng_cpu)
172
+ ]
173
+ if th.cuda.is_available():
174
+ [
175
+ rng_cuda.manual_seed(i + self.num_samples * seed)
176
+ for i, rng_cuda in enumerate(self.rng_cuda)
177
+ ]
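`DeterministicIndividualGenerator` gets the same batch-size independence a different way: it keeps one `torch.Generator` per sample, seeded with `i + num_samples * seed`, so it never materializes the full `num_samples`-sized tensor; the cost is `num_samples` generator objects instead of one. Unlike `DeterministicGenerator`, its `set_done_samples` does not re-seed, since each sample's stream is already independent of the others. A sketch of the per-sample seeding scheme (illustrative, not part of the module):

```python
import torch as th

def per_sample_noise(i, shape, num_samples=8, seed=0):
    # Each sample index gets its own generator, seeded independently of batching.
    rng = th.Generator().manual_seed(i + num_samples * seed)
    return th.randn(1, *shape, generator=rng)

# Sample 5 gets the same noise whether it is drawn alone or inside a full batch.
single = per_sample_noise(5, (3,))
batched = th.cat([per_sample_noise(i, (3,)) for i in range(8)], dim=0)
assert th.equal(single[0], batched[5])
```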