yuancwang committed
Commit
9893813
1 Parent(s): b725c5a
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. .gitignore +1 -0
  2. app.py +110 -9
  3. egs/datasets/README.md +381 -0
  4. egs/metrics/README.md +94 -0
  5. egs/metrics/run.sh +42 -0
  6. egs/svc/DiffComoSVC/README.md +234 -0
  7. egs/svc/DiffComoSVC/exp_config.json +143 -0
  8. egs/svc/DiffComoSVC/run.sh +1 -0
  9. egs/svc/MultipleContentsSVC/README.md +153 -0
  10. egs/svc/MultipleContentsSVC/exp_config.json +126 -0
  11. egs/svc/MultipleContentsSVC/run.sh +1 -0
  12. egs/svc/README.md +34 -0
  13. egs/svc/TransformerSVC/README.md +164 -0
  14. egs/svc/TransformerSVC/exp_config.json +108 -0
  15. egs/svc/TransformerSVC/run.sh +1 -0
  16. egs/svc/VitsSVC/README.md +125 -0
  17. egs/svc/VitsSVC/exp_config.json +162 -0
  18. egs/svc/VitsSVC/run.sh +1 -0
  19. egs/svc/_template/run.sh +150 -0
  20. egs/tta/README.md +19 -0
  21. egs/tta/RECIPE.md +156 -0
  22. egs/tta/audioldm/exp_config.json +90 -0
  23. egs/tta/audioldm/exp_config_base.json +11 -0
  24. egs/tta/audioldm/exp_config_latent_4_10_78.json +88 -0
  25. egs/tta/audioldm/run_inference.sh +52 -0
  26. egs/tta/audioldm/run_inference_latent_4_10_78.sh +52 -0
  27. egs/tta/audioldm/run_train.sh +26 -0
  28. egs/tta/audioldm/run_train_latent_4_10_78.sh +26 -0
  29. egs/tta/autoencoderkl/exp_config.json +49 -0
  30. egs/tta/autoencoderkl/exp_config_base.json +11 -0
  31. egs/tta/autoencoderkl/exp_config_latent_4_10_78.json +59 -0
  32. egs/tta/autoencoderkl/run_train.sh +26 -0
  33. egs/tta/autoencoderkl/run_train_latent_4_10_78.sh +26 -0
  34. egs/tts/FastSpeech2/README.md +132 -0
  35. egs/tts/FastSpeech2/exp_config.json +21 -0
  36. egs/tts/FastSpeech2/prepare_mfa.sh +14 -0
  37. egs/tts/FastSpeech2/run.sh +150 -0
  38. egs/tts/NaturalSpeech2/exp_config.json +39 -0
  39. egs/tts/NaturalSpeech2/exp_config_base.json +118 -0
  40. egs/tts/NaturalSpeech2/run_inference.sh +43 -0
  41. egs/tts/NaturalSpeech2/run_train.sh +18 -0
  42. egs/tts/README.md +17 -0
  43. egs/tts/VALLE/README.md +139 -0
  44. egs/tts/VALLE/exp_config.json +33 -0
  45. egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt +1 -0
  46. egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt +1 -0
  47. egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt +1 -0
  48. egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt +1 -0
  49. egs/tts/VALLE/run.sh +158 -0
  50. egs/tts/VITS/README.md +135 -0
.gitignore CHANGED
@@ -35,6 +35,7 @@ egs/svc/dev_exp_config.json
 bins/svc/demo*
 bins/svc/preprocess_custom.py
 data
+ckpts
 
 # Data and ckpt
 *.pkl
app.py CHANGED
@@ -1,24 +1,125 @@
 import gradio as gr
+import argparse
 import os
 import torch
+import soundfile as sf
+import numpy as np
+
+from models.tts.naturalspeech2.ns2 import NaturalSpeech2
+from encodec import EncodecModel
+from encodec.utils import convert_audio
+from utils.util import load_config
+
+from text import text_to_sequence
+from text.cmudict import valid_symbols
+from text.g2p import preprocess_english, read_lexicon
+
+import torchaudio
 
 
-def build_codec():
-    ...
+def build_codec(device):
+    encodec_model = EncodecModel.encodec_model_24khz()
+    encodec_model = encodec_model.to(device=device)
+    encodec_model.set_target_bandwidth(12.0)
+    return encodec_model
+
 
-def build_model():
-    ...
+def build_model(cfg, device):
+    model = NaturalSpeech2(cfg.model)
+    model.load_state_dict(
+        torch.load(
+            "ckpts/ns2/pytorch_model.bin",
+            map_location="cpu",
+        )
+    )
+    model = model.to(device=device)
+    return model
 
 
 def ns2_inference(
     prmopt_audio_path,
     text,
     diffusion_steps=100,
 ):
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-    demo_inputs = ...
-    demo_outputs = ...
+    os.environ["WORK_DIR"] = "./"
+    cfg = load_config("egs/tts/NaturalSpeech2/exp_config.json")
+
+    model = build_model(cfg, device)
+    codec = build_codec(device)
+
+    ref_wav_path = prmopt_audio_path
+    ref_wav, sr = torchaudio.load(ref_wav_path)
+    ref_wav = convert_audio(
+        ref_wav, sr, codec.sample_rate, codec.channels
+    )
+    ref_wav = ref_wav.unsqueeze(0).to(device=device)
+
+    with torch.no_grad():
+        encoded_frames = codec.encode(ref_wav)
+        ref_code = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
+
+    ref_mask = torch.ones(ref_code.shape[0], ref_code.shape[-1]).to(ref_code.device)
+
+    symbols = valid_symbols + ["sp", "spn", "sil"] + ["<s>", "</s>"]
+    phone2id = {s: i for i, s in enumerate(symbols)}
+    id2phone = {i: s for s, i in phone2id.items()}
+
+    lexicon = read_lexicon(cfg.preprocess.lexicon_path)
+    phone_seq = preprocess_english(text, lexicon)
+
+    phone_id = np.array(
+        [
+            *map(
+                phone2id.get,
+                phone_seq.replace("{", "").replace("}", "").split(),
+            )
+        ]
+    )
+    phone_id = torch.from_numpy(phone_id).unsqueeze(0).to(device=device)
+
+    x0, prior_out = model.inference(
+        ref_code, phone_id, ref_mask, diffusion_steps
+    )
+
+    latent_ref = codec.quantizer.vq.decode(ref_code.transpose(0, 1))
+    rec_wav = codec.decoder(x0)
+
+    os.makedirs("result", exist_ok=True)
+    sf.write(
+        "result/{}.wav".format(prmopt_audio_path.split("/")[-1][:-4] + "_zero_shot_result"),
+        rec_wav[0, 0].detach().cpu().numpy(),
+        samplerate=24000,
+    )
+
+    result_file = "result/{}.wav".format(prmopt_audio_path.split("/")[-1][:-4] + "_zero_shot_result")
+    return result_file
+
+
+demo_inputs = [
+    gr.Audio(
+        sources=["upload", "microphone"],
+        label="Upload a reference speech you want to clone timbre",
+        type="filepath",
+    ),
+    gr.Textbox(
+        value="Amphion is a toolkit that can speak, make sounds, and sing.",
+        label="Text you want to generate",
+        type="text",
+    ),
+    gr.Slider(
+        10,
+        1000,
+        value=200,
+        step=1,
+        label="Diffusion Inference Steps",
+        info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
+    ),
+]
+demo_outputs = gr.Audio(label="")
 
 demo = gr.Interface(
     fn=ns2_inference,
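For a quick local check, the new `ns2_inference` entry point can also be called directly; the reference path below is a placeholder, and the Gradio `demo` invokes the function in exactly the same way:

```python
# Placeholder reference audio; any speech prompt readable by torchaudio works.
result_wav = ns2_inference(
    "prompts/ref.wav",
    "Amphion is a toolkit that can speak, make sounds, and sing.",
    diffusion_steps=200,
)
print(result_wav)  # e.g. "result/ref_zero_shot_result.wav"
```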
egs/datasets/README.md ADDED
@@ -0,0 +1,381 @@
1
+ # Datasets Format
2
+
3
+ Amphion supports the following academic datasets (sorted alphabetically):
4
+
5
+ - [Datasets Format](#datasets-format)
6
+ - [AudioCaps](#audiocaps)
7
+ - [CSD](#csd)
8
+ - [KiSing](#kising)
9
+ - [LibriTTS](#libritts)
10
+ - [LJSpeech](#ljspeech)
11
+ - [M4Singer](#m4singer)
12
+ - [NUS-48E](#nus-48e)
13
+ - [Opencpop](#opencpop)
14
+ - [OpenSinger](#opensinger)
15
+ - [Opera](#opera)
16
+ - [PopBuTFy](#popbutfy)
17
+ - [PopCS](#popcs)
18
+ - [PJS](#pjs)
19
+ - [SVCC](#svcc)
20
+ - [VCTK](#vctk)
21
+
22
+ The download link and the file structure tree of each dataset are displayed as follows.
23
+
24
+ ## AudioCaps
25
+
26
+ AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps). The file structure tree is like:
27
+
28
+ ```plaintext
29
+ [AudioCaps dataset path]
30
+ ┣ AudioCaps
31
+ ┃   ┣ wav
32
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
33
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
34
+ ┃ ┃ ┣ ...
35
+ ```
36
+
37
+ ## CSD
38
+
39
+ The official CSD dataset can be downloaded [here](https://zenodo.org/records/4785016). The file structure tree is like:
40
+
41
+ ```plaintext
42
+ [CSD dataset path]
43
+ ┣ english
44
+ ┣ korean
45
+ ┣ utterances
46
+ ┃ ┣ en001a
47
+ ┃ ┃ ┣ {UtteranceID}.wav
48
+ ┃ ┣ en001b
49
+ ┃ ┣ en002a
50
+ ┃ ┣ en002b
51
+ ┃ ┣ ...
52
+ ┣ README
53
+ ```
54
+
55
+ ## KiSing
56
+
57
+ The official KiSing dataset can be downloaded [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure tree is like:
58
+
59
+ ```plaintext
60
+ [KiSing dataset path]
61
+ ┣ clean
62
+ ┃ ┣ 421
63
+ ┃ ┣ 422
64
+ ┃ ┣ ...
65
+ ```
66
+
67
+ ## LibriTTS
68
+
69
+ The official LibriTTS dataset can be downloaded [here](https://www.openslr.org/60/). The file structure tree is like:
70
+
71
+ ```plaintext
72
+ [LibriTTS dataset path]
73
+ ┣ BOOKS.txt
74
+ ┣ CHAPTERS.txt
75
+ ┣ eval_sentences10.tsv
76
+ ┣ LICENSE.txt
77
+ ┣ NOTE.txt
78
+ ┣ reader_book.tsv
79
+ ┣ README_librispeech.txt
80
+ ┣ README_libritts.txt
81
+ ┣ speakers.tsv
82
+ ┣ SPEAKERS.txt
83
+ ┣ dev-clean (Subset)
84
+ ┃ ┣ 1272{Speaker_ID}
85
+ ┃ ┃ ┣ 128104 {Chapter_ID}
86
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt
87
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt
88
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav
89
+ ┃ ┃ ┃ ┣ ...
90
+ ┃ ┃ ┃ ┣ 1272_128104.book.tsv
91
+ ┃ ┃ ┃ ┣ 1272_128104.trans.tsv
92
+ ┃ ┃ ┣ ...
93
+ ┃ ┣ ...
94
+ ┣ dev-other (Subset)
95
+ ┃ ┣ 116 (Speaker)
96
+ ┃ ┃ ┣ 288045 {Chapter_ID}
97
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt
98
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt
99
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.wav
100
+ ┃ ┃ ┃ ┣ ...
101
+ ┃ ┃ ┃ ┣ 116_288045.book.tsv
102
+ ┃ ┃ ┃ ┣ 116_288045.trans.tsv
103
+ ┃ ┃ ┣ ...
104
+ ┃ ┣ ...
105
+ ┃ ┣ ...
106
+ ┣ test-clean (Subset)
107
+ ┃ ┣ {Speaker_ID}
108
+ ┃ ┃ ┣ {Chapter_ID}
109
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
110
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
111
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
112
+ ┃ ┃ ┃ ┣ ...
113
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
114
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
115
+ ┃ ┃ ┣ ...
116
+ ┃ ┣ ...
117
+ ┣ test-other
118
+ ┃ ┣ {Speaker_ID}
119
+ ┃ ┃ ┣ {Chapter_ID}
120
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
121
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
122
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
123
+ ┃ ┃ ┃ ┣ ...
124
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
125
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
126
+ ┃ ┃ ┣ ...
127
+ ┃ ┣ ...
128
+ ┣ train-clean-100
129
+ ┃ ┣ {Speaker_ID}
130
+ ┃ ┃ ┣ {Chapter_ID}
131
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
132
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
133
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
134
+ ┃ ┃ ┃ ┣ ...
135
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
136
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
137
+ ┃ ┃ ┣ ...
138
+ ┃ ┣ ...
139
+ ┣ train-clean-360
140
+ ┃ ┣ {Speaker_ID}
141
+ ┃ ┃ ┣ {Chapter_ID}
142
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
143
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
144
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
145
+ ┃ ┃ ┃ ┣ ...
146
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
147
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
148
+ ┃ ┃ ┣ ...
149
+ ┃ ┣ ...
150
+ ┣ train-other-500
151
+ ┃ ┣ {Speaker_ID}
152
+ ┃ ┃ ┣ {Chapter_ID}
153
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
154
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
155
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
156
+ ┃ ┃ ┃ ┣ ...
157
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
158
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
159
+ ┃ ┃ ┣ ...
160
+ ┃ ┣ ...
161
+ ```
162
+
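For illustration only, once LibriTTS is laid out as above, the utterance/transcript pairs of a subset can be enumerated with a few lines of Python (the dataset root below is a placeholder, not part of the recipe):

```python
from pathlib import Path

# Placeholder root; point it at your local "[LibriTTS dataset path]".
root = Path("/path/to/LibriTTS") / "train-clean-100"

for wav in sorted(root.glob("*/*/*.wav")):
    # e.g. 1272/128104/1272_128104_000001_000000.wav and its normalized transcript
    txt = wav.with_name(wav.stem + ".normalized.txt")
    if txt.exists():
        transcript = txt.read_text().strip()
```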
163
+
164
+ ## LJSpeech
165
+
166
+ The official LJSpeech dataset can be downloaded [here](https://keithito.com/LJ-Speech-Dataset/). The file structure tree is like:
167
+
168
+ ```plaintext
169
+ [LJSpeech dataset path]
170
+ ┣ metadata.csv
171
+ ┣ wavs
172
+ ┃ ┣ LJ001-0001.wav
173
+ ┃ ┣ LJ001-0002.wav
174
+ ┃ ┣ ...
175
+ ┣ README
176
+ ```
177
+
178
+ ## M4Singer
179
+
180
+ The official M4Singer dataset can be downloaded [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure tree is like:
181
+
182
+ ```plaintext
183
+ [M4Singer dataset path]
184
+ ┣ {Singer_1}#{Song_1}
185
+ ┃ ┣ 0000.mid
186
+ ┃ ┣ 0000.TextGrid
187
+ ┃ ┣ 0000.wav
188
+ ┃ ┣ ...
189
+ ┣ {Singer_1}#{Song_2}
190
+ ┣ ...
191
+ ┣ {Singer_2}#{Song_1}
192
+ ┣ {Singer_2}#{Song_2}
193
+ ┣ ...
194
+ ┗ meta.json
195
+ ```
196
+
197
+ ## NUS-48E
198
+
199
+ The official NUS-48E dataset can be downloaded [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure tree is like:
200
+
201
+ ```plaintext
202
+ [NUS-48E dataset path]
203
+ ┣ {SpeakerID}
204
+ ┃ ┣ read
205
+ ┃ ┃ ┣ {SongID}.txt
206
+ ┃ ┃ ┣ {SongID}.wav
207
+ ┃ ┃ ┣ ...
208
+ ┃ ┣ sing
209
+ ┃ ┃ ┣ {SongID}.txt
210
+ ┃ ┃ ┣ {SongID}.wav
211
+ ┃ ┃ ┣ ...
212
+ ┣ ...
213
+ ┣ README.txt
214
+
215
+ ```
216
+
217
+ ## Opencpop
218
+
219
+ The official Opencpop dataset can be downloaded [here](https://wenet.org.cn/opencpop/). The file structure tree is like:
220
+
221
+ ```plaintext
222
+ [Opencpop dataset path]
223
+ ┣ midis
224
+ ┃ ┣ 2001.midi
225
+ ┃ ┣ 2002.midi
226
+ ┃ ┣ 2003.midi
227
+ ┃ ┣ ...
228
+ ┣ segments
229
+ ┃ ┣ wavs
230
+ ┃ ┃ ┣ 2001000001.wav
231
+ ┃ ┃ ┣ 2001000002.wav
232
+ ┃ ┃ ┣ 2001000003.wav
233
+ ┃ ┃ ┣ ...
234
+ ┃ ┣ test.txt
235
+ ┃ ┣ train.txt
236
+ ┃ ┗ transcriptions.txt
237
+ ┣ textgrids
238
+ ┃ ┣ 2001.TextGrid
239
+ ┃ ┣ 2002.TextGrid
240
+ ┃ ┣ 2003.TextGrid
241
+ ┃ ┣ ...
242
+ ┣ wavs
243
+ ┃ ┣ 2001.wav
244
+ ┃ ┣ 2002.wav
245
+ ┃ ┣ 2003.wav
246
+ ┃ ┣ ...
247
+ ┣ TERMS_OF_ACCESS
248
+ ┗ readme.md
249
+ ```
250
+
251
+ ## OpenSinger
252
+
253
+ The official OpenSinger dataset can be downloaded [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure tree is like:
254
+
255
+ ```plaintext
256
+ [OpenSinger dataset path]
257
+ ┣ ManRaw
258
+ ┃ ┣ {Singer_1}_{Song_1}
259
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab
260
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt
261
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav
262
+ ┃ ┃ ┣ ...
263
+ ┃ ┣ {Singer_1}_{Song_2}
264
+ ┃ ┣ ...
265
+ ┣ WomanRaw
266
+ ┣ LICENSE
267
+ ┗ README.md
268
+ ```
269
+
270
+ ## Opera
271
+
272
+ The official Opera dataset can be downloaded [here](http://isophonics.net/SingingVoiceDataset). The file structure tree is like:
273
+
274
+ ```plaintext
275
+ [Opera dataset path]
276
+ ┣ monophonic
277
+ ┃ ┣ chinese
278
+ ┃ ┃ ┣ {Gender}_{SingerID}
279
+ ┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav
280
+ ┃ ┃ ┃ ┣ ...
281
+ ┃ ┃ ┣ ...
282
+ ┃ ┣ western
283
+ ┣ polyphonic
284
+ ┃ ┣ chinese
285
+ ┃ ┣ western
286
+ ┣ CrossculturalDataSet.xlsx
287
+ ```
288
+
289
+ ## PopBuTFy
290
+
291
+ The official PopBuTFy dataset can be downloaded [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure tree is like:
292
+
293
+ ```plaintext
294
+ [PopBuTFy dataset path]
295
+ ┣ data
296
+ ┃ ┣ {SingerID}#singing#{SongName}_Amateur
297
+ ┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3
298
+ ┃ ┃ ┣ ...
299
+ ┃ ┣ {SingerID}#singing#{SongName}_Professional
300
+ ┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3
301
+ ┃ ┃ ┣ ...
302
+ ┣ text_labels
303
+ ┗ TERMS_OF_ACCESS
304
+ ```
305
+
306
+ ## PopCS
307
+
308
+ The official PopCS dataset can be downloaded [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure tree is like:
309
+
310
+ ```plaintext
311
+ [PopCS dataset path]
312
+ ┣ popcs
313
+ ┃ ┣ popcs-{SongName}
314
+ ┃ ┃ ┣ {UtteranceID}_ph.txt
315
+ ┃ ┃ ┣ {UtteranceID}_wf0.wav
316
+ ┃ ┃ ┣ {UtteranceID}.TextGrid
317
+ ┃ ┃ ┣ {UtteranceID}.txt
318
+ ┃ ┃ ┣ ...
319
+ ┃ ┣ ...
320
+ ┗ TERMS_OF_ACCESS
321
+ ```
322
+
323
+ ## PJS
324
+
325
+ The official PJS dataset can be downloaded [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure tree is like:
326
+
327
+ ```plaintext
328
+ [PJS dataset path]
329
+ ┣ PJS_corpus_ver1.1
330
+ ┃ ┣ background_noise
331
+ ┃ ┣ pjs{SongID}
332
+ ┃ ┃ ┣ pjs{SongID}_song.wav
333
+ ┃ ┃ ┣ pjs{SongID}_speech.wav
334
+ ┃ ┃ ┣ pjs{SongID}.lab
335
+ ┃ ┃ ┣ pjs{SongID}.mid
336
+ ┃ ┃ ┣ pjs{SongID}.musicxml
337
+ ┃ ┃ ┣ pjs{SongID}.txt
338
+ ┃ ┣ ...
339
+ ```
340
+
341
+ ## SVCC
342
+
343
+ The official SVCC dataset can be downloaded [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure tree is like:
344
+
345
+ ```plaintext
346
+ [SVCC dataset path]
347
+ ┣ Data
348
+ ┃ ┣ CDF1
349
+ ┃ ┃ ┣ 10001.wav
350
+ ┃ ┃ ┣ 10002.wav
351
+ ┃ ┃ ┣ ...
352
+ ┃ ┣ CDM1
353
+ ┃ ┣ IDF1
354
+ ┃ ┣ IDM1
355
+ ┗ README.md
356
+ ```
357
+
358
+ ## VCTK
359
+
360
+ The official VCTK dataset can be downloaded [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure tree is like:
361
+
362
+ ```plaintext
363
+ [VCTK dataset path]
364
+ ┣ txt
365
+ ┃ ┣ {Speaker_1}
366
+ ┃ ┃ ┣ {Speaker_1}_001.txt
367
+ ┃ ┃ ┣ {Speaker_1}_002.txt
368
+ ┃ ┃ ┣ ...
369
+ ┃ ┣ {Speaker_2}
370
+ ┃ ┣ ...
371
+ ┣ wav48_silence_trimmed
372
+ ┃ ┣ {Speaker_1}
373
+ ┃ ┃ ┣ {Speaker_1}_001_mic1.flac
374
+ ┃ ┃ ┣ {Speaker_1}_001_mic2.flac
375
+ ┃ ┃ ┣ {Speaker_1}_002_mic1.flac
376
+ ┃ ┃ ┣ ...
377
+ ┃ ┣ {Speaker_2}
378
+ ┃ ┣ ...
379
+ ┣ speaker-info.txt
380
+ ┗ update.txt
381
+ ```
egs/metrics/README.md ADDED
@@ -0,0 +1,94 @@
1
+ # Amphion Evaluation Recipe
2
+
3
+ ## Supported Evaluation Metrics
4
+
5
+ So far, Amphion Evaluation supports the following objective metrics:
6
+
7
+ - **F0 Modeling**:
8
+ - F0 Pearson Coefficients (FPC)
9
+ - F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
10
+ - F0 Root Mean Square Error (F0RMSE)
11
+ - Voiced/Unvoiced F1 Score (V/UV F1)
12
+ - **Energy Modeling**:
13
+ - Energy Root Mean Square Error (EnergyRMSE)
14
+ - Energy Pearson Coefficients (EnergyPC)
15
+ - **Intelligibility**:
16
+ - Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
17
+ - Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
18
+ - **Spectrogram Distortion**:
19
+ - Frechet Audio Distance (FAD)
20
+ - Mel Cepstral Distortion (MCD)
21
+ - Multi-Resolution STFT Distance (MSTFT)
22
+ - Perceptual Evaluation of Speech Quality (PESQ)
23
+ - Short Time Objective Intelligibility (STOI)
24
+ - Scale Invariant Signal to Distortion Ratio (SISDR)
25
+ - Scale Invariant Signal to Noise Ratio (SISNR)
26
+ - **Speaker Similarity**:
27
+ - Cosine similarity based on [Rawnet3](https://github.com/Jungjee/RawNet)
28
+ - Cosine similarity based on [WeSpeaker](https://github.com/wenet-e2e/wespeaker) (👨‍💻 developing)
29
+
30
+ We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:
31
+
32
+ 1. Pretrained Models Preparation
33
+ 2. Audio Data Preparation
34
+ 3. Evaluation
35
+
36
+ ## 1. Pretrained Models Preparation
37
+
38
+ If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
39
+
40
+ ## 2. Audio Data Preparation
41
+
42
+ Prepare the reference audios and the generated audios in two folders: `ref_dir` contains the reference audios and `gen_dir` contains the generated audios. Here is an example.
43
+
44
+ ```plaintext
45
+ ┣ {ref_dir}
46
+ ┃ ┣ sample1.wav
47
+ ┃ ┣ sample2.wav
48
+ ┣ {gen_dir}
49
+ ┃ ┣ sample1.wav
50
+ ┃ ┣ sample2.wav
51
+ ```
52
+
53
+ Make sure that each **reference audio and its generated counterpart are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2); a minimal sketch of this pairing check is shown below.
54
+
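For illustration only, the naming convention can be sanity-checked with a few lines of Python before running the evaluation (the folder paths are placeholders):

```python
from pathlib import Path

ref_dir = Path("/path/to/ref_dir")   # placeholder
gen_dir = Path("/path/to/gen_dir")   # placeholder

# Every reference audio must have a generated audio with the same file name.
for ref_wav in sorted(ref_dir.glob("*.wav")):
    gen_wav = gen_dir / ref_wav.name          # e.g. sample1.wav -> sample1.wav
    if not gen_wav.exists():
        print(f"Missing generated audio for {ref_wav.name}")
```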
55
+ ## 3. Evaluation
56
+
57
+ Run `run.sh` with the reference folder, generated folder, dump folder, and metrics specified.
58
+
59
+ ```bash
60
+ cd Amphion
61
+ sh egs/metrics/run.sh \
62
+ --reference_folder [Your path to the reference audios] \
63
+ --generated_folder [Your path to the generated audios] \
64
+ --dump_folder [Your path to dump the objective results] \
65
+ --metrics [The metrics you need] \
66
+ --fs [Optional. To calculate all metrics in the specified sampling rate]
67
+ ```
68
+
69
+ As for the metrics, an example is provided below:
70
+
71
+ ```bash
72
+ --metrics "mcd pesq fad"
73
+ ```
74
+
75
+ All currently available metrics keywords are listed below:
76
+
77
+ | Keys | Description |
78
+ | --------------------- | ------------------------------------------ |
79
+ | `fpc` | F0 Pearson Coefficients |
80
+ | `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error |
81
+ | `f0rmse` | F0 Root Mean Square Error |
82
+ | `v_uv_f1` | Voiced/Unvoiced F1 Score |
83
+ | `energy_rmse` | Energy Root Mean Square Error |
84
+ | `energy_pc` | Energy Pearson Coefficients |
85
+ | `cer` | Character Error Rate |
86
+ | `wer` | Word Error Rate |
87
+ | `speaker_similarity` | Cos Similarity based on RawNet3 |
88
+ | `fad` | Frechet Audio Distance |
89
+ | `mcd` | Mel Cepstral Distortion |
90
+ | `mstft` | Multi-Resolution STFT Distance |
91
+ | `pesq` | Perceptual Evaluation of Speech Quality |
92
+ | `si_sdr` | Scale Invariant Signal to Distortion Ratio |
93
+ | `si_snr` | Scale Invariant Signal to Noise Ratio |
94
+ | `stoi` | Short Time Objective Intelligibility |
egs/metrics/run.sh ADDED
@@ -0,0 +1,42 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $exp_dir))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics:,fs: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Reference Audio Folder
21
+ --reference_folder) shift; ref_dir=$1 ; shift ;;
22
+ # Generated Audio Folder
23
+ --generated_folder) shift; deg_dir=$1 ; shift ;;
24
+ # Result Dumping Folder
25
+ --dump_folder) shift; dump_dir=$1 ; shift ;;
26
+ # Metrics to Compute
27
+ --metrics) shift; metrics=$1 ; shift ;;
28
+ # Sampling Rate
29
+ --fs) shift; fs=$1 ; shift ;;
30
+
31
+ --) shift ; break ;;
32
+ *) echo "Invalid option: $1"; exit 1 ;;
33
+ esac
34
+ done
35
+
36
+ ######## Calculate Objective Metrics ###########
37
+ CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
38
+ --ref_dir $ref_dir \
39
+ --deg_dir $deg_dir \
40
+ --dump_dir $dump_dir \
41
+ --metrics $metrics \
42
+ --fs $fs \
egs/svc/DiffComoSVC/README.md ADDED
@@ -0,0 +1,234 @@
1
+ # Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation
2
+ <br>
3
+ <div align="center">
4
+ <img src="../../../imgs/svc/DiffComoSVC.png" width="90%">
5
+ </div>
6
+ <br>
7
+
8
+ This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio); only a slight modification is applied to the acoustic model. Specifically,
9
+
10
+ * The acoustic model consists of a conformer, which generates a coarse spectrogram, and a diffusion decoder based on a Bidirectional Non-Causal Dilated CNN, which refines that coarse spectrogram. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/).
11
+ * To accelerate the diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For the teacher model, the diffusion schedule of the diffusion decoder follows [Karras diffusion](https://arxiv.org/abs/2206.00364). When distilling the teacher model, the condition encoder and the conformer part of the acoustic model are frozen, while the diffusion decoder is updated via an exponential moving average (see the sketch after this list). See the figure above for details.
12
+
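As a rough, illustrative sketch of the distillation setup described above (the module names below are hypothetical stand-ins, not Amphion's actual classes): the condition encoder and conformer stay frozen, while an EMA copy of the diffusion decoder serves as the distillation target.

```python
import copy
import torch
from torch import nn

def freeze(module: nn.Module) -> nn.Module:
    for p in module.parameters():
        p.requires_grad_(False)
    return module

@torch.no_grad()
def ema_update(target: nn.Module, online: nn.Module, decay: float = 0.999) -> None:
    # The target network tracks the online (student) decoder with an exponential moving average.
    for p_t, p_o in zip(target.parameters(), online.parameters()):
        p_t.mul_(decay).add_(p_o, alpha=1.0 - decay)

# Hypothetical stand-ins for the condition encoder + conformer and the diffusion decoder.
encoder = freeze(nn.Linear(384, 384))                    # frozen during distillation
online_decoder = nn.Linear(384, 100)                     # student decoder, updated by the optimizer
target_decoder = freeze(copy.deepcopy(online_decoder))   # EMA copy used as the consistency target

# Call after each optimizer step on `online_decoder`:
ema_update(target_decoder, online_decoder)
```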
13
+ There are five stages in total:
14
+
15
+ 1. Data preparation
16
+ 2. Features extraction
17
+ 3. Teacher Model Training
18
+ 4. Consistency Distillation
19
+ 5. Inference/conversion
20
+
21
+ ## 1. Data Preparation
22
+
23
+ ### Dataset Download
24
+
25
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
26
+
27
+ ### Configuration
28
+
29
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
30
+
31
+ ```json
32
+ "dataset": [
33
+ "m4singer",
34
+ "opencpop",
35
+ "opensinger",
36
+ "svcc",
37
+ "vctk"
38
+ ],
39
+ "dataset_path": {
40
+ // TODO: Fill in your dataset path
41
+ "m4singer": "[M4Singer dataset path]",
42
+ "opencpop": "[Opencpop dataset path]",
43
+ "opensinger": "[OpenSinger dataset path]",
44
+ "svcc": "[SVCC dataset path]",
45
+ "vctk": "[VCTK dataset path]"
46
+ },
47
+ ```
48
+
49
+ ## 2. Features Extraction
50
+
51
+ ### Content-based Pretrained Models Download
52
+
53
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
54
+
55
+ ### Configuration
56
+
57
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
58
+
59
+ ```json
60
+ // TODO: Fill in the output log path
61
+ "log_dir": "[Your path to save logs and checkpoints]",
62
+ "preprocess": {
63
+ // TODO: Fill in the output data path
64
+ "processed_dir": "[Your path to save processed data]",
65
+ ...
66
+ },
67
+ ```
68
+
69
+ ### Run
70
+
71
+ Run `run.sh` as the preprocessing stage (set `--stage 1`).
72
+
73
+ ```bash
74
+ cd Amphion
75
+ sh egs/svc/DiffComoSVC/run.sh --stage 1
76
+ ```
77
+
78
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
79
+
80
+ ## 3. Teacher Model Training
81
+
82
+ ### Configuration
83
+
84
+ Set `distill` in `config/comosvc.json` to `false` for teacher model training. You can also specify the detailed configuration for the conformer encoder and the diffusion process here (the sketch after this block shows how the Karras parameters define the noise schedule):
85
+
86
+ ```JSON
87
+ "comosvc":{
88
+ "distill": false,
89
+ // conformer encoder
90
+ "input_dim": 384,
91
+ "output_dim": 100,
92
+ "n_heads": 2,
93
+ "n_layers": 6,
94
+ "filter_channels":512,
95
+ // karras diffusion
96
+ "P_mean": -1.2,
97
+ "P_std": 1.2,
98
+ "sigma_data": 0.5,
99
+ "sigma_min": 0.002,
100
+ "sigma_max": 80,
101
+ "rho": 7,
102
+ "n_timesteps": 40,
103
+ },
104
+ ```
105
+
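For reference, `sigma_min`, `sigma_max`, `rho`, and `n_timesteps` above parameterize the Karras et al. noise schedule. A minimal sketch of how such a schedule is computed (illustrative only, not the recipe's code):

```python
import numpy as np

def karras_sigmas(sigma_min=0.002, sigma_max=80.0, rho=7.0, n_timesteps=40):
    # Noise levels from Karras et al. (2022), decreasing from sigma_max down to sigma_min.
    ramp = np.linspace(0.0, 1.0, n_timesteps)
    min_inv_rho = sigma_min ** (1.0 / rho)
    max_inv_rho = sigma_max ** (1.0 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

sigmas = karras_sigmas()
print(sigmas[0], sigmas[-1])  # ~80.0 ... ~0.002
```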
106
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
107
+
108
+ ```json
109
+ "train": {
110
+ "batch_size": 32,
111
+ ...
112
+ "adamw": {
113
+ "lr": 2.0e-4
114
+ },
115
+ ...
116
+ }
117
+ ```
118
+
119
+ ### Run
120
+
121
+ Run `run.sh` as the training stage (set `--stage 2`) and specify an experiment name. The TensorBoard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
122
+
123
+ ```bash
124
+ cd Amphion
125
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
126
+ ```
127
+
128
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can specify it when running `run.sh`, for example:
129
+
130
+ ```bash
131
+ cd Amphion
132
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
133
+ ```
134
+
135
+ ## 4. Consistency Distillation
136
+
137
+ ### Configuration
138
+
139
+ Set `distill` in `config/comosvc.json` to `true` for consistency distillation, and specify the `teacher_model_path` to distill from. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
140
+
141
+ ```JSON
142
+ "model": {
143
+ "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
144
+ ...
145
+ "comosvc":{
146
+ "distill": true,
147
+ // conformer encoder
148
+ "input_dim": 384,
149
+ "output_dim": 100,
150
+ "n_heads": 2,
151
+ "n_layers": 6,
152
+ "filter_channels":512,
153
+ // karras diffusion
154
+ "P_mean": -1.2,
155
+ "P_std": 1.2,
156
+ "sigma_data": 0.5,
157
+ "sigma_min": 0.002,
158
+ "sigma_max": 80,
159
+ "rho": 7,
160
+ "n_timesteps": 40,
161
+ },
162
+ ```
163
+
164
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
165
+
166
+ ```json
167
+ "train": {
168
+ "batch_size": 32,
169
+ ...
170
+ "adamw": {
171
+ "lr": 2.0e-4
172
+ },
173
+ ...
174
+ }
175
+ ```
176
+
177
+ ### Run
178
+
179
+ Run `run.sh` as the training stage (set `--stage 2`) and specify an experiment name. The TensorBoard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
180
+
181
+ ```bash
182
+ cd Amphion
183
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
184
+ ```
185
+
186
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can specify it when running `run.sh`, for example:
187
+
188
+ ```bash
189
+ cd Amphion
190
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
191
+ ```
192
+
193
+ ## 5. Inference/Conversion
194
+
195
+ ### Pretrained Vocoder Download
196
+
197
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
198
+
199
+ ### Run
200
+
201
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
202
+
203
+ | Parameters | Description | Example |
204
+ | --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
205
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
206
+ | `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
207
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
208
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
209
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
210
+
211
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
212
+
213
+ ```bash
214
+ cd Amphion
215
+ sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
216
+ --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
217
+ --infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \
218
+ --infer_source_audio_dir [Your Audios Folder] \
219
+ --infer_target_speaker "opencpop_female1" \
220
+ --infer_key_shift "autoshift"
221
+ ```
222
+ In particular, you can configure the inference steps for the teacher model by setting `inference` in `exp_config` (the student model always uses one-step sampling):
223
+ ```json
224
+ "inference": {
225
+ "comosvc": {
226
+ "inference_steps": 40
227
+ }
228
+ }
229
+ ```
230
+
231
+ ## Reference
232
+ https://github.com/zhenye234/CoMoSpeech
233
+
234
+ https://github.com/openai/consistency_models
egs/svc/DiffComoSVC/exp_config.json ADDED
@@ -0,0 +1,143 @@
1
+ {
2
+ "base_config": "config/comosvc.json",
3
+ "model_type": "DiffComoSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path
20
+ "log_dir": "[Your path to save logs and checkpoints]",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path
23
+ "processed_dir": "[Your path to save processed data]",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
53
+ "condition_encoder": {
54
+ // Config for features usage
55
+ "use_whisper": true,
56
+ "use_contentvec": true,
57
+ "use_wenet": false,
58
+ "whisper_dim": 1024,
59
+ "contentvec_dim": 256,
60
+ "wenet_dim": 512,
61
+ "use_singer_encoder": false,
62
+ "pitch_min": 50,
63
+ "pitch_max": 1100
64
+ },
65
+ "comosvc":{
66
+ "distill": false,
67
+ // conformer encoder
68
+ "input_dim": 384,
69
+ "output_dim": 100,
70
+ "n_heads": 2,
71
+ "n_layers": 6,
72
+ "filter_channels":512,
73
+ "dropout":0.1,
74
+ // karras diffusion
75
+ "P_mean": -1.2,
76
+ "P_std": 1.2,
77
+ "sigma_data": 0.5,
78
+ "sigma_min": 0.002,
79
+ "sigma_max": 80,
80
+ "rho": 7,
81
+ "n_timesteps": 40,
82
+ },
83
+ "diffusion": {
84
+ // Diffusion steps encoder
85
+ "step_encoder": {
86
+ "dim_raw_embedding": 128,
87
+ "dim_hidden_layer": 512,
88
+ "activation": "SiLU",
89
+ "num_layer": 2,
90
+ "max_period": 10000
91
+ },
92
+ // Diffusion decoder
93
+ "model_type": "bidilconv",
94
+ // bidilconv, unet2d, TODO: unet1d
95
+ "bidilconv": {
96
+ "base_channel": 384,
97
+ "n_res_block": 20,
98
+ "conv_kernel_size": 3,
99
+ "dilation_cycle_length": 4,
100
+ // specially, 1 means no dilation
101
+ "conditioner_size": 100
102
+ }
103
+ }
104
+ },
105
+ "train": {
106
+ "batch_size": 64,
107
+ "gradient_accumulation_step": 1,
108
+ "max_epoch": -1, // -1 means no limit
109
+ "save_checkpoint_stride": [
110
+ 50,
111
+ 50
112
+ ],
113
+ "keep_last": [
114
+ 5,
115
+ -1
116
+ ],
117
+ "run_eval": [
118
+ false,
119
+ true
120
+ ],
121
+ "adamw": {
122
+ "lr": 4.0e-4
123
+ },
124
+ "reducelronplateau": {
125
+ "factor": 0.8,
126
+ "patience": 10,
127
+ "min_lr": 1.0e-4
128
+ },
129
+ "dataloader": {
130
+ "num_worker": 8,
131
+ "pin_memory": true
132
+ },
133
+ "sampler": {
134
+ "holistic_shuffle": false,
135
+ "drop_last": true
136
+ }
137
+ },
138
+ "inference": {
139
+ "comosvc": {
140
+ "inference_steps": 40
141
+ }
142
+ }
143
+ }
egs/svc/DiffComoSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/MultipleContentsSVC/README.md ADDED
@@ -0,0 +1,153 @@
1
+ # Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
4
+ [![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
5
+
6
+ <br>
7
+ <div align="center">
8
+ <img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
9
+ </div>
10
+ <br>
11
+
12
+ This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,
13
+
14
+ - The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
15
+ - The acoustic model is based on a Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219); a sketch of such a dilated residual block is given after this list.
16
+ - The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.
17
+
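For intuition, a bidirectional (non-causal) dilated convolution block of the kind used by `DiffWaveNetSVC` can be sketched as follows; this is a simplified illustration, not Amphion's actual implementation, and the dilation cycling mirrors the `dilation_cycle_length` option in `exp_config.json`:

```python
import torch
from torch import nn

class DilatedResBlock(nn.Module):
    # Non-causal (symmetrically padded) dilated 1-D convolution with a residual connection.
    def __init__(self, channels: int, kernel_size: int = 3, dilation: int = 1):
        super().__init__()
        padding = (kernel_size - 1) // 2 * dilation   # keeps the frame length unchanged
        self.conv = nn.Conv1d(channels, channels, kernel_size, padding=padding, dilation=dilation)
        self.act = nn.SiLU()

    def forward(self, x):
        return x + self.act(self.conv(x))

# Dilations cycle as 1, 2, 4, 8, 1, 2, ... (cf. "dilation_cycle_length": 4 in the config).
blocks = nn.Sequential(*[DilatedResBlock(512, dilation=2 ** (i % 4)) for i in range(8)])
y = blocks(torch.randn(1, 512, 200))   # (batch, channels, frames)
```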
18
+ There are four stages in total:
19
+
20
+ 1. Data preparation
21
+ 2. Features extraction
22
+ 3. Training
23
+ 4. Inference/conversion
24
+
25
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
26
+ > ```bash
27
+ > cd Amphion
28
+ > ```
29
+
30
+ ## 1. Data Preparation
31
+
32
+ ### Dataset Download
33
+
34
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
35
+
36
+ ### Configuration
37
+
38
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
39
+
40
+ ```json
41
+ "dataset": [
42
+ "m4singer",
43
+ "opencpop",
44
+ "opensinger",
45
+ "svcc",
46
+ "vctk"
47
+ ],
48
+ "dataset_path": {
49
+ // TODO: Fill in your dataset path
50
+ "m4singer": "[M4Singer dataset path]",
51
+ "opencpop": "[Opencpop dataset path]",
52
+ "opensinger": "[OpenSinger dataset path]",
53
+ "svcc": "[SVCC dataset path]",
54
+ "vctk": "[VCTK dataset path]"
55
+ },
56
+ ```
57
+
58
+ ## 2. Features Extraction
59
+
60
+ ### Content-based Pretrained Models Download
61
+
62
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
63
+
64
+ ### Configuration
65
+
66
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
67
+
68
+ ```json
69
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
70
+ "log_dir": "ckpts/svc",
71
+ "preprocess": {
72
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
73
+ "processed_dir": "data",
74
+ ...
75
+ },
76
+ ```
77
+
78
+ ### Run
79
+
80
+ Run `run.sh` as the preprocessing stage (set `--stage 1`).
81
+
82
+ ```bash
83
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 1
84
+ ```
85
+
86
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
87
+
88
+ ## 3. Training
89
+
90
+ ### Configuration
91
+
92
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
93
+
94
+ ```json
95
+ "train": {
96
+ "batch_size": 32,
97
+ ...
98
+ "adamw": {
99
+ "lr": 2.0e-4
100
+ },
101
+ ...
102
+ }
103
+ ```
104
+
105
+ ### Run
106
+
107
+ Run `run.sh` as the training stage (set `--stage 2`) and specify an experiment name. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
108
+
109
+ ```bash
110
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
111
+ ```
112
+
113
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
114
+
115
+ ## 4. Inference/Conversion
116
+
117
+ ### Pretrained Vocoder Download
118
+
119
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
120
+
121
+ ### Run
122
+
123
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
124
+
125
+ | Parameters | Description | Example |
126
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
127
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
128
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
129
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
130
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
131
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
132
+
133
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
134
+
135
+ ```bash
136
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
137
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
138
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
139
+ --infer_source_audio_dir [Your Audios Folder] \
140
+ --infer_target_speaker "opencpop_female1" \
141
+ --infer_key_shift "autoshift"
142
+ ```
143
+
144
+ ## Citations
145
+
146
+ ```bibtex
147
+ @article{zhang2023leveraging,
148
+ title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
149
+ author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
150
+ journal={Machine Learning for Audio Workshop, NeurIPS 2023},
151
+ year={2023}
152
+ }
153
+ ```
egs/svc/MultipleContentsSVC/exp_config.json ADDED
@@ -0,0 +1,126 @@
1
+ {
2
+ "base_config": "config/diffusion.json",
3
+ "model_type": "DiffWaveNetSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "diffusion": {
65
+ "scheduler": "ddpm",
66
+ "scheduler_settings": {
67
+ "num_train_timesteps": 1000,
68
+ "beta_start": 1.0e-4,
69
+ "beta_end": 0.02,
70
+ "beta_schedule": "linear"
71
+ },
72
+ // Diffusion steps encoder
73
+ "step_encoder": {
74
+ "dim_raw_embedding": 128,
75
+ "dim_hidden_layer": 512,
76
+ "activation": "SiLU",
77
+ "num_layer": 2,
78
+ "max_period": 10000
79
+ },
80
+ // Diffusion decoder
81
+ "model_type": "bidilconv",
82
+ // bidilconv, unet2d, TODO: unet1d
83
+ "bidilconv": {
84
+ "base_channel": 512,
85
+ "n_res_block": 40,
86
+ "conv_kernel_size": 3,
87
+ "dilation_cycle_length": 4,
88
+ // specially, 1 means no dilation
89
+ "conditioner_size": 384
90
+ }
91
+ }
92
+ },
93
+ "train": {
94
+ "batch_size": 32,
95
+ "gradient_accumulation_step": 1,
96
+ "max_epoch": -1, // -1 means no limit
97
+ "save_checkpoint_stride": [
98
+ 3,
99
+ 50
100
+ ],
101
+ "keep_last": [
102
+ 3,
103
+ 2
104
+ ],
105
+ "run_eval": [
106
+ true,
107
+ true
108
+ ],
109
+ "adamw": {
110
+ "lr": 2.0e-4
111
+ },
112
+ "reducelronplateau": {
113
+ "factor": 0.8,
114
+ "patience": 30,
115
+ "min_lr": 1.0e-4
116
+ },
117
+ "dataloader": {
118
+ "num_worker": 8,
119
+ "pin_memory": true
120
+ },
121
+ "sampler": {
122
+ "holistic_shuffle": false,
123
+ "drop_last": true
124
+ }
125
+ }
126
+ }
egs/svc/MultipleContentsSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # Amphion Singing Voice Conversion (SVC) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting-edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
6
+
7
+ ## Supported Model Architectures
8
+
9
+ The main idea of SVC is to first disentangle speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target. This usually involves an acoustic decoder and a subsequent waveform synthesizer (vocoder):
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/svc/pipeline.png" width="70%">
14
+ </div>
15
+ <br>
16
+
17
+ So far, Amphion SVC supports the following features and models:
18
+
19
+ - **Speaker-agnostic Representations**:
20
+ - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
21
+ - Prosody Features: F0 and energy.
22
+ - **Speaker Embeddings**:
23
+ - Speaker Look-Up Table.
24
+ - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
25
+ - **Acoustic Decoders**:
26
+ - Diffusion-based models:
27
+ - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
28
+ - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
29
+ - Transformer-based models:
30
+ - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
31
+ - VAE- and Flow-based models:
32
+ - **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
33
+ - **Waveform Synthesizers (Vocoders)**:
34
+ - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
egs/svc/TransformerSVC/README.md ADDED
@@ -0,0 +1,164 @@
1
+ # Transformer for Singing Voice Conversion
2
+
3
+ This is an implementation that uses a **vanilla Transformer encoder** or a **conformer** as the acoustic model for singing voice conversion.
4
+
5
+ There are four stages in total:
6
+
7
+ 1. Data preparation
8
+ 2. Features extraction
9
+ 3. Training
10
+ 4. Inference/conversion
11
+
12
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
13
+ > ```bash
14
+ > cd Amphion
15
+ > ```
16
+
17
+ ## 1. Data Preparation
18
+
19
+ ### Dataset Download
20
+
21
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "m4singer",
30
+ "opencpop",
31
+ "opensinger",
32
+ "svcc",
33
+ "vctk"
34
+ ],
35
+ "dataset_path": {
36
+ // TODO: Fill in your dataset path
37
+ "m4singer": "[M4Singer dataset path]",
38
+ "opencpop": "[Opencpop dataset path]",
39
+ "opensinger": "[OpenSinger dataset path]",
40
+ "svcc": "[SVCC dataset path]",
41
+ "vctk": "[VCTK dataset path]"
42
+ },
43
+ ```
44
+
45
+ ## 2. Features Extraction
46
+
47
+ ### Content-based Pretrained Models Download
48
+
49
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
50
+
51
+ ### Configuration
52
+
53
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
54
+
55
+ ```json
56
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
57
+ "log_dir": "ckpts/svc",
58
+ "preprocess": {
59
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
60
+ "processed_dir": "data",
61
+ ...
62
+ },
63
+ ```
64
+
65
+ ### Run
66
+
67
+ Run the `run.sh` as the preproces stage (set `--stage 1`).
68
+
69
+ ```bash
70
+ sh egs/svc/TransformerSVC/run.sh --stage 1
71
+ ```
72
+
73
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
74
+
75
+ ## 3. Training
76
+
77
+ ### Configuration
78
+ Specify the detailed configuration for the transformer block in `exp_config.json`. For the key `type`, both `conformer` and `transformer` are supported (a sketch of how these settings map onto a standard encoder follows the block):
79
+ ```json
80
+ "model": {
81
+ ...
82
+ "transformer":{
83
+ // 'conformer' or 'transformer'
84
+ "type": "conformer",
85
+ "input_dim": 384,
86
+ "output_dim": 100,
87
+ "n_heads": 2,
88
+ "n_layers": 6,
89
+ "filter_channels":512,
90
+ "dropout":0.1,
91
+ }
92
+ }
93
+ ```
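To make the dimensions concrete, the `transformer` settings above roughly correspond to a standard PyTorch encoder stack; the sketch below is illustrative only and is not the recipe's actual model code:

```python
import torch
from torch import nn

encoder_layer = nn.TransformerEncoderLayer(
    d_model=384,           # "input_dim"
    nhead=2,               # "n_heads"
    dim_feedforward=512,   # "filter_channels"
    dropout=0.1,           # "dropout"
    batch_first=True,
)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)  # "n_layers"
to_mel = nn.Linear(384, 100)                                  # "output_dim" (mel bins)

x = torch.randn(1, 200, 384)   # (batch, frames, condition dim)
mel = to_mel(encoder(x))       # -> (1, 200, 100)
```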
94
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
95
+
96
+ ```json
97
+ "train": {
98
+ "batch_size": 32,
99
+ ...
100
+ "adamw": {
101
+ "lr": 2.0e-4
102
+ },
103
+ ...
104
+ }
105
+ ```
106
+
107
+ ### Run
108
+
109
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
110
+
111
+ ```bash
112
+ sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
113
+ ```
114
+
115
+ > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`.
116
+
117
+ ## 4. Inference/Conversion
118
+
119
+ ### Pretrained Vocoder Download
120
+
121
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
122
+
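+ The inference stage (`--stage 3`) loads this vocoder from `Amphion/pretrained/bigvgan` by default (see `--infer_vocoder_dir` in the shared `run.sh`). If you keep the checkpoint elsewhere, append the flag below to the command in the next subsection (a sketch with a placeholder path):
+
+ ```bash
+ --infer_vocoder_dir [Your BigVGAN checkpoint folder]
+ ```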
123
+ ### Run
124
+
125
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
126
+
127
+ | Parameters | Description | Example |
128
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
129
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
130
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
131
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
132
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
133
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
134
+
135
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
136
+
137
+ ```bash
138
+ cd Amphion
139
+ sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \
140
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
141
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
142
+ --infer_source_audio_dir [Your Audios Folder] \
143
+ --infer_target_speaker "opencpop_female1" \
144
+ --infer_key_shift "autoshift"
145
+ ```
146
+
147
+ ## Citations
148
+
149
+ ```bibtex
150
+ @inproceedings{transformer,
151
+ author = {Ashish Vaswani and
152
+ Noam Shazeer and
153
+ Niki Parmar and
154
+ Jakob Uszkoreit and
155
+ Llion Jones and
156
+ Aidan N. Gomez and
157
+ Lukasz Kaiser and
158
+ Illia Polosukhin},
159
+ title = {Attention is All you Need},
160
+ booktitle = {{NIPS}},
161
+ pages = {5998--6008},
162
+ year = {2017}
163
+ }
164
+ ```
egs/svc/TransformerSVC/exp_config.json ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "base_config": "config/transformer.json",
3
+ "model_type": "TransformerSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "transformer": {
65
+ // 'conformer' or 'transformer'
66
+ "type": "conformer",
67
+ "input_dim": 384,
68
+ "output_dim": 100,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "filter_channels": 512,
72
+ "dropout": 0.1,
73
+ }
74
+ },
75
+ "train": {
76
+ "batch_size": 64,
77
+ "gradient_accumulation_step": 1,
78
+ "max_epoch": -1, // -1 means no limit
79
+ "save_checkpoint_stride": [
80
+ 50,
81
+ 50
82
+ ],
83
+ "keep_last": [
84
+ 5,
85
+ -1
86
+ ],
87
+ "run_eval": [
88
+ false,
89
+ true
90
+ ],
91
+ "adamw": {
92
+ "lr": 4.0e-4
93
+ },
94
+ "reducelronplateau": {
95
+ "factor": 0.8,
96
+ "patience": 10,
97
+ "min_lr": 1.0e-4
98
+ },
99
+ "dataloader": {
100
+ "num_worker": 8,
101
+ "pin_memory": true
102
+ },
103
+ "sampler": {
104
+ "holistic_shuffle": false,
105
+ "drop_last": true
106
+ }
107
+ }
108
+ }
egs/svc/TransformerSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/VitsSVC/README.md ADDED
@@ -0,0 +1,125 @@
1
+ # VITS for Singing Voice Conversion
2
+
3
+ This is an implementation of VITS as an acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), the SoftVC content encoder is used to extract content features from the source audio. These feature vectors are fed directly into VITS, without the need for conversion to a text-based intermediate representation.
4
+
5
+ There are four stages in total:
6
+
7
+ 1. Data preparation
8
+ 2. Features extraction
9
+ 3. Training
10
+ 4. Inference/conversion
11
+
12
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
13
+ > ```bash
14
+ > cd Amphion
15
+ > ```
16
+
17
+ ## 1. Data Preparation
18
+
19
+ ### Dataset Download
20
+
21
+ By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "m4singer",
30
+ "opencpop",
31
+ "opensinger",
32
+ "svcc",
33
+ "vctk"
34
+ ],
35
+ "dataset_path": {
36
+ // TODO: Fill in your dataset path
37
+ "m4singer": "[M4Singer dataset path]",
38
+ "opencpop": "[Opencpop dataset path]",
39
+ "opensinger": "[OpenSinger dataset path]",
40
+ "svcc": "[SVCC dataset path]",
41
+ "vctk": "[VCTK dataset path]"
42
+ },
43
+ ```
44
+
45
+ ## 2. Features Extraction
46
+
47
+ ### Content-based Pretrained Models Download
48
+
49
+ By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
50
+
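+ Both content features are extracted from 16kHz audio in this recipe. A sketch of the related keys and their defaults in `exp_config.json` (the model paths assume the pretrained checkpoints are placed under `Amphion/pretrained/`):
+
+ ```json
+ "preprocess": {
+     "extract_contentvec_feature": true,
+     "contentvec_sample_rate": 16000,
+     "contentvec_frameshift": 0.02,
+     "extract_whisper_feature": true,
+     "whisper_sample_rate": 16000,
+     "whisper_frameshift": 0.01,
+     "whisper_downsample_rate": 2,
+     "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
+     "whisper_model_path": "pretrained/whisper/medium.pt",
+     ...
+ },
+ ```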
51
+ ### Configuration
52
+
53
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
54
+
55
+ ```json
56
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
57
+ "log_dir": "ckpts/svc",
58
+ "preprocess": {
59
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
60
+ "processed_dir": "data",
61
+ ...
62
+ },
63
+ ```
64
+
65
+ ### Run
66
+
67
+ Run `run.sh` as the preprocessing stage (set `--stage 1`).
68
+
69
+ ```bash
70
+ sh egs/svc/VitsSVC/run.sh --stage 1
71
+ ```
72
+
73
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.
74
+
75
+ ## 3. Training
76
+
77
+ ### Configuration
78
+
79
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
80
+
81
+ ```json
82
+ "train": {
83
+ "batch_size": 32,
84
+ ...
85
+ "adamw": {
86
+ "lr": 2.0e-4
87
+ },
88
+ ...
89
+ }
90
+ ```
91
+
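+ Besides the optimization hyperparameters, the VITS backbone itself is configured under `model.vits` in the same file, and the waveform generator is an NSF-HiFiGAN. A sketch of the recipe defaults:
+
+ ```json
+ "model": {
+     "vits": {
+         "inter_channels": 192,
+         "hidden_channels": 192,
+         "filter_channels": 256,
+         "n_heads": 2,
+         "n_layers": 6,
+         "n_flow_layer": 4,
+         "gin_channels": 256,
+         "n_speakers": 512
+     },
+     "generator": "nsfhifigan"
+ },
+ ```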
92
+ ### Run
93
+
94
+ Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
95
+
96
+ ```bash
97
+ sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
98
+ ```
99
+
100
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
101
+
102
+ ## 4. Inference/Conversion
103
+
104
+ ### Run
105
+
106
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
107
+
108
+ | Parameters | Description | Example |
109
+ | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
110
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
111
+ | `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
112
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
113
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
114
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
115
+
116
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
117
+
118
+ ```bash
119
+ sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
120
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
121
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
122
+ --infer_source_audio_dir [Your Audios Folder] \
123
+ --infer_target_speaker "opencpop_female1" \
124
+ --infer_key_shift "autoshift"
125
+ ```
egs/svc/VitsSVC/exp_config.json ADDED
@@ -0,0 +1,162 @@
1
+ {
2
+ "base_config": "config/vitssvc.json",
3
+ "model_type": "VitsSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+
25
+ "f0_min": 50,
26
+ "f0_max": 1100,
27
+ // f0_bin in sovits
28
+ "pitch_bin": 256,
29
+ // filter_length in sovits
30
+ "n_fft": 2048,
31
+ // hop_length in sovits
32
+ "hop_size": 512,
33
+ // win_length in sovits
34
+ "win_size": 2048,
35
+ "segment_size": 8192,
36
+ "n_mel": 100,
37
+ "sample_rate": 44100,
38
+
39
+ // Config for features extraction
40
+ "extract_mel": true,
41
+ "extract_pitch": true,
42
+ "pitch_extractor": "parselmouth",
43
+ "extract_energy": false,
44
+ "extract_uv": true,
45
+ "extract_linear_spec": true,
46
+ "extract_audio": true,
47
+ // contentvec
48
+ "extract_contentvec_feature": true,
49
+ "contentvec_sample_rate": 16000,
50
+ "contentvec_batch_size": 1,
51
+ "contentvec_frameshift": 0.02,
52
+ // whisper
53
+ "extract_whisper_feature": true,
54
+ "whisper_sample_rate": 16000,
55
+ "whisper_frameshift": 0.01,
56
+ "whisper_downsample_rate": 2,
57
+ // Fill in the content-based pretrained model's path
58
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
59
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
60
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
61
+ "whisper_model": "medium",
62
+ "whisper_model_path": "pretrained/whisper/medium.pt",
63
+ // Config for features usage
64
+ "use_mel": true,
65
+ "use_frame_pitch": true,
66
+ "use_uv": true,
67
+ "use_spkid": true,
68
+ "use_contentvec": true,
69
+ "use_whisper": true,
70
+ "use_text": false,
71
+ "use_phone": false,
72
+
73
+ // Extract content features using dataloader
74
+ "pin_memory": true,
75
+ "num_workers": 8,
76
+ "content_feature_batch_size": 16,
77
+ // Meta file
78
+ "train_file": "train.json",
79
+ "valid_file": "test.json",
80
+ "spk2id": "singers.json",
81
+ "utt2spk": "utt2singer"
82
+ },
83
+ "model": {
84
+ "condition_encoder": {
85
+ // Config for features usage
86
+ "merge_mode": "add",
87
+ "input_melody_dim": 1,
88
+ "use_log_f0": true,
89
+ "n_bins_melody": 256,
90
+ //# Quantization (0 for not quantization)
91
+ "output_melody_dim": 192,
92
+
93
+ "use_contentvec": true,
94
+ "use_whisper": true,
95
+ "use_mert": false,
96
+ "use_wenet": false,
97
+ "whisper_dim": 1024,
98
+ "contentvec_dim": 256,
99
+ "content_encoder_dim": 192,
100
+ "output_singer_dim": 192,
101
+ "singer_table_size": 512,
102
+ "output_content_dim": 192,
103
+ "use_spkid": true,
104
+
105
+ "pitch_max": 1100.0,
106
+ "pitch_min": 50.0,
107
+ },
108
+ "vits": {
109
+ "inter_channels": 192,
110
+ "hidden_channels": 192,
111
+ "filter_channels": 256,
112
+ "n_heads": 2,
113
+ "n_layers": 6,
114
+ "kernel_size": 3,
115
+ "p_dropout": 0.1,
116
+ "ssl_dim": 256,
117
+ "n_flow_layer": 4,
118
+ "n_layers_q": 3,
119
+ "gin_channels": 256,
120
+ "n_speakers": 512,
121
+ "use_spectral_norm": false,
122
+ },
123
+ "generator": "nsfhifigan",
124
+ },
125
+ "train": {
126
+ "batch_size": 32,
127
+ "learning_rate": 2e-4,
128
+ "gradient_accumulation_step": 1,
129
+ "max_epoch": -1, // -1 means no limit
130
+ "save_checkpoint_stride": [
131
+ 3,
132
+ 50
133
+ ],
134
+ "keep_last": [
135
+ 3,
136
+ 2
137
+ ],
138
+ "run_eval": [
139
+ true,
140
+ true
141
+ ],
142
+ "adamw": {
143
+ "lr": 2.0e-4
144
+ },
145
+ "reducelronplateau": {
146
+ "factor": 0.8,
147
+ "patience": 30,
148
+ "min_lr": 1.0e-4
149
+ },
150
+ "dataloader": {
151
+ "num_worker": 8,
152
+ "pin_memory": true
153
+ },
154
+ "sampler": {
155
+ "holistic_shuffle": false,
156
+ "drop_last": true
157
+ }
158
+ },
159
+ "inference": {
160
+ "batch_size": 1,
161
+ }
162
+ }
egs/svc/VitsSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/_template/run.sh ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
39
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
+ # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
+ --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
+ --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
+ # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
+ --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
+ # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
46
+ --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
+ # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
+
50
+ --) shift ; break ;;
51
+ *) echo "Invalid option: $1"; exit 1 ;;
52
+ esac
53
+ done
54
+
55
+
56
+ ### Value check ###
57
+ if [ -z "$running_stage" ]; then
58
+ echo "[Error] Please specify the running stage"
59
+ exit 1
60
+ fi
61
+
62
+ if [ -z "$exp_config" ]; then
63
+ exp_config="${exp_dir}"/exp_config.json
64
+ fi
65
+ echo "Experimental Configuration File: $exp_config"
66
+
67
+ if [ -z "$gpu" ]; then
68
+ gpu="0"
69
+ fi
70
+
71
+ ######## Features Extraction ###########
72
+ if [ $running_stage -eq 1 ]; then
73
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
+ --config $exp_config \
75
+ --num_workers 4
76
+ fi
77
+
78
+ ######## Training ###########
79
+ if [ $running_stage -eq 2 ]; then
80
+ if [ -z "$exp_name" ]; then
81
+ echo "[Error] Please specify the experiments name"
82
+ exit 1
83
+ fi
84
+ echo "Experimental Name: $exp_name"
85
+
86
+ if [ "$resume" = true ]; then
87
+ echo "Automatically resume from the experimental dir..."
88
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
89
+ --config "$exp_config" \
90
+ --exp_name "$exp_name" \
91
+ --log_level info \
92
+ --resume
93
+ else
94
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
95
+ --config "$exp_config" \
96
+ --exp_name "$exp_name" \
97
+ --log_level info \
98
+ --resume_from_ckpt_path "$resume_from_ckpt_path" \
99
+ --resume_type "$resume_type"
100
+ fi
101
+ fi
102
+
103
+ ######## Inference/Conversion ###########
104
+ if [ $running_stage -eq 3 ]; then
105
+ if [ -z "$infer_expt_dir" ]; then
106
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
107
+ exit 1
108
+ fi
109
+
110
+ if [ -z "$infer_output_dir" ]; then
111
+ infer_output_dir="$infer_expt_dir/result"
112
+ fi
113
+
114
+ if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
115
+ echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
116
+ exit 1
117
+ fi
118
+
119
+ if [ -z "$infer_source_file" ]; then
120
+ infer_source=$infer_source_audio_dir
121
+ fi
122
+
123
+ if [ -z "$infer_source_audio_dir" ]; then
124
+ infer_source=$infer_source_file
125
+ fi
126
+
127
+ if [ -z "$infer_target_speaker" ]; then
128
+ echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
129
+ exit 1
130
+ fi
131
+
132
+ if [ -z "$infer_key_shift" ]; then
133
+ infer_key_shift="autoshift"
134
+ fi
135
+
136
+ if [ -z "$infer_vocoder_dir" ]; then
137
+ infer_vocoder_dir="$work_dir"/pretrained/bigvgan
138
+ echo "[Warning] You did not specify infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
139
+ fi
140
+
141
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
142
+ --config $exp_config \
143
+ --acoustics_dir $infer_expt_dir \
144
+ --vocoder_dir $infer_vocoder_dir \
145
+ --target_singer $infer_target_speaker \
146
+ --trans_key $infer_key_shift \
147
+ --source $infer_source \
148
+ --output_dir $infer_output_dir \
149
+ --log_level debug
150
+ fi
egs/tta/README.md ADDED
@@ -0,0 +1,19 @@
1
+ # Amphion Text-to-Audio (TTA) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](RECIPE.md)** to demonstrate how to train a cutting-edge TTA model. Specifically, it is designed as a latent diffusion model like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830).
6
+
7
+ ## Supported Model Architectures
8
+
9
+ So far, Amphion has supported the following latent-diffusion-based text-to-audio model:
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/tta/DiffusionTTA.png" width="65%">
14
+ </div>
15
+ <br>
16
+
17
+ Similar to [AUDIT](https://arxiv.org/abs/2304.00830), we implement it with two-stage training:
18
+ 1. Training the VAE which is called `AutoencoderKL` in Amphion.
19
+ 2. Training the conditional latent diffusion model which is called `AudioLDM` in Amphion.
egs/tta/RECIPE.md ADDED
@@ -0,0 +1,156 @@
1
+ # Text-to-Audio with Latent Diffusion Model
2
+
3
+ This is the quick tour for training a text-to-audio model with the popular and powerful generative model: the [Latent Diffusion Model](https://arxiv.org/abs/2112.10752). Specifically, this recipe is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper "[AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models](https://arxiv.org/abs/2304.00830)". You can check the last part of the [AUDIT demos](https://audit-demo.github.io/) to see some text-to-audio examples.
4
+
5
+ <br>
6
+ <div align="center">
7
+ <img src="../../imgs/tta/DiffusionTTA.png" width="65%">
8
+ </div>
9
+ <br>
10
+
11
+ We train this latent diffusion model in two stages:
12
+ 1. In the first stage, we aim to obtain a high-quality VAE (called `AutoencoderKL` in Amphion), so that we can project
13
+ the input mel-spectrograms into an efficient, low-dimensional latent space. Specifically, we train the VAE with a GAN loss to improve the reconstruction quality.
14
+ 2. In the second stage, we aim to obtain a text-controllable diffusion model (called `AudioLDM` in Amphion). We use a U-Net-based diffusion model, with a T5 encoder as the text encoder (see the configuration sketch after this list).
15
+
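+ These two stages correspond to two blocks under `model` in the provided `exp_config.json` files. A sketch of the key fields from `egs/tta/audioldm/exp_config.json` (the `autoencoderkl` block defines the stage-1 VAE, the `audioldm` block defines the stage-2 U-Net; the checkpoint path is a placeholder you fill in after stage 1):
+
+ ```json
+ "model": {
+     "autoencoderkl": {
+         "ch": 128,
+         "ch_mult": [1,1,2,2,4],
+         "in_channels": 1,
+         "z_channels": 4,
+         "out_ch": 1
+     },
+     "audioldm": {
+         "in_channels": 4,
+         "out_channels": 4,
+         "model_channels": 256,
+         "use_spatial_transformer": true,
+         "context_dim": 768
+     },
+     "autoencoder_path": "[Your stage-1 VAE checkpoint]"
+ },
+ ```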
16
+ There are four stages in total for training the text-to-audio model:
17
+
18
+ 1. Data preparation and processing
19
+ 2. Train the VAE model
20
+ 3. Train the latent diffusion model
21
+ 4. Inference
22
+
23
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
24
+ > ```bash
25
+ > cd Amphion
26
+ > ```
27
+
28
+ ## Overview
29
+
30
+ ```sh
31
+ # Train the VAE model
32
+ sh egs/tta/autoencoderkl/run_train.sh
33
+
34
+ # Train the latent diffusion model
35
+ sh egs/tta/audioldm/run_train.sh
36
+
37
+ # Inference
38
+ sh egs/tta/audioldm/run_inference.sh
39
+ ```
40
+
41
+ ## 1. Data preparation and processing
42
+
43
+ ### Dataset Download
44
+
45
+ We take [AudioCaps](https://audiocaps.github.io/) as an example. AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps).
46
+
47
+ <!-- How to download AudioCaps is detailed [here](../datasets/README.md) -->
48
+ <!-- You can downlaod the dataset [here](https://github.com/cdjkim/audiocaps). -->
49
+
50
+ ### Data Processing
51
+
52
+ - Download AudioCaps dataset to `[Your path to save tta dataset]` and modify `preprocess.processed_dir` in `egs/tta/.../exp_config.json`.
53
+
54
+ ```json
55
+ {
56
+ "dataset": [
57
+ "AudioCaps"
58
+ ],
59
+ "preprocess": {
60
+ // Specify the output root path to save the processed data
61
+ "processed_dir": "[Your path to save tta dataset]",
62
+ ...
63
+ }
64
+ }
65
+ ```
66
+
67
+ The folder structure of your downloaded data should be similar to:
68
+
69
+ ```plaintext
70
+ .../[Your path to save tta dataset]
71
+ ┣ AudioCaps
72
+ ┃   ┣ wav
73
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
74
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
75
+ ┃ ┃ ┣ ...
76
+ ```
77
+
78
+ - Then you may process the data into mel-spectrograms and save them in `.npy` format. If you use the data we provide, all the wav data has already been processed.
79
+
80
+ - Generate a json file to save the metadata; the json file looks like this:
81
+
82
+ ```json
83
+ [
84
+ {
85
+ "Dataset": "AudioCaps",
86
+ "Uid": "---1_cCGK4M_0_10000",
87
+ "Caption": "Idling car, train blows horn and passes"
88
+ },
89
+ {
90
+ "Dataset": "AudioCaps",
91
+ "Uid": "---lTs1dxhU_30000_40000",
92
+ "Caption": "A racing vehicle engine is heard passing by"
93
+ },
94
+ ...
95
+ ]
96
+ ```
97
+ - Finally, the folder structure looks like this:
98
+
99
+ ```plaintext
100
+ .../[Your path to save tta dataset]
101
+ ┣ AudioCaps
102
+ ┃   ┣ wav
103
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
104
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
105
+ ┃ ┃ ┣ ...
106
+ ┃   ┣ mel
107
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.npy
108
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.npy
109
+ ┃ ┃ ┣ ...
110
+ ┃   ┣ train.json
111
+ ┃   ┣ valid.json
112
+ ┃   ┣ ...
113
+ ```
114
+
115
+ ## 2. Training the VAE Model
116
+
117
+ The first-stage model is a VAE trained with a GAN loss (called `AutoencoderKL` in Amphion). Run the following command:
118
+
119
+ ```sh
120
+ sh egs/tta/autoencoderkl/run_train.sh
121
+ ```
122
+
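+ The experiment name, config file, and visible GPUs are set inside `egs/tta/autoencoderkl/run_train.sh`; edit them there if needed. A sketch of the variables as shipped:
+
+ ```sh
+ ######## Set Experiment Configuration ###########
+ exp_config="$exp_dir/exp_config.json"
+ exp_name="autoencoder_kl_debug"
+
+ num_workers=8
+ export CUDA_VISIBLE_DEVICES="0"
+ ```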
123
+ ## 3. Training the Latent Diffusion Model
124
+
125
+ The second-stage model is a conditional diffusion model with a T5 text encoder (called `AudioLDM` in Amphion). Run the following command:
126
+
127
+ ```sh
128
+ sh egs/tta/audioldm/run_train.sh
129
+ ```
130
+
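+ Before launching this stage, make sure `model.autoencoder_path` in `egs/tta/audioldm/exp_config.json` points to the VAE checkpoint produced by the previous stage, for example (the step/loss numbers in the filename will differ for your own run):
+
+ ```json
+ "model": {
+     ...
+     "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
+ },
+ ```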
131
+ ## 4. Inference
132
+
133
+ Now you can generate audio with your trained latent diffusion model. Run the following command and modify the `--text` argument to your own caption.
134
+
135
+ ```sh
136
+ sh egs/tta/audioldm/run_inference.sh \
137
+ --text "A man is whistling"
138
+ ```
139
+
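+ The checkpoint and vocoder paths used for inference are set inside `egs/tta/audioldm/run_inference.sh`; update them to your own checkpoints before running. A sketch of the variables as shipped (the step/loss numbers are examples):
+
+ ```sh
+ ######## Set Experiment Configuration ###########
+ exp_config="$exp_dir/exp_config.json"
+ checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt"
+ vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
+ vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
+ num_steps=200
+ guidance_scale=4.0
+ ```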
140
+ ## Citations
141
+
142
+ ```bibtex
143
+ @article{wang2023audit,
144
+ title={AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models},
145
+ author={Wang, Yuancheng and Ju, Zeqian and Tan, Xu and He, Lei and Wu, Zhizheng and Bian, Jiang and Zhao, Sheng},
146
+ journal={NeurIPS 2023},
147
+ year={2023}
148
+ }
149
+
150
+ @article{liu2023audioldm,
151
+ title={{AudioLDM}: Text-to-Audio Generation with Latent Diffusion Models},
152
+ author={Liu, Haohe and Chen, Zehua and Yuan, Yi and Mei, Xinhao and Liu, Xubo and Mandic, Danilo and Wang, Wenwu and Plumbley, Mark D},
153
+ journal={Proceedings of the International Conference on Machine Learning},
154
+ year={2023}
155
+ }
156
+ ```
egs/tta/audioldm/exp_config.json ADDED
@@ -0,0 +1,90 @@
1
+ {
2
+ "base_config": "egs/tta/audioldm/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+ // For example: "/home/TTADataset/processed_data"
10
+
11
+ // feature
12
+ "use_spkid": false,
13
+ "use_uv": false,
14
+ "use_frame_pitch": false,
15
+ "use_phone_pitch": false,
16
+ "use_frame_energy": false,
17
+ "use_phone_energy": false,
18
+ "use_mel": false,
19
+ "use_audio": false,
20
+ "use_label": false,
21
+ "use_one_hot": false,
22
+ // feature for text to audio
23
+ "use_caption": true,
24
+ "use_melspec": true,
25
+ "use_wav": false,
26
+ // feature dir
27
+ "melspec_dir": "mel",
28
+ "wav_dir": "wav"
29
+ },
30
+ // Specify the output root path to save model ckpts and logs
31
+ "log_dir": "ckpts/tta",
32
+ // For example: "/home/TTADataset/processed_data/logs"
33
+
34
+ // model
35
+ "model": {
36
+ "audioldm": {
37
+ "image_size": 32,
38
+ "in_channels": 4,
39
+ "out_channels": 4,
40
+ "model_channels": 256,
41
+ "attention_resolutions": [4, 2, 1],
42
+ "num_res_blocks": 2,
43
+ "channel_mult": [1, 2, 4],
44
+ "num_heads": 8,
45
+ "use_spatial_transformer": true,
46
+ "transformer_depth": 1,
47
+ "context_dim": 768,
48
+ "use_checkpoint": true,
49
+ "legacy": false
50
+ },
51
+ "autoencoderkl": {
52
+ "ch": 128,
53
+ "ch_mult": [1,1,2,2,4],
54
+ "num_res_blocks": 2,
55
+ "in_channels": 1,
56
+ "z_channels": 4,
57
+ "out_ch": 1,
58
+ "double_z": true
59
+ },
60
+ "noise_scheduler": {
61
+ "num_train_timesteps": 1000,
62
+ "beta_start": 0.00085,
63
+ "beta_end": 0.012,
64
+ "beta_schedule": "scaled_linear",
65
+ "clip_sample": false,
66
+ "steps_offset": 1,
67
+ "set_alpha_to_one": false,
68
+ "skip_prk_steps": true,
69
+ "prediction_type": "epsilon"
70
+ },
71
+ "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
72
+ },
73
+
74
+ // train
75
+ "train": {
76
+ "adam": {
77
+ "lr": 5.0e-5
78
+ },
79
+ "ddp": false,
80
+ "random_seed": 12345,
81
+ "batch_size": 12,
82
+ "epochs": 50000,
83
+ "max_steps": 1000000,
84
+ "total_training_steps": 800000,
85
+ "save_summary_steps": 1000,
86
+ "save_checkpoints_steps": 5000,
87
+ "valid_interval": 5000,
88
+ "keep_checkpoint_max": 100
89
+ }
90
+ }
egs/tta/audioldm/exp_config_base.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "base_config": "config/audioldm.json",
3
+ "model_type": "AudioLDM",
4
+ "dataset": [
5
+ "AudioCaps"
6
+ ],
7
+ "preprocess": {
8
+ "train_file": "train.json",
9
+ "valid_file": "valid.json"
10
+ }
11
+ }
egs/tta/audioldm/exp_config_latent_4_10_78.json ADDED
@@ -0,0 +1,88 @@
1
+ {
2
+ "base_config": "egs/tta/audioldm/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+
10
+ // feature
11
+ "use_spkid": false,
12
+ "use_uv": false,
13
+ "use_frame_pitch": false,
14
+ "use_phone_pitch": false,
15
+ "use_frame_energy": false,
16
+ "use_phone_energy": false,
17
+ "use_mel": false,
18
+ "use_audio": false,
19
+ "use_label": false,
20
+ "use_one_hot": false,
21
+ // feature for text to audio
22
+ "use_caption": true,
23
+ "use_melspec": true,
24
+ "use_wav": false,
25
+ // feature dir
26
+ "melspec_dir": "mel",
27
+ "wav_dir": "wav"
28
+ },
29
+ // Specify the output root path to save model ckpts and logs
30
+ "log_dir": "ckpts/tta",
31
+
32
+ // model
33
+ "model": {
34
+ "audioldm": {
35
+ "image_size": 32,
36
+ "in_channels": 4,
37
+ "out_channels": 4,
38
+ "model_channels": 256,
39
+ "attention_resolutions": [4, 2, 1],
40
+ "num_res_blocks": 2,
41
+ "channel_mult": [1, 2, 4],
42
+ "num_heads": 8,
43
+ "use_spatial_transformer": true,
44
+ "transformer_depth": 1,
45
+ "context_dim": 768,
46
+ "use_checkpoint": true,
47
+ "legacy": false
48
+ },
49
+ "autoencoderkl": {
50
+ "ch": 128,
51
+ "ch_mult": [1,2,2,4],
52
+ "num_res_blocks": 2,
53
+ "in_channels": 1,
54
+ "z_channels": 4,
55
+ "out_ch": 1,
56
+ "double_z": true
57
+ },
58
+ "noise_scheduler": {
59
+ "num_train_timesteps": 1000,
60
+ "beta_start": 0.00085,
61
+ "beta_end": 0.012,
62
+ "beta_schedule": "scaled_linear",
63
+ "clip_sample": false,
64
+ "steps_offset": 1,
65
+ "set_alpha_to_one": false,
66
+ "skip_prk_steps": true,
67
+ "prediction_type": "epsilon"
68
+ },
69
+ "autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt"
70
+ },
71
+
72
+ // train
73
+ "train": {
74
+ "adam": {
75
+ "lr": 2.0e-5
76
+ },
77
+ "ddp": false,
78
+ "random_seed": 12345,
79
+ "batch_size": 12,
80
+ "epochs": 50000,
81
+ "max_steps": 1000000,
82
+ "total_training_steps": 800000,
83
+ "save_summary_steps": 1000,
84
+ "save_checkpoints_steps": 5000,
85
+ "valid_interval": 5000,
86
+ "keep_checkpoint_max": 100
87
+ }
88
+ }
egs/tta/audioldm/run_inference.sh ADDED
@@ -0,0 +1,52 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config.json"
16
+ exp_name="audioldm_debug_latent_size_4_5_39"
17
+ checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt"
18
+ output_dir="$work_dir/temp"
19
+ vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
20
+ vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
21
+ num_steps=200
22
+ guidance_scale=4.0
23
+
24
+ export CUDA_VISIBLE_DEVICES="0"
25
+
26
+ ######## Parse Command Line Arguments ###########
27
+ while [[ $# -gt 0 ]]
28
+ do
29
+ key="$1"
30
+
31
+ case $key in
32
+ --text)
33
+ text="$2"
34
+ shift # past argument
35
+ shift # past value
36
+ ;;
37
+ *) # unknown option
38
+ shift # past argument
39
+ ;;
40
+ esac
41
+ done
42
+
43
+ ######## Run inference ###########
44
+ python "${work_dir}"/bins/tta/inference.py \
45
+ --config=$exp_config \
46
+ --checkpoint_path=$checkpoint_path \
47
+ --text="$text" \
48
+ --vocoder_path=$vocoder_path \
49
+ --vocoder_config_path=$vocoder_config_path \
50
+ --num_steps=$num_steps \
51
+ --guidance_scale=$guidance_scale \
52
+ --output_dir=$output_dir
egs/tta/audioldm/run_inference_latent_4_10_78.sh ADDED
@@ -0,0 +1,52 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config_v2.json"
16
+ exp_name="audioldm_debug_latent_size_4_10_78"
17
+ checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_10_78/checkpoints/step-0325000_loss-0.1936.pt"
18
+ output_dir="$work_dir/temp"
19
+ vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
20
+ vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
21
+ num_steps=200
22
+ guidance_scale=4.0
23
+
24
+ export CUDA_VISIBLE_DEVICES="0"
25
+
26
+ ######## Parse Command Line Arguments ###########
27
+ while [[ $# -gt 0 ]]
28
+ do
29
+ key="$1"
30
+
31
+ case $key in
32
+ --text)
33
+ text="$2"
34
+ shift # past argument
35
+ shift # past value
36
+ ;;
37
+ *) # unknown option
38
+ shift # past argument
39
+ ;;
40
+ esac
41
+ done
42
+
43
+ ######## Run inference ###########
44
+ python "${work_dir}"/bins/tta/inference.py \
45
+ --config=$exp_config \
46
+ --checkpoint_path=$checkpoint_path \
47
+ --text="A man is whistling" \
48
+ --vocoder_path=$vocoder_path \
49
+ --vocoder_config_path=$vocoder_config_path \
50
+ --num_steps=$num_steps \
51
+ --guidance_scale=$guidance_scale \
52
+ --output_dir=$output_dir \
egs/tta/audioldm/run_train.sh ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config.json"
16
+ exp_name="audioldm_debug_latent_size_4_5_39"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+ --config=$exp_config \
24
+ --num_workers=$num_workers \
25
+ --exp_name=$exp_name \
26
+ --stdout_interval=25 \
egs/tta/audioldm/run_train_latent_4_10_78.sh ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config_latent_4_10_78.json"
16
+ exp_name="audioldm_debug_latent_size_4_10_78"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+ --config=$exp_config \
24
+ --num_workers=$num_workers \
25
+ --exp_name=$exp_name \
26
+ --stdout_interval=25 \
egs/tta/autoencoderkl/exp_config.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "base_config": "egs/tta/autoencoderkl/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+
10
+ // feature
11
+ "use_spk": false,
12
+ "use_spkid": false,
13
+ "use_uv": false,
14
+ "use_frame_pitch": false,
15
+ "use_phone_pitch": false,
16
+ "use_frame_energy": false,
17
+ "use_phone_energy": false,
18
+ "use_mel": false,
19
+ "use_audio": false,
20
+ "use_label": false,
21
+ "use_one_hot": false,
22
+ // feature for text to audio
23
+ "use_caption": true,
24
+ "use_melspec": true,
25
+ "use_wav": false,
26
+ // feature dir
27
+ "melspec_dir": "mel",
28
+ "wav_dir": "wav"
29
+ },
30
+ // Specify the output root path to save model ckpts and logs
31
+ "log_dir": "ckpts/tta",
32
+
33
+ // train
34
+ "train": {
35
+ "adam": {
36
+ "lr": 4.0e-5
37
+ },
38
+ "ddp": false,
39
+ "random_seed": 12345,
40
+ "batch_size": 12,
41
+ "epochs": 50000,
42
+ "max_steps": 1000000,
43
+ "total_training_steps": 800000,
44
+ "save_summary_steps": 1000,
45
+ "save_checkpoints_steps": 5000,
46
+ "valid_interval": 5000,
47
+ "keep_checkpoint_max": 100
48
+ }
49
+ }
egs/tta/autoencoderkl/exp_config_base.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "base_config": "config/autoencoderkl.json",
3
+ "model_type": "AutoencoderKL",
4
+ "dataset": [
5
+ "AudioCaps"
6
+ ],
7
+ "preprocess": {
8
+ "train_file": "train.json",
9
+ "valid_file": "valid.json"
10
+ }
11
+ }
egs/tta/autoencoderkl/exp_config_latent_4_10_78.json ADDED
@@ -0,0 +1,59 @@
1
+ {
2
+ "base_config": "egs/tta/autoencoderkl/exp_config_base.json",
3
+ "dataset": [
4
+ "AudioCaps"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "data",
9
+
10
+ // feature
11
+ "use_spkid": false,
12
+ "use_uv": false,
13
+ "use_frame_pitch": false,
14
+ "use_phone_pitch": false,
15
+ "use_frame_energy": false,
16
+ "use_phone_energy": false,
17
+ "use_mel": false,
18
+ "use_audio": false,
19
+ "use_label": false,
20
+ "use_one_hot": false,
21
+ // feature for text to audio
22
+ "use_caption": true,
23
+ "use_melspec": true,
24
+ "use_wav": false,
25
+ // feature dir
26
+ "melspec_dir": "mel",
27
+ "wav_dir": "wav"
28
+ },
29
+ // Specify the output root path to save model ckpts and logs
30
+ "log_dir": "ckpts/tta",
31
+
32
+ "model": {
33
+ "autoencoderkl": {
34
+ "ch": 128,
35
+ "ch_mult": [1,2,2,4],
36
+ "num_res_blocks": 2,
37
+ "in_channels": 1,
38
+ "z_channels": 4,
39
+ "out_ch": 1,
40
+ "double_z": true
41
+ }
42
+ },
43
+ // train
44
+ "train": {
45
+ "adam": {
46
+ "lr": 4.0e-5
47
+ },
48
+ "ddp": false,
49
+ "random_seed": 12345,
50
+ "batch_size": 12,
51
+ "epochs": 50000,
52
+ "max_steps": 1000000,
53
+ "total_training_steps": 800000,
54
+ "save_summary_steps": 1000,
55
+ "save_checkpoints_steps": 5000,
56
+ "valid_interval": 5000,
57
+ "keep_checkpoint_max": 100
58
+ }
59
+ }
egs/tta/autoencoderkl/run_train.sh ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config.json"
16
+ exp_name="autoencoder_kl_debug"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+ --config=$exp_config \
24
+ --num_workers=$num_workers \
25
+ --exp_name=$exp_name \
26
+ --stdout_interval=25 \
egs/tta/autoencoderkl/run_train_latent_4_10_78.sh ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Set Experiment Configuration ###########
15
+ exp_config="$exp_dir/exp_config_latent_4_10_78.json"
16
+ exp_name="autoencoder_kl_debug_latent_size_4_10_78"
17
+
18
+ num_workers=8
19
+ export CUDA_VISIBLE_DEVICES="0"
20
+
21
+ ######## Train Model ###########
22
+ python "${work_dir}"/bins/tta/train_tta.py \
23
+ --config=$exp_config \
24
+ --num_workers=$num_workers \
25
+ --exp_name=$exp_name \
26
+ --stdout_interval=25 \
egs/tts/FastSpeech2/README.md ADDED
@@ -0,0 +1,132 @@
1
+
2
+ # FastSpeech2 Recipe
3
+
4
+ In this recipe, we will show how to train [FastSpeech2](https://openreview.net/forum?id=piLPYqxtWuA) using Amphion's infrastructure. FastSpeech2 is a non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
5
+
6
+ There are four stages in total:
7
+
8
+ 1. Data preparation
9
+ 2. Features extraction
10
+ 3. Training
11
+ 4. Inference
12
+
13
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
14
+ > ```bash
15
+ > cd Amphion
16
+ > ```
17
+
18
+ ## 1. Data Preparation
19
+
20
+ ### Dataset Download
21
+ You can use commonly used TTS datasets to train the TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend using LJSpeech to train a TTS model for the first time. How to download the datasets is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "LJSpeech",
30
+ ],
31
+ "dataset_path": {
32
+ // TODO: Fill in your dataset path
33
+ "LJSpeech": "[LJSpeech dataset path]",
34
+ },
35
+ ```
36
+
37
+ ## 2. Features Extraction
38
+
39
+ ### Configuration
40
+
41
+ Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:
42
+
43
+ ```json
44
+ // TODO: Fill in the output log path
45
+ "log_dir": "ckpts/tts",
46
+ "preprocess": {
47
+ // TODO: Fill in the output data path
48
+ "processed_dir": "data",
49
+ ...
50
+ },
51
+ ```
52
+
53
+ ### Run
54
+
55
+ Run `run.sh` as the preprocessing stage (set `--stage 1`):
56
+
57
+ ```bash
58
+ sh egs/tts/FastSpeech2/run.sh --stage 1
59
+ ```
60
+
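+ FastSpeech2 needs phoneme-level alignments, so the first run of this stage will also download and unpack the Montreal Forced Aligner (MFA) and its lexicon via `prepare_mfa.sh` if the `mfa/` folder does not exist yet. You can also prepare it manually beforehand:
+
+ ```bash
+ bash egs/tts/FastSpeech2/prepare_mfa.sh
+ ```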
61
+ ## 3. Training
62
+
63
+ ### Configuration
64
+
65
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
66
+
67
+ ```json
68
+ "train": {
69
+ "batch_size": 16,
70
+ }
71
+ ```
72
+
73
+ ### Run
74
+
75
+ Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `ckpts/tts/[YourExptName]`.
76
+
77
+ ```bash
78
+ sh egs/tts/FastSpeech2/run.sh --stage 2 --name [YourExptName]
79
+ ```
80
+
81
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.
82
+
83
+
84
+ ## 4. Inference
85
+
86
+ ### Configuration
87
+
88
+ For inference, you need to specify the following configurations when running `run.sh`:
89
+
90
+
91
+ | Parameters | Description | Example |
92
+ | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
93
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `ckpts/tts/[YourExptName]` |
94
+ | `--infer_output_dir` | The output directory to save inferred audios. | `ckpts/tts/[YourExptName]/result` |
95
+ | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
96
+ | `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. |
97
+ | `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For the LJSpeech dataset, the testing set would be "`test`", split from LJSpeech at the feature extraction stage, or "`golden_test`", cherry-picked from the test set as a template testing set. |
98
+ | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
99
+
100
+ ### Run
101
+ For example, if you want to generate speech for the whole testing set split from LJSpeech, just run:
102
+
103
+ ```bash
104
+ sh egs/tts/FastSpeech2/run.sh --stage 3 \
105
+ --infer_expt_dir ckpts/tts/[YourExptName] \
106
+ --infer_output_dir ckpts/tts/[YourExptName]/result \
107
+ --infer_mode "batch" \
108
+ --infer_dataset "LJSpeech" \
109
+ --infer_testing_set "test"
110
+ ```
111
+
112
+ Or, if you want to generate a single clip of speech from a given text, just run:
113
+
114
+ ```bash
115
+ sh egs/tts/FastSpeech2/run.sh --stage 3 \
116
+ --infer_expt_dir ckpts/tts/[YourExptName] \
117
+ --infer_output_dir ckpts/tts/[YourExptName]/result \
118
+ --infer_mode "single" \
119
+ --infer_text "This is a clip of generated speech with the given text from a TTS model."
120
+ ```
121
+
122
+ We will release a pre-trained FastSpeech2 model trained on LJSpeech, so that you can download the pre-trained model and generate speech following the above inference instructions.
123
+
124
+ ## Citations
+
125
+ ```bibtex
126
+ @inproceedings{ren2020fastspeech,
127
+ title={FastSpeech 2: Fast and High-Quality End-to-End Text to Speech},
128
+ author={Ren, Yi and Hu, Chenxu and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan},
129
+ booktitle={International Conference on Learning Representations},
130
+ year={2020}
131
+ }
132
+ ```
egs/tts/FastSpeech2/exp_config.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "base_config": "config/fs2.json",
3
+ "model_type": "FastSpeech2",
4
+ "dataset": [
5
+ "LJSpeech"
6
+ ],
7
+ "dataset_path": {
8
+ // TODO: Fill in your dataset path
9
+ "LJSpeech": "[LJSpeech dataset path]"
10
+ },
11
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
12
+ "log_dir": "ckpts/tts",
13
+ "preprocess": {
14
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
15
+ "processed_dir": "data",
16
+ "sample_rate": 22050,
17
+ },
18
+ "train": {
19
+ "batch_size": 16,
20
+ }
21
+ }
egs/tts/FastSpeech2/prepare_mfa.sh ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ #!/bin/bash
7
+ mkdir mfa
8
+ cd mfa
9
+ wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.1.0-beta.2/montreal-forced-aligner_linux.tar.gz
10
+ tar -zxvf montreal-forced-aligner_linux.tar.gz
11
+ cd mfa
12
+ mkdir lexicon
13
+ cd lexicon
14
+ wget http://www.openslr.org/resources/11/librispeech-lexicon.txt
egs/tts/FastSpeech2/run.sh ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ cd $work_dir/modules/monotonic_align
15
+ mkdir -p monotonic_align
16
+ python setup.py build_ext --inplace
17
+ cd $work_dir
18
+
19
+ mfa_dir=$work_dir/mfa
20
+ echo $mfa_dir
21
+
22
+ ######## Parse the Given Parameters from the Command ###########
23
+ # options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir:,name:,stage: -- "$@")
24
+ options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,infer_output_dir:,infer_mode:,infer_dataset:,infer_testing_set:,infer_text:,name:,stage: -- "$@")
25
+ eval set -- "$options"
26
+
27
+ while true; do
28
+ case $1 in
29
+ # Experimental Configuration File
30
+ -c | --config) shift; exp_config=$1 ; shift ;;
31
+ # Experimental Name
32
+ -n | --name) shift; exp_name=$1 ; shift ;;
33
+ # Running Stage
34
+ -s | --stage) shift; running_stage=$1 ; shift ;;
35
+ # Visible GPU machines. The default value is "0".
36
+ --gpu) shift; gpu=$1 ; shift ;;
37
+
38
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
39
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
40
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
41
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
42
+ # [Only for Inference] The inference mode. It can be "batch" to generate speech in batches, or "single" to generate a single clip of speech.
43
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
44
+ # [Only for Inference] The inference dataset. It is only used when the inference mode is "batch".
45
+ --infer_dataset) shift; infer_dataset=$1 ; shift ;;
46
+ # [Only for Inference] The inference testing set. It is only used when the inference mode is "batch". It can be the "test" set split from the dataset, or "golden_test" carefully selected from the testing set.
47
+ --infer_testing_set) shift; infer_testing_set=$1 ; shift ;;
48
+ # [Only for Inference] The text to be synthesized. It is only used when the inference mode is "single".
49
+ --infer_text) shift; infer_text=$1 ; shift ;;
50
+
51
+ --) shift ; break ;;
52
+ *) echo "Invalid option: $1"; exit 1 ;;
53
+ esac
54
+ done
55
+
56
+
57
+ ### Value check ###
58
+ if [ -z "$running_stage" ]; then
59
+ echo "[Error] Please specify the running stage"
60
+ exit 1
61
+ fi
62
+
63
+ if [ -z "$exp_config" ]; then
64
+ exp_config="${exp_dir}"/exp_config.json
65
+ fi
66
+ echo "Experimental Configuration File: $exp_config"
67
+
68
+ if [ -z "$gpu" ]; then
69
+ gpu="0"
70
+ fi
71
+
72
+ ######## Features Extraction ###########
73
+ if [ $running_stage -eq 1 ]; then
74
+ if [ ! -d "$mfa_dir" ]; then
75
+ bash ${exp_dir}/prepare_mfa.sh
76
+ fi
77
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \
78
+ --config=$exp_config \
79
+ --num_workers=4 \
80
+ --prepare_alignment=true
81
+ fi
82
+
83
+ ######## Training ###########
84
+ if [ $running_stage -eq 2 ]; then
85
+ if [ -z "$exp_name" ]; then
86
+ echo "[Error] Please specify the experiments name"
87
+ exit 1
88
+ fi
89
+ echo "Experimental Name: $exp_name"
90
+
91
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/tts/train.py \
92
+ --config $exp_config \
93
+ --exp_name $exp_name \
94
+ --log_level debug
95
+ fi
96
+
97
+ ######## Inference ###########
98
+ if [ $running_stage -eq 3 ]; then
99
+ if [ -z "$infer_expt_dir" ]; then
100
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
101
+ exit 1
102
+ fi
103
+
104
+ if [ -z "$infer_output_dir" ]; then
105
+ infer_output_dir="$infer_expt_dir/result"
106
+ fi
107
+
108
+ if [ -z "$infer_mode" ]; then
109
+ echo "[Error] Please specify the inference mode, e.g., \"batch\", \"single\""
110
+ exit 1
111
+ fi
112
+
113
+ if [ "$infer_mode" = "batch" ] && [ -z "$infer_dataset" ]; then
114
+ echo "[Error] Please specify the dataset used in inference when the inference mode is batch"
115
+ exit 1
116
+ fi
117
+
118
+ if [ "$infer_mode" = "batch" ] && [ -z "$infer_testing_set" ]; then
119
+ echo "[Error] Please specify the testing set used in inference when the inference mode is batch"
120
+ exit 1
121
+ fi
122
+
123
+ if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then
124
+ echo "[Error] Please specify the text to be synthesized when the inference mode is single"
125
+ exit 1
126
+ fi
127
+
128
+ if [ "$infer_mode" = "single" ]; then
129
+ echo 'Text: ' ${infer_text}
130
+ infer_dataset=None
131
+ infer_testing_set=None
132
+ elif [ "$infer_mode" = "batch" ]; then
133
+ infer_text=''
134
+ fi
135
+
136
+
137
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
138
+ --config $exp_config \
139
+ --acoustics_dir $infer_expt_dir \
140
+ --output_dir $infer_output_dir \
141
+ --mode $infer_mode \
142
+ --dataset $infer_dataset \
143
+ --testing_set $infer_testing_set \
144
+ --text "$infer_text" \
145
+ --log_level debug \
146
+ --vocoder_dir /mntnfs/lee_data1/chenxi/processed_data/ljspeech/model_ckpt/hifigan/checkpoints
147
+
148
+
149
+
150
+ fi
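Putting the three stages of this script together, here is a minimal usage sketch, assuming the Amphion root as the working directory; `[YourExptName]` and the experiment path are placeholders, and the hard-coded `--vocoder_dir` above still has to be adapted to your own HiFi-GAN checkpoints:

```bash
# Stage 1: MFA preparation + feature extraction
sh egs/tts/FastSpeech2/run.sh --stage 1

# Stage 2: training
sh egs/tts/FastSpeech2/run.sh --stage 2 --name [YourExptName]

# Stage 3: single-clip inference
sh egs/tts/FastSpeech2/run.sh --stage 3 \
    --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
    --infer_mode "single" \
    --infer_text "This is a clip of generated speech with the given text from a TTS model."
```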
egs/tts/NaturalSpeech2/exp_config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "base_config": "egs/tts/NaturalSpeech2/exp_config_base.json",
3
+ "dataset": [
4
+ "LibriTTS"
5
+ ],
6
+ "preprocess": {
7
+ // Specify the output root path to save the processed data
8
+ "processed_dir": "[LibriTTS dataset path]",
9
+ "train_file": "train.json",
10
+ "valid_file": "test.json",
11
+ "read_metadata": true,
12
+ "metadata_dir": "metadata"
13
+ },
14
+ // Specify the output root path to save model ckpts and logs
15
+ "log_dir": "ckpts/tts",
16
+ "train": {
17
+ // New trainer and Accelerator
18
+ "gradient_accumulation_step": 1,
19
+ "tracker": ["tensorboard"],
20
+ "max_epoch": 5000,
21
+ "save_checkpoint_stride": [1],
22
+ "keep_last": [1000],
23
+ "run_eval": [true],
24
+ "dataloader": {
25
+ "num_worker": 16,
26
+ "pin_memory": true
27
+ },
28
+ "adam": {
29
+ "lr": 1.0e-4
30
+ },
31
+ "use_dynamic_batchsize": true,
32
+ "batch_size": 8,
33
+ "max_tokens": 7500,
34
+ "max_sentences": 32,
35
+ "lr_warmup_steps": 5000,
36
+ "lr_scheduler": "cosine",
37
+ "num_train_steps": 800000
38
+ }
39
+ }
egs/tts/NaturalSpeech2/exp_config_base.json ADDED
@@ -0,0 +1,118 @@
1
+ {
2
+ "base_config": "config/ns2.json",
3
+ "model_type": "NaturalSpeech2",
4
+ "dataset": [
5
+ "LibriTTS"
6
+ ],
7
+ "preprocess": {
8
+ "use_mel": false,
9
+ "use_code": true,
10
+ "use_spkid": true,
11
+ "use_pitch": true,
12
+ "use_duration": true,
13
+ "use_phone": true,
14
+ "use_len": true,
15
+ "use_cross_reference": true,
16
+ "train_file": "train.json",
17
+ "valid_file": "test.json",
18
+ "melspec_dir": "mel",
19
+ "code_dir": "code",
20
+ "pitch_dir": "pitch",
21
+ "duration_dir": "duration",
22
+ "metadata_dir": "metadata",
23
+ "read_metadata": true,
24
+ "clip_mode": "start"
25
+ },
26
+ "model": {
27
+ "latent_dim": 128,
28
+ "prior_encoder": {
29
+ "vocab_size": 100,
30
+ "pitch_min": 50,
31
+ "pitch_max": 1100,
32
+ "pitch_bins_num": 512,
33
+ "encoder": {
34
+ "encoder_layer": 6,
35
+ "encoder_hidden": 512,
36
+ "encoder_head": 8,
37
+ "conv_filter_size": 2048,
38
+ "conv_kernel_size": 9,
39
+ "encoder_dropout": 0.2,
40
+ "use_cln": true
41
+ },
42
+ "duration_predictor": {
43
+ "input_size": 512,
44
+ "filter_size": 512,
45
+ "kernel_size": 3,
46
+ "conv_layers": 30,
47
+ "cross_attn_per_layer": 3,
48
+ "attn_head": 8,
49
+ "drop_out": 0.5
50
+ },
51
+ "pitch_predictor": {
52
+ "input_size": 512,
53
+ "filter_size": 512,
54
+ "kernel_size": 5,
55
+ "conv_layers": 30,
56
+ "cross_attn_per_layer": 3,
57
+ "attn_head": 8,
58
+ "drop_out": 0.5
59
+ }
60
+ },
61
+ "diffusion": {
62
+ "wavenet": {
63
+ "input_size": 128,
64
+ "hidden_size": 512,
65
+ "out_size": 128,
66
+ "num_layers": 40,
67
+ "cross_attn_per_layer": 3,
68
+ "dilation_cycle": 2,
69
+ "attn_head": 8,
70
+ "drop_out": 0.2
71
+ },
72
+ "beta_min": 0.05,
73
+ "beta_max": 20,
74
+ "sigma": 1.0,
75
+ "noise_factor": 1.0,
76
+ "ode_solver": "euler",
77
+ "diffusion_type": "diffusion"
78
+ },
79
+ "prompt_encoder": {
80
+ "encoder_layer": 6,
81
+ "encoder_hidden": 512,
82
+ "encoder_head": 8,
83
+ "conv_filter_size": 2048,
84
+ "conv_kernel_size": 9,
85
+ "encoder_dropout": 0.2,
86
+ "use_cln": false
87
+ },
88
+ "query_emb": {
89
+ "query_token_num": 32,
90
+ "hidden_size": 512,
91
+ "head_num": 8
92
+ },
93
+ "inference_step": 500
94
+ },
95
+ "train": {
96
+ "use_dynamic_batchsize": true,
97
+ "max_tokens": 7500,
98
+ "max_sentences": 32,
99
+ "lr_warmup_steps": 5000,
100
+ "lr_scheduler": "cosine",
101
+ "num_train_steps": 800000,
102
+ "adam": {
103
+ "lr": 7.5e-5
104
+ },
105
+ "diff_ce_loss_lambda": 0.5,
106
+ "diff_noise_loss_lambda": 1.0,
107
+ "ddp": false,
108
+ "random_seed": 114,
109
+ "batch_size": 32,
110
+ "epochs": 5000,
111
+ "max_steps": 1000000,
112
+ "total_training_steps": 800000,
113
+ "save_summary_steps": 500,
114
+ "save_checkpoints_steps": 2000,
115
+ "valid_interval": 2000,
116
+ "keep_checkpoint_max": 100
117
+ }
118
+ }
egs/tts/NaturalSpeech2/run_inference.sh ADDED
@@ -0,0 +1,43 @@
1
+ ######## Build Experiment Environment ###########
2
+ exp_dir=$(cd `dirname $0`; pwd)
3
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
4
+
5
+ export WORK_DIR=$work_dir
6
+ export PYTHONPATH=$work_dir
7
+ export PYTHONIOENCODING=UTF-8
8
+
9
+ ######## Set Experiment Configuration ###########
10
+ exp_config="$exp_dir/exp_config.json"
11
+ exp_name="ns2_libritts"
12
+ ref_audio="$work_dir/egs/tts/NaturalSpeech2/prompt_example/ref_audio.wav"
13
+ checkpoint_path="$work_dir/ckpts/tts/ns2_libritts/checkpoint/epoch-0065_step-0376136_loss-7.126379"
14
+ output_dir="$work_dir/output"
15
+ mode="single"
16
+
17
+ export CUDA_VISIBLE_DEVICES="0"
18
+
19
+ ######## Parse Command Line Arguments ###########
20
+ while [[ $# -gt 0 ]]
21
+ do
22
+ key="$1"
23
+
24
+ case $key in
25
+ --text)
26
+ text="$2"
27
+ shift # past argument
28
+ shift # past value
29
+ ;;
30
+ *) # unknown option
31
+ shift # past argument
32
+ ;;
33
+ esac
34
+ done
35
+
36
+ ######## Inference ###########
37
+ python "${work_dir}"/bins/tts/inference.py \
38
+ --config=$exp_config \
39
+ --text="$text" \
40
+ --mode=$mode \
41
+ --checkpoint_path=$checkpoint_path \
42
+ --ref_audio=$ref_audio \
43
+ --output_dir=$output_dir
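A hedged usage sketch: the hard-coded `checkpoint_path` and `ref_audio` above must first be pointed at an existing NaturalSpeech2 checkpoint and reference wav, after which the script only takes the text to synthesize:

```bash
# Hypothetical invocation from the Amphion root; the sentence is just an example.
sh egs/tts/NaturalSpeech2/run_inference.sh --text "This is a clip of generated speech with the given text from a TTS model."
```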
egs/tts/NaturalSpeech2/run_train.sh ADDED
@@ -0,0 +1,18 @@
1
+ ######## Build Experiment Environment ###########
2
+ exp_dir=$(cd `dirname $0`; pwd)
3
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
4
+
5
+ export WORK_DIR=$work_dir
6
+ export PYTHONPATH=$work_dir
7
+ export PYTHONIOENCODING=UTF-8
8
+
9
+ ######## Set Experiment Configuration ###########
10
+ exp_config="$exp_dir/exp_config.json"
11
+ exp_name="ns2_libritts"
12
+
13
+ ######## Train Model ###########
14
+ CUDA_VISIBLE_DEVICES="0" accelerate launch \
15
+ "${work_dir}"/bins/tts/train.py \
16
+ --config=$exp_config \
17
+ --exp_name=$exp_name \
18
+ --log_level debug
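A hedged usage sketch, assuming `exp_config.json` above already points at your processed LibriTTS data:

```bash
# Hypothetical invocation from the Amphion root; the GPU is selected inside the script
# via CUDA_VISIBLE_DEVICES.
sh egs/tts/NaturalSpeech2/run_train.sh
```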
egs/tts/README.md ADDED
@@ -0,0 +1,17 @@
1
+
2
+ # Amphion Text-to-Speech (TTS) Recipe
3
+
4
+ ## Quick Start
5
+
6
+ We provide a **[beginner recipe](VALLE/)** to demonstrate how to train a cutting-edge TTS model. Specifically, it is Amphion's re-implementation of [VALL-E](https://arxiv.org/abs/2301.02111), a zero-shot TTS architecture that uses a neural codec language model with discrete codes.
7
+
8
+ ## Supported Model Architectures
9
+
10
+ So far, Amphion TTS supports the following models and architectures:
11
+ - **[FastSpeech2](FastSpeech2)**: A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
12
+ - **[VITS](VITS)**: An end-to-end TTS architecture that utilizes a conditional variational autoencoder with adversarial learning.
13
+ - **[VALL-E](VALLE)**: A zero-shot TTS architecture that uses a neural codec language model with discrete codes.
14
+ - **[NaturalSpeech2](NaturalSpeech2)** (👨‍💻 developing): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices.
15
+
16
+ ## Amphion TTS Demo
17
+ Here are some [TTS samples](https://openhlt.github.io/Amphion_TTS_Demo/) from Amphion.
egs/tts/VALLE/README.md ADDED
@@ -0,0 +1,139 @@
1
+ # VALL-E Recipe
2
+
3
+ In this recipe, we will show how to train [VALL-E](https://arxiv.org/abs/2301.02111) using Amphion's infrastructure. VALL-E is a zero-shot TTS architecture that uses a neural codec language model with discrete codes.
4
+
5
+ There are four stages in total:
6
+
7
+ 1. Data preparation
8
+ 2. Features extraction
9
+ 3. Training
10
+ 4. Inference
11
+
12
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
13
+ > ```bash
14
+ > cd Amphion
15
+ > ```
16
+
17
+ ## 1. Data Preparation
18
+
19
+ ### Dataset Download
20
+ You can use commonly used TTS datasets to train the VALL-E model, e.g., LibriTTS. We strongly recommend using LibriTTS when training VALL-E for the first time. How to download the datasets is detailed [here](../../datasets/README.md).
21
+
22
+ ### Configuration
23
+
24
+ After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
25
+
26
+ ```json
27
+ "dataset": [
28
+ "libritts",
29
+ ],
30
+ "dataset_path": {
31
+ // TODO: Fill in your dataset path
32
+ "libritts": "[LibriTTS dataset path]",
33
+ },
34
+ ```
35
+
36
+ ## 2. Features Extraction
37
+
38
+ ### Configuration
39
+
40
+ Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:
41
+
42
+ ```json
43
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
44
+ "log_dir": "ckpts/tts",
45
+ "preprocess": {
46
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
47
+ "processed_dir": "data",
48
+ ...
49
+ },
50
+ ```
51
+
52
+ ### Run
53
+
54
+ Run the `run.sh` as the preprocessing stage (set `--stage 1`):
55
+
56
+ ```bash
57
+ sh egs/tts/VALLE/run.sh --stage 1
58
+ ```
59
+
60
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "1"`.
61
+
62
+
63
+ ## 3. Training
64
+
65
+ ### Configuration
66
+
67
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
68
+
69
+ ```
70
+ "train": {
71
+ "batch_size": 4,
72
+ }
73
+ ```
74
+
75
+ ### Run
76
+
77
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
78
+
79
+ Specifically, VALL-E needs to train an autoregressive (AR) model first and then a non-autoregressive (NAR) model. So, you can set `--model_train_stage 1` to train the AR model, and set `--model_train_stage 2` to train the NAR model, where `--ar_model_ckpt_dir` should be set to the checkpoint path of the trained AR model.
80
+
81
+
82
+ To train an AR model, just run:
83
+
84
+ ```bash
85
+ sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName]
86
+ ```
87
+
88
+ To train a NAR model, just run:
89
+ ```bash
90
+ sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]
91
+ ```
92
+ <!-- > **NOTE:** To train a NAR model, `--checkpoint_path` should be set as the ckeckpoint path to the trained AR model. -->
93
+
94
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.
95
+
96
+
97
+ ## 4. Inference
98
+
99
+ ### Configuration
100
+
101
+ For inference, you need to specify the following configurations when running `run.sh`:
102
+
103
+
104
+
105
+ | Parameters | Description | Example |
106
+ | --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
107
+ | `--infer_expt_dir` | The experimental directory of NAR model which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` |
108
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` |
109
+ | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
110
+ | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
111
+ | `--infer_text_prompt` | The text prompt for inference. | The text prompt should be aligned with the audio prompt. |
112
+ | `--infer_audio_prompt` | The audio prompt for inference. | The audio prompt should be aligned with the text prompt.|
113
+ | `--test_list_file` | The test list file used for batch inference. | The format of the test list file is `text\|text_prompt\|audio_prompt` (an example is sketched after this table).|
114
+
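To make the batch mode concrete, here is a hedged sketch of a test list file in the `text|text_prompt|audio_prompt` format from the table above, together with the corresponding invocation; `test_list.txt` and its single entry are made up for illustration and reuse the prompt example referenced in this recipe:

```bash
# Hypothetical test list: one utterance per line, fields separated by "|".
cat > egs/tts/VALLE/test_list.txt <<'EOF'
This is a clip of generated speech with the given text from a TTS model.|But even the unsuccessful dramatist has his moments.|egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav
EOF

sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
    --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
    --infer_mode "batch" \
    --infer_test_list_file egs/tts/VALLE/test_list.txt
```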
115
+
116
+ ### Run
117
+ For example, if you want to generate a single clip of speech, just run:
118
+
119
+ ```bash
120
+ sh egs/tts/VALLE/run.sh --stage 3 --gpu "0" \
121
+ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
122
+ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
123
+ --infer_mode "single" \
124
+ --infer_text "This is a clip of generated speech with the given text from a TTS model." \
125
+ --infer_text_prompt "But even the unsuccessful dramatist has his moments." \
126
+ --infer_audio_prompt egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav
127
+ ```
128
+
129
+
130
+ We have released a pre-trained Amphion VALL-E model, so you can download it [here](https://huggingface.co/amphion/valle-libritts) and generate speech following the inference instructions above.
131
+
132
+ ```bibtex
133
+ @article{wang2023neural,
134
+ title={Neural codec language models are zero-shot text to speech synthesizers},
135
+ author={Wang, Chengyi and Chen, Sanyuan and Wu, Yu and Zhang, Ziqiang and Zhou, Long and Liu, Shujie and Chen, Zhuo and Liu, Yanqing and Wang, Huaming and Li, Jinyu and others},
136
+ journal={arXiv preprint arXiv:2301.02111},
137
+ year={2023}
138
+ }
139
+ ```
egs/tts/VALLE/exp_config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "base_config": "config/valle.json",
3
+ "model_type": "VALLE",
4
+ "dataset": [
5
+ "libritts"
6
+ ],
7
+ "dataset_path": {
8
+ "libritts": "[LibriTTS dataset path]"
9
+ },
10
+ "preprocess": {
11
+ "extract_phone": true,
12
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
13
+ "extract_acoustic_token": true,
14
+ "use_phone": true,
15
+ "use_acoustic_token": true,
16
+ "processed_dir": "Amphion/data/",
17
+ "sample_rate": 24000, // "Audio sampling rate."
18
+ "codec_hop_size": "320", // "Audio codec hop size."
19
+ "valid_file": "test.json",
20
+ },
21
+ "model": {
22
+ "prefix_mode": 1, // "The mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance.",
23
+ },
24
+ "log_dir": "Amphion/ckpts/tts/valle",
25
+ "train": {
26
+ "batch_size": 4,
27
+ "train_stage": 1, // 0: train all modules; for VALL-E, 1: AR Decoder, 2: NAR Decoder(s)
28
+ "max_epoch": 20, // "Number of epochs to train."
29
+ "use_dynamic_batchsize": true, // If use dynamic batch size
30
+ "max_tokens": 4000, // If use dynamic batch size
31
+ "max_sentences": 10 // If use dynamic batch size
32
+ }
33
+ }
egs/tts/VALLE/prompt_examples/260_123440_000010_000004.normalized.txt ADDED
@@ -0,0 +1 @@
1
+ I almost think I can remember feeling a little different.
egs/tts/VALLE/prompt_examples/5142_33396_000002_000004.normalized.txt ADDED
@@ -0,0 +1 @@
1
+ Ten sons sat at meat with him, and I was the youngest.
egs/tts/VALLE/prompt_examples/6829_68771_000027_000000.normalized.txt ADDED
@@ -0,0 +1 @@
1
+ The girl entered, and gave an involuntary cry of surprise.
egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.normalized.txt ADDED
@@ -0,0 +1 @@
1
+ But even the unsuccessful dramatist has his moments.
egs/tts/VALLE/run.sh ADDED
@@ -0,0 +1,158 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ cd $work_dir/modules/monotonic_align
15
+ mkdir -p monotonic_align
16
+ python setup.py build_ext --inplace
17
+ cd $work_dir
18
+
19
+ ######## Parse the Given Parameters from the Command ###########
20
+ options=$(getopt -o c:n:s --long gpu:,config:,infer_expt_dir:,ar_model_ckpt_dir:,infer_output_dir:,infer_mode:,infer_test_list_file:,infer_text:,infer_text_prompt:,infer_audio_prompt:,model_train_stage:,name:,stage: -- "$@")
21
+ eval set -- "$options"
22
+
23
+ while true; do
24
+ case $1 in
25
+ # Experimental Configuration File
26
+ -c | --config) shift; exp_config=$1 ; shift ;;
27
+ # Experimental Name
28
+ -n | --name) shift; exp_name=$1 ; shift ;;
29
+ # Running Stage
30
+ -s | --stage) shift; running_stage=$1 ; shift ;;
31
+ # Visible GPU machines. The default value is "0".
32
+ --gpu) shift; gpu=$1 ; shift ;;
33
+
34
+ # [Only for Training] Model training stage.
35
+ --model_train_stage) shift; model_train_stage=$1 ; shift ;;
36
+ # [Only for Training] The stage1 ckpt dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+ --ar_model_ckpt_dir) shift; ar_model_ckpt_dir=$1 ; shift ;;
38
+
39
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
40
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
41
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
42
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
43
+
44
+ # [Only for Inference] The inference mode. It can be "batch" to generate speech by batch, or "single" to generate a single clip of speech.
45
+ --infer_mode) shift; infer_mode=$1 ; shift ;;
46
+ # [Only for Inference] The inference test list file. It is only used when the inference mode is "batch".
47
+ --infer_test_list_file) shift; infer_test_list_file=$1 ; shift ;;
48
+ # [Only for Inference] The text to be synthesized. It is only used when the inference mode is "single".
49
+ --infer_text) shift; infer_text=$1 ; shift ;;
50
+ # [Only for Inference] The inference text prompt. It is only used when the inference mode is "single".
51
+ --infer_text_prompt) shift; infer_text_prompt=$1 ; shift ;;
52
+ # [Only for Inference] The inference audio prompt. It is only used when the inference mode is "single".
53
+ --infer_audio_prompt) shift; infer_audio_prompt=$1 ; shift ;;
54
+
55
+ --) shift ; break ;;
56
+ *) echo "Invalid option: $1"; exit 1 ;;
57
+ esac
58
+ done
59
+
60
+
61
+ ### Value check ###
62
+ if [ -z "$running_stage" ]; then
63
+ echo "[Error] Please specify the running stage"
64
+ exit 1
65
+ fi
66
+
67
+ if [ -z "$exp_config" ]; then
68
+ exp_config="${exp_dir}"/exp_config.json
69
+ fi
70
+ echo "Experimental Configuration File: $exp_config"
71
+
72
+ if [ -z "$gpu" ]; then
73
+ gpu="0"
74
+ fi
75
+
76
+ ######## Features Extraction ###########
77
+ if [ $running_stage -eq 1 ]; then
78
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/tts/preprocess.py \
79
+ --config=$exp_config \
80
+ --num_workers=4
81
+ fi
82
+
83
+ ######## Training ###########
84
+ if [ $running_stage -eq 2 ]; then
85
+ if [ -z "$exp_name" ]; then
86
+ echo "[Error] Please specify the experiments name"
87
+ exit 1
88
+ fi
89
+
90
+ if [ "$model_train_stage" = "2" ] && [ -z "$ar_model_ckpt_dir" ]; then
91
+ echo "[Error] Please specify the checkpoint path to the model trained in stage 1."
92
+ exit 1
93
+ fi
94
+
95
+ if [ "$model_train_stage" = "1" ]; then
96
+ ar_model_ckpt_dir=None
97
+ fi
98
+
99
+ echo "Experimental Name: $exp_name"
100
+
101
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch --main_process_port 29510 \
102
+ "${work_dir}"/bins/tts/train.py \
103
+ --config $exp_config \
104
+ --exp_name $exp_name \
105
+ --log_level debug \
106
+ --train_stage $model_train_stage \
107
+ --checkpoint_path $ar_model_ckpt_dir
108
+ fi
109
+
110
+
111
+ ######## Inference ###########
112
+ if [ $running_stage -eq 3 ]; then
113
+ if [ -z "$infer_expt_dir" ]; then
114
+ echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
115
+ exit 1
116
+ fi
117
+
118
+ if [ -z "$infer_output_dir" ]; then
119
+ infer_output_dir="$infer_expt_dir/result"
120
+ fi
121
+
122
+ if [ -z "$infer_mode" ]; then
123
+ echo "[Error] Please specify the inference mode, e.g., \"batch\", \"single\""
124
+ exit 1
125
+ fi
126
+
127
+ if [ "$infer_mode" = "batch" ] && [ -z "$infer_test_list_file" ]; then
128
+ echo "[Error] Please specify the test list file used in inference when the inference mode is batch"
129
+ exit 1
130
+ fi
131
+
132
+ if [ "$infer_mode" = "single" ] && [ -z "$infer_text" ]; then
133
+ echo "[Error] Please specify the text to be synthesized when the inference mode is single"
134
+ exit 1
135
+ fi
136
+
137
+ if [ "$infer_mode" = "single" ]; then
138
+ echo 'Text: ' ${infer_text}
139
+ infer_test_list_file=None
140
+ elif [ "$infer_mode" = "batch" ]; then
141
+ infer_text=""
142
+ infer_text_prompt=""
143
+ infer_audio_prompt=""
144
+ fi
145
+
146
+
147
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
148
+ --config $exp_config \
149
+ --log_level debug \
150
+ --acoustics_dir $infer_expt_dir \
151
+ --output_dir $infer_output_dir \
152
+ --mode $infer_mode \
153
+ --text "$infer_text" \
154
+ --text_prompt "$infer_text_prompt" \
155
+ --audio_prompt $infer_audio_prompt \
156
+ --test_list_file $infer_test_list_file
157
+
158
+ fi
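For reference, a hedged end-to-end sketch that chains the stages of this script, mirroring the commands in the README above; `[YourExptName]` and `[ARModelPath]` are placeholders:

```bash
sh egs/tts/VALLE/run.sh --stage 1                                                 # feature extraction
sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 1 --name [YourExptName]     # AR model
sh egs/tts/VALLE/run.sh --stage 2 --model_train_stage 2 \
    --ar_model_ckpt_dir [ARModelPath] --name [YourExptName]                       # NAR model
sh egs/tts/VALLE/run.sh --stage 3 \
    --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
    --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
    --infer_mode "single" \
    --infer_text "This is a clip of generated speech with the given text from a TTS model." \
    --infer_text_prompt "But even the unsuccessful dramatist has his moments." \
    --infer_audio_prompt egs/tts/VALLE/prompt_examples/7176_92135_000004_000000.wav
```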
egs/tts/VITS/README.md ADDED
@@ -0,0 +1,135 @@
1
+
2
+ # VITS Recipe
3
+
4
+ In this recipe, we will show how to train [VITS](https://arxiv.org/abs/2106.06103) using Amphion's infrastructure. VITS is an end-to-end TTS architecture that utilizes a conditional variational autoencoder with adversarial learning.
5
+
6
+ There are four stages in total:
7
+
8
+ 1. Data preparation
9
+ 2. Features extraction
10
+ 3. Training
11
+ 4. Inference
12
+
13
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
14
+ > ```bash
15
+ > cd Amphion
16
+ > ```
17
+
18
+ ## 1. Data Preparation
19
+
20
+ ### Dataset Download
21
+ You can use commonly used TTS datasets to train the TTS model, e.g., LJSpeech, VCTK, LibriTTS, etc. We strongly recommend using LJSpeech when training a TTS model for the first time. How to download the datasets is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ After downloading the dataset, you can set the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "LJSpeech",
30
+ ],
31
+ "dataset_path": {
32
+ // TODO: Fill in your dataset path
33
+ "LJSpeech": "[LJSpeech dataset path]",
34
+ },
35
+ ```
36
+
37
+ ## 2. Features Extraction
38
+
39
+ ### Configuration
40
+
41
+ Specify the `processed_dir` and the `log_dir` for saving the processed data and the checkpoints in `exp_config.json`:
42
+
43
+ ```json
44
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
45
+ "log_dir": "ckpts/tts",
46
+ "preprocess": {
47
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
48
+ "processed_dir": "data",
49
+ ...
50
+ },
51
+ ```
52
+
53
+ ### Run
54
+
55
+ Run the `run.sh` as the preprocessing stage (set `--stage 1`):
56
+
57
+ ```bash
58
+ sh egs/tts/VITS/run.sh --stage 1
59
+ ```
60
+
61
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "1"`.
62
+
63
+ ## 3. Training
64
+
65
+ ### Configuration
66
+
67
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.
68
+
69
+ ```
70
+ "train": {
71
+ "batch_size": 16,
72
+ }
73
+ ```
74
+
75
+ ### Run
76
+
77
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/tts/[YourExptName]`.
78
+
79
+ ```bash
80
+ sh egs/tts/VITS/run.sh --stage 2 --name [YourExptName]
81
+ ```
82
+
83
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.
84
+
85
+
86
+ ## 4. Inference
87
+
88
+ ### Configuration
89
+
90
+ For inference, you need to specify the following configurations when running `run.sh`:
91
+
92
+
93
+ | Parameters | Description | Example |
94
+ | --------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
95
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/tts/[YourExptName]` |
96
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/tts/[YourExptName]/result` |
97
+ | `--infer_mode` | The inference mode, e.g., "`single`", "`batch`". | "`single`" to generate a clip of speech, "`batch`" to generate a batch of speech at a time. |
98
+ | `--infer_dataset` | The dataset used for inference. | For LJSpeech dataset, the inference dataset would be `LJSpeech`. |
99
+ | `--infer_testing_set` | The subset of the inference dataset used for inference, e.g., train, test, golden_test | For the LJSpeech dataset, the testing set would be the "`test`" split created from LJSpeech at feature extraction, or the "`golden_test`" set cherry-picked from the test set as a template testing set. |
100
+ | `--infer_text` | The text to be synthesized. | "`This is a clip of generated speech with the given text from a TTS model.`" |
101
+
102
+ ### Run
103
+ For example, if you want to generate speech of all testing set split from LJSpeech, just run:
104
+
105
+ ```bash
106
+ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
107
+ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
108
+ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
109
+ --infer_mode "batch" \
110
+ --infer_dataset "LJSpeech" \
111
+ --infer_testing_set "test"
112
+ ```
113
+
114
+ Or, if you want to generate a single clip of speech from a given text, just run:
115
+
116
+ ```bash
117
+ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
118
+ --infer_expt_dir Amphion/ckpts/tts/[YourExptName] \
119
+ --infer_output_dir Amphion/ckpts/tts/[YourExptName]/result \
120
+ --infer_mode "single" \
121
+ --infer_text "This is a clip of generated speech with the given text from a TTS model."
122
+ ```
123
+
124
+ We have released a pre-trained Amphion VITS model trained on LJSpeech, so you can download it [here](https://huggingface.co/amphion/vits-ljspeech) and generate speech following the inference instructions above.
125
+
126
+
127
+ ```bibtex
128
+ @inproceedings{kim2021conditional,
129
+ title={Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech},
130
+ author={Kim, Jaehyeon and Kong, Jungil and Son, Juhee},
131
+ booktitle={International Conference on Machine Learning},
132
+ pages={5530--5540},
133
+ year={2021},
134
+ }
135
+ ```