zyingt committed on
Commit
4e9d7f3
1 Parent(s): 863ec70

VITS checkpoint trained on Hi-Fi TTS

Browse files
README.md CHANGED
@@ -1,3 +1,46 @@
1
  ---
2
  license: mit
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ language:
4
+ - en
5
  ---
6
+
7
+ # Amphion Multi-Speaker TTS Pre-trained Model
8
+ ## Quick Start
9
+ We provide the pre-trained checkpoint of [VITS](https://github.com/open-mmlab/Amphion/tree/main/egs/tts/VITS), trained on [Hi-Fi TTS](https://www.openslr.org/109/), which consists of a total of 291.6 hours of audio contributed by 10 speakers, with at least 17 hours per speaker.
10
+ To utilize the pre-trained model, run the following commands:
11
+
12
+ ### Step1: Download the checkpoint
13
+ ```bash
14
+ git lfs install
15
+ git clone https://huggingface.co/amphion/vits_hifitts
16
+ ```
17
+
18
+ ### Step2: Clone Amphion's Source Code from GitHub
19
+ ```bash
20
+ git clone https://github.com/open-mmlab/Amphion.git
21
+ ```
22
+
23
+ ### Step3: Specify the checkpoint's path
24
+ Create a soft link to the checkpoint downloaded in the first step:
25
+
26
+ ```bash
27
+ cd Amphion
28
+ mkdir -p ckpts/tts
29
+ ln -s ../../../vits_hifitts ckpts/tts/
30
+ ```
31
+
32
+ ### Step4: Inference
33
+
34
+ You can follow the inference part of this [recipe](https://github.com/open-mmlab/Amphion/tree/main/egs/tts/VITS#4-inference) to generate speech from text. For example, if you want to synthesize a clip of speech with the text "This is a clip of generated speech with the given text from a TTS model.", just run:
35
+
36
+ ```bash
37
+ sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
38
+ --config ckpts/tts/vits_hifitts/args.json \
39
+ --infer_expt_dir ckpts/tts/vits_hifitts/ \
40
+ --infer_output_dir ckpts/tts/vits_hifitts/result \
41
+ --infer_mode "single" \
42
+ --infer_text "This is a clip of generated speech with the given text from a TTS model." \
43
+ --infer_speaker_name "hifitts_92"
44
+ ```
45
+
46
+ **Note**: The supported `infer_speaker_name` values can be seen [here](https://huggingface.co/amphion/vits_hifitts/blob/main/spk2id.json).
args.json ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_config": "config/vits.json",
3
+ "dataset": [
4
+ "hifitts",
5
+ ],
6
+ "dataset_path": {
7
+ "hifitts": "/mnt/workspace/xueliumeng/data/hifitts/hi_fi_tts_v0",
8
+ },
9
+ "exp_name": "HifiTTS_all",
10
+ "log_dir": "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/logs",
11
+ "model": {
12
+ "filter_channels": 768,
13
+ "gin_channels": 256,
14
+ "hidden_channels": 192,
15
+ "inter_channels": 192,
16
+ "kernel_size": 3,
17
+ "n_heads": 2,
18
+ "n_layers": 6,
19
+ "n_layers_q": 3,
20
+ "n_speakers": 10,
21
+ "p_dropout": 0.1,
22
+ "resblock": "1",
23
+ "resblock_dilation_sizes": [
24
+ [
25
+ 1,
26
+ 3,
27
+ 5,
28
+ ],
29
+ [
30
+ 1,
31
+ 3,
32
+ 5,
33
+ ],
34
+ [
35
+ 1,
36
+ 3,
37
+ 5,
38
+ ],
39
+ ],
40
+ "resblock_kernel_sizes": [
41
+ 3,
42
+ 7,
43
+ 11,
44
+ ],
45
+ "text_token_num": 512,
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [
48
+ 16,
49
+ 16,
50
+ 4,
51
+ 4,
52
+ ],
53
+ "upsample_rates": [
54
+ 8,
55
+ 8,
56
+ 2,
57
+ 2,
58
+ ],
59
+ "use_sdp": true,
60
+ "use_spectral_norm": false,
61
+ },
62
+ "model_type": "VITS",
63
+ "preprocess": {
64
+ "add_blank": true,
65
+ "align_mel_duration": false,
66
+ "audio_dir": "audios",
67
+ "bits": 8,
68
+ "contentvec_dir": "contentvec",
69
+ "data_augment": false,
70
+ "dur_dir": "durs",
71
+ "duration_dir": "duration",
72
+ "emo2id": "emo2id.json",
73
+ "energy_dir": "energys",
74
+ "energy_extract_mode": "from_mel",
75
+ "energy_norm": false,
76
+ "energy_remove_outlier": false,
77
+ "extract_acoustic_token": false,
78
+ "extract_amplitude_phase": false,
79
+ "extract_audio": true,
80
+ "extract_contentvec_feature": false,
81
+ "extract_duration": false,
82
+ "extract_energy": false,
83
+ "extract_label": false,
84
+ "extract_linear_spec": true,
85
+ "extract_mcep": false,
86
+ "extract_mel": true,
87
+ "extract_mert_feature": false,
88
+ "extract_phone": true,
89
+ "extract_pitch": false,
90
+ "extract_uv": false,
91
+ "extract_wenet_feature": false,
92
+ "extract_whisper_feature": false,
93
+ "file_lst": "file.lst",
94
+ "fmax": null,
95
+ "fmin": 0,
96
+ "hop_size": 256,
97
+ "imaginary_dir": "imaginarys",
98
+ "lab_dir": "labs",
99
+ "label_dir": "labels",
100
+ "language": "en-us",
101
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
102
+ "linear_dir": "linears",
103
+ "log_amplitude_dir": "log_amplitudes",
104
+ "mcep_dir": "mcep",
105
+ "mel_dir": "mels",
106
+ "mel_extract_mode": "",
107
+ "mel_min_max_norm": false,
108
+ "min_level_db": -115,
109
+ "n_fft": 1024,
110
+ "n_mel": 80,
111
+ "num_silent_frames": 8,
112
+ "phase_dir": "phases",
113
+ "phone_dir": "phones",
114
+ "phone_energy_dir": "phone_energys",
115
+ "phone_extractor": "espeak",
116
+ "phone_pitch_dir": "phone_pitches",
117
+ "phone_seq_file": "phone_seq_file",
118
+ "pitch_dir": "pitches",
119
+ "pitch_extractor": "parselmouth",
120
+ "pitch_norm": false,
121
+ "pitch_remove_outlier": false,
122
+ "processed_dir": "/mnt/workspace/xueliumeng/data/vits_on_libritts_hifitts/processed_data",
123
+ "raw_data": "raw_data",
124
+ "real_dir": "reals",
125
+ "ref_level_db": 20,
126
+ "sample_rate": 24000,
127
+ "segment_size": 8192,
128
+ "spk2id": "spk2id.json",
129
+ "symbols_dict": "symbols.dict",
130
+ "text_cleaners": [
131
+ "english_cleaners",
132
+ ],
133
+ "train_file": "train_all.json",
134
+ "trim_fft_size": 512,
135
+ "trim_hop_size": 128,
136
+ "trim_silence": false,
137
+ "trim_top_db": 30,
138
+ "trimmed_wav_dir": "trimmed_wavs",
139
+ "use_amplitude_phase": false,
140
+ "use_audio": true,
141
+ "use_dur": false,
142
+ "use_emoid": false,
143
+ "use_frame_duration": false,
144
+ "use_frame_energy": false,
145
+ "use_frame_pitch": false,
146
+ "use_lab": false,
147
+ "use_label": false,
148
+ "use_linear": true,
149
+ "use_log_scale_energy": false,
150
+ "use_log_scale_pitch": false,
151
+ "use_mel": true,
152
+ "use_min_max_norm_mel": false,
153
+ "use_one_hot": false,
154
+ "use_phn_seq": false,
155
+ "use_phone": true,
156
+ "use_phone_duration": false,
157
+ "use_phone_energy": false,
158
+ "use_phone_pitch": false,
159
+ "use_spkid": true,
160
+ "use_text": false,
161
+ "use_uv": false,
162
+ "use_wav": false,
163
+ "use_wenet": false,
164
+ "utt2emo": "utt2emo",
165
+ "utt2spk": "utt2spk",
166
+ "uv_dir": "uvs",
167
+ "valid_file": "valid_100.json",
168
+ "wav_dir": "wavs",
169
+ "wenet_dir": "wenet",
170
+ "win_size": 1024,
171
+ },
172
+ "supported_model_type": [
173
+ "Fastspeech2",
174
+ "VITS",
175
+ "VALLE",
176
+ ],
177
+ "task_type": "tts",
178
+ "train": {
179
+ "AdamW": {
180
+ "betas": [
181
+ 0.8,
182
+ 0.99,
183
+ ],
184
+ "eps": 1e-09,
185
+ },
186
+ "adamw": {
187
+ "lr": 0.0004,
188
+ },
189
+ "batch_size": 16,
190
+ "betas": [
191
+ 0.8,
192
+ 0.99,
193
+ ],
194
+ "c_kl": 1.0,
195
+ "c_mel": 45,
196
+ "dataloader": {
197
+ "num_worker": 32,
198
+ "pin_memory": true,
199
+ },
200
+ "ddp": true,
201
+ "eps": 1e-09,
202
+ "fp16_run": true,
203
+ "gradient_accumulation_step": 1,
204
+ "init_lr_ratio": 1,
205
+ "keep_checkpoint_max": 5,
206
+ "keep_last": [
207
+ 3,
208
+ -1,
209
+ ],
210
+ "learning_rate": 0.0002,
211
+ "lr_decay": 0.999875,
212
+ "max_epoch": -1,
213
+ "max_steps": 1000000,
214
+ "multi_speaker_training": true,
215
+ "optimizer": "AdamW",
216
+ "random_seed": 10086,
217
+ "reducelronplateau": {
218
+ "factor": 0.8,
219
+ "min_lr": 0.0001,
220
+ "patience": 10,
221
+ },
222
+ "run_eval": [
223
+ false,
224
+ true,
225
+ ],
226
+ "sampler": {
227
+ "drop_last": true,
228
+ "holistic_shuffle": true,
229
+ },
230
+ "save_checkpoint_stride": [
231
+ 5,
232
+ 20,
233
+ ],
234
+ "save_checkpoints_steps": 10000,
235
+ "save_summary_steps": 500,
236
+ "scheduler": "ReduceLROnPlateau",
237
+ "total_training_steps": 50000,
238
+ "tracker": [
239
+ "tensorboard",
240
+ ],
241
+ "valid_interval": 10000,
242
+ "warmup_epochs": 0,
243
+ },
244
+ "use_custom_dataset": false,
245
+ }
checkpoint/epoch-0030_step-0312356_loss-38.448391/ckpts.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ [
3
+ "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/logs/HifiTTS_all/checkpoint/epoch-0000_step-0010076_loss-42.366682",
4
+ "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/logs/HifiTTS_all/checkpoint/epoch-0015_step-0161216_loss-38.389271",
5
+ "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/logs/HifiTTS_all/checkpoint/epoch-0020_step-0211596_loss-38.454503",
6
+ "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/logs/HifiTTS_all/checkpoint/epoch-0025_step-0261976_loss-38.479332"
7
+ ],
8
+ [
9
+ "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/logs/HifiTTS_all/checkpoint/epoch-0000_step-0010076_loss-42.366682",
10
+ "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/logs/HifiTTS_all/checkpoint/epoch-0020_step-0211596_loss-38.454503"
11
+ ]
12
+ ]
checkpoint/epoch-0030_step-0312356_loss-38.448391/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd7ca5a98e57292908a7749488dfa1bee82e1f9cf560ec999906bdb72f03cce4
3
+ size 159044848
checkpoint/epoch-0030_step-0312356_loss-38.448391/model_1.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a35e0287f33fe21c2234fbab466fac8659bfa5759bf5914b873746a42308f916
3
+ size 187000096
checkpoint/epoch-0030_step-0312356_loss-38.448391/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d23f14adfa8137ca14c9a7493556cc00848a169e11dca0b4b8bb182b711760c
3
+ size 318631531
checkpoint/epoch-0030_step-0312356_loss-38.448391/optimizer_1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c687b8da88c7e166df5376dd5826837501ef848366b43270257e521478de5331
3
+ size 374071331
checkpoint/epoch-0030_step-0312356_loss-38.448391/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9365e6ef62ff302c74e6a0c36a5d057ff5879a317b20e2da80246dfd03e356f4
3
+ size 15691
checkpoint/epoch-0030_step-0312356_loss-38.448391/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3790f159ec2bc847509acad910fae392b7bb974fa7dc7a2e52a5108e24b2484b
3
+ size 563
checkpoint/epoch-0030_step-0312356_loss-38.448391/scheduler_1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:231b81d7e845b9b1247f789877d0bb85bd04e81c03468eba58f27e9c2664ad62
3
+ size 567
spk2id.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "hifitts_11614": 0,
3
+ "hifitts_11697": 1,
4
+ "hifitts_12787": 2,
5
+ "hifitts_6097": 3,
6
+ "hifitts_6670": 4,
7
+ "hifitts_6671": 5,
8
+ "hifitts_8051": 6,
9
+ "hifitts_9017": 7,
10
+ "hifitts_9136": 8,
11
+ "hifitts_92": 9
12
+ }
symbols.dict ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <eps> 0
2
+ ! 1
3
+ " 2
4
+ ( 3
5
+ ) 4
6
+ , 5
7
+ . 6
8
+ : 7
9
+ ; 8
10
+ ? 9
11
+ _ 10
12
+ a 11
13
+ aɪ 12
14
+ aɪə 13
15
+ aɪɚ 14
16
+ aɪʊ 15
17
+ aɪʊɹ 16
18
+ aʊ 17
19
+ b 18
20
+ d 19
21
+ dʒ 20
22
+ enus 21
23
+ es 22
24
+ eɪ 23
25
+ f 24
26
+ fr 25
27
+ h 26
28
+ i 27
29
+ iə 28
30
+ iː 29
31
+ j 30
32
+ k 31
33
+ l 32
34
+ m 33
35
+ n 34
36
+ nʲ 35
37
+ o 36
38
+ oʊ 37
39
+ oː 38
40
+ oːɹ 39
41
+ p 40
42
+ r 41
43
+ s 42
44
+ t 43
45
+ tʃ 44
46
+ uː 45
47
+ v 46
48
+ w 47
49
+ z 48
50
+ æ 49
51
+ ð 50
52
+ ø 51
53
+ ŋ 52
54
+ ɐ 53
55
+ ɑ 54
56
+ ɑː 55
57
+ ɑːɹ 56
58
+ ɔ 57
59
+ ɔɪ 58
60
+ ɔː 59
61
+ ɔːɹ 60
62
+ ə 61
63
+ əl 62
64
+ ɚ 63
65
+ ɛ 64
66
+ ɛɹ 65
67
+ ɜː 66
68
+ ɡ 67
69
+ ɪ 68
70
+ ɪɹ 69
71
+ ɫ 70
72
+ ɹ 71
73
+ ɾ 72
74
+ ʃ 73
75
+ ʊ 74
76
+ ʊɹ 75
77
+ ʌ 76
78
+ ʒ 77
79
+ ʔ 78
80
+ ̃ 79
81
+ ̩ 80
82
+ θ 81
83
+ ᵻ 82