VITS checkpoint trained on Hi-Fi TTS (#1)

Browse files

- VITS checkpoint trained on Hi-Fi TTS (4e9d7f3a40985b64ca046332803ddb18410cd6ed)
- Removed private paths in args.json and ckpts.json (3976c9d8ef5124062b2bc09fa8096c70589f8ac1)

Co-authored-by: Tang Tze Ying <zyingt@users.noreply.huggingface.co>

Files changed (11) hide show

README.md +43 -0
args.json +239 -0
checkpoint/epoch-0030_step-0312356_loss-38.448391/model.safetensors +3 -0
checkpoint/epoch-0030_step-0312356_loss-38.448391/model_1.safetensors +3 -0
checkpoint/epoch-0030_step-0312356_loss-38.448391/optimizer.bin +3 -0
checkpoint/epoch-0030_step-0312356_loss-38.448391/optimizer_1.bin +3 -0
checkpoint/epoch-0030_step-0312356_loss-38.448391/random_states_0.pkl +3 -0
checkpoint/epoch-0030_step-0312356_loss-38.448391/scheduler.bin +3 -0
checkpoint/epoch-0030_step-0312356_loss-38.448391/scheduler_1.bin +3 -0
spk2id.json +12 -0
symbols.dict +83 -0

README.md CHANGED Viewed

@@ -1,3 +1,46 @@
 ---
 license: mit
 ---

 ---
 license: mit
+language:
+- en
 ---
+# Amphion Multi-Speaker TTS Pre-trained Model
+## Quick Start
+We provide the pre-trained checkpoint of [VITS](https://github.com/open-mmlab/Amphion/tree/main/egs/tts/VITS), trained on [Hi-Fi TTS](https://www.openslr.org/109/), which consists of a total of 291.6 hours of audio contributed by 10 speakers, with at least 17 hours per speaker.
+To utilize the pre-trained model, run the following commands:
+### Step1: Download the checkpoint
+```bash
+git lfs install
+git clone https://huggingface.co/amphion/vits_hifitts
+```
+### Step2: Clone Amphion's source code from GitHub
+```bash
+git clone https://github.com/open-mmlab/Amphion.git
+```
+### Step3: Specify the checkpoint's path
+Create a soft link to the checkpoint downloaded in the first step:
+```bash
+cd Amphion
+mkdir -p ckpts/tts
+ln -s  ../../../vits_hifitts  ckpts/tts/
+```
+### Step4: Inference
+You can follow the inference part of this [recipe](https://github.com/open-mmlab/Amphion/tree/main/egs/tts/VITS#4-inference) to generate speech from text. For example, to synthesize a clip of speech from the text "This is a clip of generated speech with the given text from a TTS model.", just run:
+```bash
+sh egs/tts/VITS/run.sh --stage 3 --gpu "0" \
+    --config ckpts/tts/vits_hifitts/args.json \
+	--infer_expt_dir ckpts/tts/vits_hifitts/ \
+	--infer_output_dir ckpts/tts/vits_hifitts/result \
+	--infer_mode "single" \
+    --infer_text "This is a clip of generated speech with the given text from a TTS model." \
+    --infer_speaker_name "hifitts_92"
+```
+**Note**: The supported `infer_speaker_name` values can be seen [here](https://huggingface.co/amphion/vits_hifitts/tree/main/spk2id.json).

args.json ADDED Viewed

	@@ -0,0 +1,239 @@

+{
+    "base_config": "config/vits.json",
+    "dataset": [
+        "hifitts",
+    ],
+    "model": {
+        "filter_channels": 768,
+        "gin_channels": 256,
+        "hidden_channels": 192,
+        "inter_channels": 192,
+        "kernel_size": 3,
+        "n_heads": 2,
+        "n_layers": 6,
+        "n_layers_q": 3,
+        "n_speakers": 10,
+        "p_dropout": 0.1,
+        "resblock": "1",
+        "resblock_dilation_sizes": [
+            [
+                1,
+                3,
+                5,
+            ],
+            [
+                1,
+                3,
+                5,
+            ],
+            [
+                1,
+                3,
+                5,
+            ],
+        ],
+        "resblock_kernel_sizes": [
+            3,
+            7,
+            11,
+        ],
+        "text_token_num": 512,
+        "upsample_initial_channel": 512,
+        "upsample_kernel_sizes": [
+            16,
+            16,
+            4,
+            4,
+        ],
+        "upsample_rates": [
+            8,
+            8,
+            2,
+            2,
+        ],
+        "use_sdp": true,
+        "use_spectral_norm": false,
+    },
+    "model_type": "VITS",
+    "preprocess": {
+        "add_blank": true,
+        "align_mel_duration": false,
+        "audio_dir": "audios",
+        "bits": 8,
+        "contentvec_dir": "contentvec",
+        "data_augment": false,
+        "dur_dir": "durs",
+        "duration_dir": "duration",
+        "emo2id": "emo2id.json",
+        "energy_dir": "energys",
+        "energy_extract_mode": "from_mel",
+        "energy_norm": false,
+        "energy_remove_outlier": false,
+        "extract_acoustic_token": false,
+        "extract_amplitude_phase": false,
+        "extract_audio": true,
+        "extract_contentvec_feature": false,
+        "extract_duration": false,
+        "extract_energy": false,
+        "extract_label": false,
+        "extract_linear_spec": true,
+        "extract_mcep": false,
+        "extract_mel": true,
+        "extract_mert_feature": false,
+        "extract_phone": true,
+        "extract_pitch": false,
+        "extract_uv": false,
+        "extract_wenet_feature": false,
+        "extract_whisper_feature": false,
+        "file_lst": "file.lst",
+        "fmax": null,
+        "fmin": 0,
+        "hop_size": 256,
+        "imaginary_dir": "imaginarys",
+        "lab_dir": "labs",
+        "label_dir": "labels",
+        "language": "en-us",
+        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+        "linear_dir": "linears",
+        "log_amplitude_dir": "log_amplitudes",
+        "mcep_dir": "mcep",
+        "mel_dir": "mels",
+        "mel_extract_mode": "",
+        "mel_min_max_norm": false,
+        "min_level_db": -115,
+        "n_fft": 1024,
+        "n_mel": 80,
+        "num_silent_frames": 8,
+        "phase_dir": "phases",
+        "phone_dir": "phones",
+        "phone_energy_dir": "phone_energys",
+        "phone_extractor": "espeak",
+        "phone_pitch_dir": "phone_pitches",
+        "phone_seq_file": "phone_seq_file",
+        "pitch_dir": "pitches",
+        "pitch_extractor": "parselmouth",
+        "pitch_norm": false,
+        "pitch_remove_outlier": false,
+        "raw_data": "raw_data",
+        "real_dir": "reals",
+        "ref_level_db": 20,
+        "sample_rate": 24000,
+        "segment_size": 8192,
+        "spk2id": "spk2id.json",
+        "symbols_dict": "symbols.dict",
+        "text_cleaners": [
+            "english_cleaners",
+        ],
+        "train_file": "train.json",
+        "trim_fft_size": 512,
+        "trim_hop_size": 128,
+        "trim_silence": false,
+        "trim_top_db": 30,
+        "trimmed_wav_dir": "trimmed_wavs",
+        "use_amplitude_phase": false,
+        "use_audio": true,
+        "use_dur": false,
+        "use_emoid": false,
+        "use_frame_duration": false,
+        "use_frame_energy": false,
+        "use_frame_pitch": false,
+        "use_lab": false,
+        "use_label": false,
+        "use_linear": true,
+        "use_log_scale_energy": false,
+        "use_log_scale_pitch": false,
+        "use_mel": true,
+        "use_min_max_norm_mel": false,
+        "use_one_hot": false,
+        "use_phn_seq": false,
+        "use_phone": true,
+        "use_phone_duration": false,
+        "use_phone_energy": false,
+        "use_phone_pitch": false,
+        "use_spkid": true,
+        "use_text": false,
+        "use_uv": false,
+        "use_wav": false,
+        "use_wenet": false,
+        "utt2emo": "utt2emo",
+        "utt2spk": "utt2spk",
+        "uv_dir": "uvs",
+        "valid_file": "valid.json",
+        "wav_dir": "wavs",
+        "wenet_dir": "wenet",
+        "win_size": 1024,
+    },
+    "supported_model_type": [
+        "Fastspeech2",
+        "VITS",
+        "VALLE",
+    ],
+    "task_type": "tts",
+    "train": {
+        "AdamW": {
+            "betas": [
+                0.8,
+                0.99,
+            ],
+            "eps": 1e-09,
+        },
+        "adamw": {
+            "lr": 0.0004,
+        },
+        "batch_size": 16,
+        "betas": [
+            0.8,
+            0.99,
+        ],
+        "c_kl": 1.0,
+        "c_mel": 45,
+        "dataloader": {
+            "num_worker": 32,
+            "pin_memory": true,
+        },
+        "ddp": true,
+        "eps": 1e-09,
+        "fp16_run": true,
+        "gradient_accumulation_step": 1,
+        "init_lr_ratio": 1,
+        "keep_checkpoint_max": 5,
+        "keep_last": [
+            3,
+            -1,
+        ],
+        "learning_rate": 0.0002,
+        "lr_decay": 0.999875,
+        "max_epoch": -1,
+        "max_steps": 1000000,
+        "multi_speaker_training": true,
+        "optimizer": "AdamW",
+        "random_seed": 10086,
+        "reducelronplateau": {
+            "factor": 0.8,
+            "min_lr": 0.0001,
+            "patience": 10,
+        },
+        "run_eval": [
+            false,
+            true,
+        ],
+        "sampler": {
+            "drop_last": true,
+            "holistic_shuffle": true,
+        },
+        "save_checkpoint_stride": [
+            5,
+            20,
+        ],
+        "save_checkpoints_steps": 10000,
+        "save_summary_steps": 500,
+        "scheduler": "ReduceLROnPlateau",
+        "total_training_steps": 50000,
+        "tracker": [
+            "tensorboard",
+        ],
+        "valid_interval": 10000,
+        "warmup_epochs": 0,
+    },
+    "use_custom_dataset": false,
+}

checkpoint/epoch-0030_step-0312356_loss-38.448391/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd7ca5a98e57292908a7749488dfa1bee82e1f9cf560ec999906bdb72f03cce4
+size 159044848

checkpoint/epoch-0030_step-0312356_loss-38.448391/model_1.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a35e0287f33fe21c2234fbab466fac8659bfa5759bf5914b873746a42308f916
+size 187000096

checkpoint/epoch-0030_step-0312356_loss-38.448391/optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d23f14adfa8137ca14c9a7493556cc00848a169e11dca0b4b8bb182b711760c
+size 318631531

checkpoint/epoch-0030_step-0312356_loss-38.448391/optimizer_1.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c687b8da88c7e166df5376dd5826837501ef848366b43270257e521478de5331
+size 374071331

checkpoint/epoch-0030_step-0312356_loss-38.448391/random_states_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9365e6ef62ff302c74e6a0c36a5d057ff5879a317b20e2da80246dfd03e356f4
+size 15691

checkpoint/epoch-0030_step-0312356_loss-38.448391/scheduler.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3790f159ec2bc847509acad910fae392b7bb974fa7dc7a2e52a5108e24b2484b
+size 563

checkpoint/epoch-0030_step-0312356_loss-38.448391/scheduler_1.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:231b81d7e845b9b1247f789877d0bb85bd04e81c03468eba58f27e9c2664ad62
+size 567

spk2id.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "hifitts_11614": 0,
+    "hifitts_11697": 1,
+    "hifitts_12787": 2,
+    "hifitts_6097": 3,
+    "hifitts_6670": 4,
+    "hifitts_6671": 5,
+    "hifitts_8051": 6,
+    "hifitts_9017": 7,
+    "hifitts_9136": 8,
+    "hifitts_92": 9
+}

symbols.dict ADDED Viewed

	@@ -0,0 +1,83 @@

+<eps> 0
+! 1
+" 2
+( 3
+) 4
+, 5
+. 6
+: 7
+; 8
+? 9
+_ 10
+a 11
+aɪ 12
+aɪə 13
+aɪɚ 14
+aɪʊ 15
+aɪʊɹ 16
+aʊ 17
+b 18
+d 19
+dʒ 20
+enus 21
+es 22
+eɪ 23
+f 24
+fr 25
+h 26
+i 27
+iə 28
+iː 29
+j 30
+k 31
+l 32
+m 33
+n 34
+nʲ 35
+o 36
+oʊ 37
+oː 38
+oːɹ 39
+p 40
+r 41
+s 42
+t 43
+tʃ 44
+uː 45
+v 46
+w 47
+z 48
+æ 49
+ð 50
+ø 51
+ŋ 52
+ɐ 53
+ɑ 54
+ɑː 55
+ɑːɹ 56
+ɔ 57
+ɔɪ 58
+ɔː 59
+ɔːɹ 60
+ə 61
+əl 62
+ɚ 63
+ɛ 64
+ɛɹ 65
+ɜː 66
+ɡ 67
+ɪ 68
+ɪɹ 69
+ɫ 70
+ɹ 71
+ɾ 72
+ʃ 73
+ʊ 74
+ʊɹ 75
+ʌ 76
+ʒ 77
+ʔ 78
+̃ 79
+̩ 80
+θ 81
+ᵻ 82