// Experiment configuration for a VITS model on the "hifitts" dataset.
// NOTE: this file contains `//` comments, so it must be read with a
// comment-tolerant loader (JSON5 / JSONC), not a strict JSON parser.
// Every comment must stay on its own line or at the end of a line: a `//`
// comment runs to the end of the physical line and would otherwise hide
// the keys that follow it.
{
  // Base configuration file referenced by this experiment
  // (presumably the values below override its defaults; confirm against the loader).
  "base_config": "config/vits.json",
  "model_type": "VITS",
  "dataset": [
    "hifitts"
  ],
  "dataset_path": {
    // TODO: Fill in your dataset path
    "hifitts": "/mnt/workspace/xueliumeng/data/hifitts/hi_fi_tts_v0"
  },
  // TODO: Fill in the output log path. The default value is "Amphion/ckpts/tts"
  "log_dir": "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/logs",
  "preprocess": {
    "extract_audio": true,
    "use_phone": true,
    // Linguistic features
    "extract_phone": true,
    // Supported values: "espeak", "pypinyin", "pypinyin_initials_finals",
    // "lexicon" (lexicon is only available for language=en-us right now)
    "phone_extractor": "espeak",
    // TODO: Fill in the output data path. The default value is "Amphion/data"
    "processed_dir": "/mnt/workspace/tzeying/data/vits_on_libritts_hifitts/processed_data",
    "sample_rate": 24000,
    "train_file": "train_5k.json",
    "valid_file": "valid.json", // validation set
    "use_spkid": true // true: use speaker id for multi-speaker dataset
  },
  "train": {
    "batch_size": 16,
    "multi_speaker_training": true // true: train multi-speaker model; false: train single-speaker model
    // Optional: number of speakers. It will be set automatically if n_speakers
    // is 0 and multi_speaker_training is true. When re-enabling the line below,
    // add a comma after the "multi_speaker_training" entry above.
    // "n_speakers": 2500
  }
}