{
    // Experiment configuration: VALL-E model ("model_type") on the LibriTTS dataset.
    // NOTE(review): "base_config" names another config file; presumably the values
    // below override it -- confirm against the config loader.
    // Each "//" comment must stay on its own physical line: a line comment runs to
    // the end of the line, so collapsing this file onto one line comments out
    // everything after the first "//".
    "base_config": "config/valle.json",
    "model_type": "VALLE",
    "dataset": [
        "libritts"
    ],
    "dataset_path": {
        // Placeholder value: replace with the local LibriTTS dataset path.
        "libritts": "[LibriTTS dataset path]"
    },
    "preprocess": {
        "extract_phone": true,
        // Phone extractor options: espeak, pypinyin, pypinyin_initials_finals,
        // lexicon (lexicon only for language=en-us right now).
        "phone_extractor": "espeak",
        "extract_acoustic_token": true,
        "use_phone": true,
        "use_acoustic_token": true,
        "processed_dir": "Amphion/data/",
        // Audio sampling rate.
        // NOTE(review): this value is a quoted string, not a number -- confirm the
        // consumer expects a string before changing it to 24000.
        "sampling_rate": "24000",
        "valid_file": "test.json"
    },
    "model": {
        // How to prefix the VALL-E NAR Decoder:
        //   0: no prefix, 1: 0 to random, 2: random to random,
        //   4: chunk of pre or post utterance.
        "prefix_mode": 1
    },
    "log_dir": "Amphion/ckpts/tts/valle",
    "train": {
        "batch_size": 4,
        // 0: train all modules. For VALL-E, also supported:
        //   1: AR Decoder, 2: NAR Decoder(s).
        "train_stage": 1,
        // Number of epochs to train.
        "max_epoch": 20
    }
}