{
  // Experiment configuration: model_type VALLE, task_type "tts", dataset libritts.
  // NOTE: this file uses // comments, so it must be read with a comment-tolerant
  // loader (JSONC / JSON5); a strict RFC 8259 JSON parser will reject it.
  // Keep one property per line: a // comment runs to the end of the physical
  // line, so joining lines would silently comment out the keys that follow.

  // presumably the parent config whose values this file overrides — confirm with the loader
  "base_config": "config/tts.json",
  "model_type": "VALLE",
  "task_type": "tts",
  "dataset": [
    "libritts"
  ],
  "preprocess": {
    "extract_phone": true,
    "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
    "extract_acoustic_token": true,
    "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac (todo)
    // NOTE(review): "acoutic_tokens" looks like a misspelling of "acoustic_tokens".
    // It is left unchanged because it is a runtime directory name — confirm
    // against existing preprocessed data before renaming.
    "acoustic_token_dir": "acoutic_tokens",
    "use_text": false,
    "use_phone": true,
    "use_acoustic_token": true,
    "symbols_dict": "symbols.dict",
    "min_duration": 0.5, // duration lower bound: audio with duration < min_duration is filtered out
    "max_duration": 14, // duration upper bound: audio with duration > max_duration is filtered out
    "sampling_rate": 24000
  },
  "model": {
    "text_token_num": 512,
    "audio_token_num": 1024,
    "decoder_dim": 1024, // embedding dimension of the decoder model
    "nhead": 16, // number of attention heads in the decoder layers
    "num_decoder_layers": 12, // number of decoder layers
    "norm_first": true, // pre or post normalization
    "add_prenet": false, // whether to add a PreNet after the inputs
    "prefix_mode": 0, // how to prefix the VALL-E NAR Decoder; 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
    "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
    "nar_scale_factor": 1, // model scale factor, which is assigned different meanings in different models
    "prepend_bos": false, // whether to prepend a BOS token to the acoustic tokens -> AR Decoder inputs
    "num_quantizers": 8 // number of audio quantization layers
    // Disabled option; when enabling it, add a comma after the "num_quantizers" entry above.
    // "scaling_xformers": false // Apply Reworked Conformer scaling on Transformers
  },
  "train": {
    "ddp": false,
    "train_stage": 1, // 0: train all modules; for VALL_E, also supports 1: AR Decoder, 2: NAR Decoder(s)
    "max_epoch": 20,
    "optimizer": "ScaledAdam",
    "scheduler": "Eden",
    "warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
    "base_lr": 0.05, // base learning rate
    "valid_interval": 1000,
    "log_epoch_step": 1000,
    "save_checkpoint_stride": [
      1,
      1
    ]
  }
}