{
  "base_config": "config/base.json",
  "model_type": "NaturalSpeech2",
  "dataset": [
    "LibriTTS"
  ],
  "preprocess": {
    "use_mel": false,
    "use_code": true,
    "use_spkid": true,
    "use_pitch": true,
    "use_duration": true,
    "use_phone": true,
    "use_len": true,
    "use_cross_reference": true,
    "train_file": "train.json",
    "melspec_dir": "mel",
    "code_dir": "code",
    "pitch_dir": "pitch",
    "duration_dir": "duration",
    "clip_mode": "start"
  },
  "model": {
    "latent_dim": 128,
    "prior_encoder": {
      "vocab_size": 100,
      "pitch_min": 50,
      "pitch_max": 1100,
      "pitch_bins_num": 512,
      "encoder": {
        "encoder_layer": 6,
        "encoder_hidden": 512,
        "encoder_head": 8,
        "conv_filter_size": 2048,
        "conv_kernel_size": 9,
        "encoder_dropout": 0.2,
        "use_cln": true
      },
      "duration_predictor": {
        "input_size": 512,
        "filter_size": 512,
        "kernel_size": 3,
        "conv_layers": 30,
        "cross_attn_per_layer": 3,
        "attn_head": 8,
        "drop_out": 0.5
      },
      "pitch_predictor": {
        "input_size": 512,
        "filter_size": 512,
        "kernel_size": 5,
        "conv_layers": 30,
        "cross_attn_per_layer": 3,
        "attn_head": 8,
        "drop_out": 0.5
      }
    },
    "diffusion": {
      "wavenet": {
        "input_size": 128,
        "hidden_size": 512,
        "out_size": 128,
        "num_layers": 40,
        "cross_attn_per_layer": 3,
        "dilation_cycle": 2,
        "attn_head": 8,
        "drop_out": 0.2
      },
      "beta_min": 0.05,
      "beta_max": 20,
      "sigma": 1.0,
      "noise_factor": 1.0,
      "ode_solver": "euler"
    },
    "prompt_encoder": {
      "encoder_layer": 6,
      "encoder_hidden": 512,
      "encoder_head": 8,
      "conv_filter_size": 2048,
      "conv_kernel_size": 9,
      "encoder_dropout": 0.2,
      "use_cln": false
    },
    "query_emb": {
      "query_token_num": 32,
      "hidden_size": 512,
      "head_num": 8
    }
  }
}