// AutoencoderKL configuration: model_type "AutoencoderKL", task_type "tta",
// dataset "AudioCaps".
//
// This file contains `//` comments, so it must be read with a comment-tolerant
// parser (JSON5 / JSONC); a strict JSON parser will reject it. Keep every
// comment on its own line: a `//` comment runs to the end of the line and
// would swallow any properties that follow it on the same line.
{
  // Path of the base configuration this file refers to.
  // NOTE(review): presumably the values below override those in the base
  // file — confirm against the config loader.
  "base_config": "config/base.json",
  "model_type": "AutoencoderKL",
  "task_type": "tta",
  "dataset": [
    "AudioCaps"
  ],
  "preprocess": {
    // Features used for model training (every flag is disabled in this config).
    "use_spkid": false,
    "use_uv": false,
    "use_frame_pitch": false,
    "use_phone_pitch": false,
    "use_frame_energy": false,
    "use_phone_energy": false,
    "use_mel": false,
    "use_audio": false,
    "use_label": false,
    "use_one_hot": false
  },
  // Model settings: autoencoder architecture and loss hyperparameters.
  "model": {
    "autoencoderkl": {
      "ch": 128,
      // NOTE(review): presumably one channel multiplier per resolution level
      // (applied to "ch") — confirm against the model code.
      "ch_mult": [
        1,
        1,
        2,
        2,
        4
      ],
      "num_res_blocks": 2,
      "in_channels": 1,
      "z_channels": 4,
      "out_ch": 1,
      "double_z": true
    },
    "loss": {
      "kl_weight": 1e-8,
      "disc_weight": 0.5,
      "disc_factor": 1.0,
      "logvar_init": 0.0,
      "min_adapt_d_weight": 0.0,
      "max_adapt_d_weight": 10.0,
      // NOTE(review): presumably the training step from which the
      // discriminator term is enabled — confirm against the trainer.
      "disc_start": 50001,
      "disc_in_channels": 1,
      "disc_num_layers": 3,
      "use_actnorm": false
    }
  },
  // Training settings.
  "train": {
    // NOTE(review): key is camelCase while the rest of the file is snake_case;
    // left unchanged because the consumer looks it up by this exact name.
    // Presumably the arguments of a reduce-LR-on-plateau scheduler — confirm.
    "lronPlateau": {
      "factor": 0.9,
      "patience": 100,
      "min_lr": 4.0e-5,
      "verbose": true
    },
    // NOTE(review): presumably the arguments of an Adam-family optimizer —
    // confirm which optimizer class the trainer constructs from this section.
    "adam": {
      "lr": 4.0e-4,
      "betas": [
        0.9,
        0.999
      ],
      "weight_decay": 1.0e-2,
      "eps": 1.0e-8
    }
  }
}