{
  // NOTE: this file uses // comments, so it must be read with a
  // comment-tolerant loader (JSONC / JSON5), not a strict JSON parser.
  "base_config": "config/base.json",
  "model_type": "AudioLDM",
  "task_type": "tta",
  "dataset": ["AudioCaps"],

  "preprocess": {
    // Switches selecting which features are used for model training.
    "use_spkid": false,
    "use_uv": false,
    "use_frame_pitch": false,
    "use_phone_pitch": false,
    "use_frame_energy": false,
    "use_phone_energy": false,
    "use_mel": false,
    "use_audio": false,
    "use_label": false,
    "use_one_hot": false,
    "cond_mask_prob": 0.1
  },

  // Model settings, one sub-object per component.
  "model": {
    "audioldm": {
      "image_size": 32,
      "in_channels": 4,
      "out_channels": 4,
      "model_channels": 256,
      "attention_resolutions": [4, 2, 1],
      "num_res_blocks": 2,
      "channel_mult": [1, 2, 4],
      "num_heads": 8,
      "use_spatial_transformer": true,
      "transformer_depth": 1,
      "context_dim": 768,
      "use_checkpoint": true,
      "legacy": false
    },
    "autoencoderkl": {
      "ch": 128,
      "ch_mult": [1, 1, 2, 2, 4],
      "num_res_blocks": 2,
      "in_channels": 1,
      "z_channels": 4,
      "out_ch": 1,
      "double_z": true
    },
    "noise_scheduler": {
      "num_train_timesteps": 1000,
      "beta_start": 0.00085,
      "beta_end": 0.012,
      "beta_schedule": "scaled_linear",
      "clip_sample": false,
      "steps_offset": 1,
      "set_alpha_to_one": false,
      "skip_prk_steps": true,
      "prediction_type": "epsilon"
    }
  },

  // Training settings.
  // NOTE(review): "lronPlateau" and "adam" presumably configure a
  // reduce-on-plateau LR scheduler and an Adam optimizer; confirm against
  // the code that consumes this file.
  "train": {
    "lronPlateau": {
      "factor": 0.9,
      "patience": 100,
      "min_lr": 4.0e-5,
      "verbose": true
    },
    "adam": {
      "lr": 5.0e-5,
      "betas": [0.9, 0.999],
      "weight_decay": 1.0e-2,
      "eps": 1.0e-8
    }
  }
}