// Experiment configuration for the AudioLDM text-to-audio recipe.
// NOTE(review): "base_config" references another config file; presumably the
// keys below override it -- confirm against the config loader.
// This file uses `//` line comments, so every property must stay on its own
// line and the file must be read with a comment-aware parser (JSONC/JSON5),
// not a strict JSON parser.
{
  "base_config": "egs/tta/audioldm/exp_config_base.json",
  "dataset": [
    "AudioCaps"
  ],
  "preprocess": {
    // Output root path for the processed data,
    // for example: "/home/TTADataset/processed_data"
    "processed_dir": "data",

    // Generic feature switches (all disabled in this config)
    "use_spkid": false,
    "use_uv": false,
    "use_frame_pitch": false,
    "use_phone_pitch": false,
    "use_frame_energy": false,
    "use_phone_energy": false,
    "use_mel": false,
    "use_audio": false,
    "use_label": false,
    "use_one_hot": false,

    // Features for text to audio
    "use_caption": true,
    "use_melspec": true,
    "use_wav": false,

    // Feature directories
    "melspec_dir": "mel",
    "wav_dir": "wav"
  },

  // Output root path for model checkpoints and logs,
  // for example: "/home/TTADataset/processed_data/logs"
  "log_dir": "ckpts/tta",

  // Model
  "model": {
    "audioldm": {
      "image_size": 32,
      "in_channels": 4,
      "out_channels": 4,
      "model_channels": 256,
      "attention_resolutions": [4, 2, 1],
      "num_res_blocks": 2,
      "channel_mult": [1, 2, 4],
      "num_heads": 8,
      "use_spatial_transformer": true,
      "transformer_depth": 1,
      "context_dim": 768,
      "use_checkpoint": true,
      "legacy": false
    },
    "autoencoderkl": {
      "ch": 128,
      "ch_mult": [1, 1, 2, 2, 4],
      "num_res_blocks": 2,
      "in_channels": 1,
      "z_channels": 4,
      "out_ch": 1,
      "double_z": true
    },
    "noise_scheduler": {
      "num_train_timesteps": 1000,
      "beta_start": 0.00085,
      "beta_end": 0.012,
      "beta_schedule": "scaled_linear",
      "clip_sample": false,
      "steps_offset": 1,
      "set_alpha_to_one": false,
      "skip_prk_steps": true,
      "prediction_type": "epsilon"
    },
    // NOTE(review): this points at a checkpoint of an "autoencoder_kl_debug"
    // run -- confirm the file exists (or update the path) before training.
    "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
  },

  // Train
  "train": {
    "adam": {
      "lr": 5.0e-5
    },
    "ddp": false,
    "random_seed": 12345,
    "batch_size": 12,
    "epochs": 50000,
    // NOTE(review): "max_steps" and "total_training_steps" hold different
    // values -- confirm which one the trainer uses as the stopping bound.
    "max_steps": 1000000,
    "total_training_steps": 800000,
    "save_summary_steps": 1000,
    "save_checkpoints_steps": 5000,
    "valid_interval": 5000,
    "keep_checkpoint_max": 100
  }
}