# AudioLCM / audiolcm.yaml
# liuhuadai's picture
# Upload audiolcm.yaml
# 4480dee verified
# Model specification: `target` names the class and `params` holds its
# arguments; the nested scheduler/unet/first_stage/cond_stage entries repeat
# the same target/params shape.
# NOTE(review): the nesting was reconstructed from the target/params pattern
# because the copy under review had no indentation - verify against upstream.
model:
  base_learning_rate: 3.0e-06
  target: ldm.models.diffusion.lcm_audio.LCM_audio
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    # Same value (20) as unet_config in_channels and the first-stage
    # embed_dim / z_channels below.
    mel_dim: 20
    mel_length: 312
    # NOTE(review): 0 is an unusual channel count - confirm it is intentional.
    channels: 0
    cond_stage_trainable: false
    conditioning_key: crossattn
    # NOTE(review): an `_ema` metric is monitored while use_ema is false -
    # confirm the metric is still logged in that mode.
    monitor: val/loss_simple_ema
    scale_by_std: true
    use_lcm: true
    num_ddim_timesteps: 50
    w_min: 4
    w_max: 12
    ckpt_path: ./useful_ckpt/LCM_audio/maa2.ckpt
    use_ema: false
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
          - 10000
        cycle_lengths:
          - 10000000000000
        f_start:
          - 1.0e-06
        # f_max and f_min carry the same value (1.0).
        f_max:
          - 1.0
        f_min:
          - 1.0
    unet_config:
      target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
      params:
        in_channels: 20
        context_dim: 1024
        hidden_size: 576
        num_heads: 8
        depth: 4
        max_len: 1000
    first_stage_config:
      target: ldm.models.autoencoder1d.AutoencoderKL
      params:
        embed_dim: 20
        monitor: val/rec_loss
        ckpt_path: ./useful_ckpt/AutoencoderKL/epoch=000032.ckpt
        ddconfig:
          double_z: true
          # in_channels and out_ch are both 80, the same value as
          # `mel_num` in the data section.
          in_channels: 80
          out_ch: 80
          z_channels: 20
          kernel_size: 5
          ch: 384
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_layers:
            - 3
          down_layers:
            - 0
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
      params:
        weights_path: ./useful_ckpt/FrozenCLAPFLANEmbedder/CLAP_weights_2022.pth
# Training-loop settings: the logging callback, trainer flags, and
# checkpoint options.
# NOTE(review): the nesting was reconstructed from the target/params pattern
# because the copy under review had no indentation - verify against upstream.
lightning:
  callbacks:
    image_logger:
      target: main.AudioLogger
      params:
        sample_rate: 16000
        for_specs: true
        increase_log_steps: false
        batch_frequency: 5000
        max_images: 8
        melvmin: -5
        melvmax: 1.5
        # NOTE(review): assumed to be an argument of the logger above
        # rather than a sibling callback - confirm.
        vocoder_cfg:
          target: vocoder.bigvgan.models.VocoderBigVGAN
          params:
            ckpt_vocoder: ./useful_ckpt/vocoder/logs/bigvnat16k93.5w
  trainer:
    benchmark: true
    gradient_clip_val: 1.0
    replace_sampler_ddp: false
    max_epochs: 100
  modelcheckpoint:
    params:
      # Checkpoints are ranked by the `epoch` value with mode `max`;
      # presumably this keeps the most recent ones - confirm against the
      # checkpoint callback's documentation.
      monitor: epoch
      mode: max
      # every_n_train_steps: 2000
      save_top_k: 100
      every_n_epochs: 3
# Data module: loader settings plus the train and validation dataset specs.
# NOTE(review): the nesting was reconstructed from the target/params pattern
# because the copy under review had no indentation - verify against upstream.
data:
  target: main.SpectrogramDataModuleFromConfig
  params:
    batch_size: 8
    num_workers: 32
    spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
    mel_num: 80
    train:
      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsTrain
      params:
        # Left empty in the source; written as an explicit null.
        # Presumably filled in by the data module at runtime - confirm.
        specs_dataset_cfg: null
    validation:
      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsValidation
      params:
        # Left empty in the source; written as an explicit null.
        specs_dataset_cfg: null
# Dataset spec used for testing.
# NOTE(review): placed at top level as a sibling of `data`; the copy under
# review had no indentation, so it may instead belong under data.params -
# confirm against the script that reads it.
test_dataset:
  target: ldm.data.tsvdataset.TSVDatasetStruct
  params:
    tsv_path: audiocaps_test_16000_struct.tsv
    # 624 is twice the `mel_length` value (312) given in the model section.
    spec_crop_len: 624