---
model:
  base_learning_rate: 3.0e-06
  target: ldm.models.diffusion.lcm_audio.LCM_audio
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    mel_dim: 20
    mel_length: 312
    channels: 0
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_by_std: true
    use_lcm: true
    num_ddim_timesteps: 50
    w_min: 4
    w_max: 12
    ckpt_path: ./useful_ckpt/LCM_audio/maa2.ckpt
    use_ema: false
    scheduler_config:
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
          - 10000
        cycle_lengths:
          - 10000000000000
        f_start:
          - 1.0e-06
        f_max:
          - 1.0
        f_min:
          - 1.0
    unet_config:
      target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
      params:
        in_channels: 20
        context_dim: 1024
        hidden_size: 576
        num_heads: 8
        depth: 4
        max_len: 1000
    first_stage_config:
      target: ldm.models.autoencoder1d.AutoencoderKL
      params:
        embed_dim: 20
        monitor: val/rec_loss
        ckpt_path: ./useful_ckpt/AutoencoderKL/epoch=000032.ckpt
        ddconfig:
          double_z: true
          in_channels: 80
          out_ch: 80
          z_channels: 20
          kernel_size: 5
          ch: 384
          ch_mult:
            - 1
            - 2
            - 4
          num_res_blocks: 2
          attn_layers:
            - 3
          down_layers:
            - 0
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
      params:
        weights_path: ./useful_ckpt/FrozenCLAPFLANEmbedder/CLAP_weights_2022.pth
lightning:
  callbacks:
    image_logger:
      target: main.AudioLogger
      params:
        sample_rate: 16000
        for_specs: true
        increase_log_steps: false
        batch_frequency: 5000
        max_images: 8
        melvmin: -5
        melvmax: 1.5
        vocoder_cfg:
          target: vocoder.bigvgan.models.VocoderBigVGAN
          params:
            ckpt_vocoder: ./useful_ckpt/vocoder/logs/bigvnat16k93.5w
  trainer:
    benchmark: true
    gradient_clip_val: 1.0
    replace_sampler_ddp: false
    max_epochs: 100
  modelcheckpoint:
    params:
      monitor: epoch
      mode: max
      # every_n_train_steps: 2000
      save_top_k: 100
      every_n_epochs: 3
data:
  target: main.SpectrogramDataModuleFromConfig
  params:
    batch_size: 8
    num_workers: 32
    spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
    mel_num: 80
    train:
      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsTrain
      params:
        # NOTE(review): intentionally empty (null) — presumably populated/merged at
        # runtime by the data module; confirm against main.SpectrogramDataModuleFromConfig.
        specs_dataset_cfg:
    validation:
      target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsValidation
      params:
        specs_dataset_cfg:
    test_dataset:
      target: ldm.data.tsvdataset.TSVDatasetStruct
      params:
        tsv_path: audiocaps_test_16000_struct.tsv
        spec_crop_len: 624