liuhuadai committed on
Commit
4480dee
1 Parent(s): ae64e7f

Upload audiolcm.yaml

Browse files
Files changed (1) hide show
  1. audiolcm.yaml +130 -0
audiolcm.yaml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 3.0e-06
3
+ target: ldm.models.diffusion.lcm_audio.LCM_audio
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.012
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: image
11
+ cond_stage_key: caption
12
+ mel_dim: 20
13
+ mel_length: 312
14
+ channels: 0
15
+ cond_stage_trainable: False
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_by_std: true
19
+ use_lcm: True
20
+ num_ddim_timesteps: 50
21
+ w_min: 4
22
+ w_max: 12
23
+ ckpt_path: ./useful_ckpt/LCM_audio/maa2.ckpt
24
+
25
+ use_ema: false
26
+ scheduler_config:
27
+ target: ldm.lr_scheduler.LambdaLinearScheduler
28
+ params:
29
+ warm_up_steps:
30
+ - 10000
31
+ cycle_lengths:
32
+ - 10000000000000
33
+ f_start:
34
+ - 1.0e-06
35
+ f_max:
36
+ - 1.0
37
+ f_min:
38
+ - 1.0
39
+ unet_config:
40
+ target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
41
+ params:
42
+ in_channels: 20
43
+ context_dim: 1024
44
+ hidden_size: 576
45
+ num_heads: 8
46
+ depth: 4
47
+ max_len: 1000
48
+ first_stage_config:
49
+ target: ldm.models.autoencoder1d.AutoencoderKL
50
+ params:
51
+ embed_dim: 20
52
+ monitor: val/rec_loss
53
+ ckpt_path: ./useful_ckpt/AutoencoderKL/epoch=000032.ckpt
54
+ ddconfig:
55
+ double_z: true
56
+ in_channels: 80
57
+ out_ch: 80
58
+ z_channels: 20
59
+ kernel_size: 5
60
+ ch: 384
61
+ ch_mult:
62
+ - 1
63
+ - 2
64
+ - 4
65
+ num_res_blocks: 2
66
+ attn_layers:
67
+ - 3
68
+ down_layers:
69
+ - 0
70
+ dropout: 0.0
71
+ lossconfig:
72
+ target: torch.nn.Identity
73
+ cond_stage_config:
74
+ target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
75
+ params:
76
+ weights_path: ./useful_ckpt/FrozenCLAPFLANEmbedder/CLAP_weights_2022.pth
77
+
78
+ lightning:
79
+ callbacks:
80
+ image_logger:
81
+ target: main.AudioLogger
82
+ params:
83
+ sample_rate: 16000
84
+ for_specs: true
85
+ increase_log_steps: false
86
+ batch_frequency: 5000
87
+ max_images: 8
88
+ melvmin: -5
89
+ melvmax: 1.5
90
+ vocoder_cfg:
91
+ target: vocoder.bigvgan.models.VocoderBigVGAN
92
+ params:
93
+ ckpt_vocoder: ./useful_ckpt/vocoder/logs/bigvnat16k93.5w
94
+ trainer:
95
+ benchmark: True
96
+ gradient_clip_val: 1.0
97
+ replace_sampler_ddp: false
98
+ max_epochs: 100
99
+ modelcheckpoint:
100
+ params:
101
+ monitor: epoch
102
+ mode: max
103
+ # every_n_train_steps: 2000
104
+ save_top_k: 100
105
+ every_n_epochs: 3
106
+
107
+
108
+ data:
109
+ target: main.SpectrogramDataModuleFromConfig
110
+ params:
111
+ batch_size: 8
112
+ num_workers: 32
113
+ spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
114
+ mel_num: 80
115
+ train:
116
+ target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsTrain
117
+ params:
118
+ specs_dataset_cfg:
119
+ validation:
120
+ target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsValidation
121
+ params:
122
+ specs_dataset_cfg:
123
+
124
+ test_dataset:
125
+ target: ldm.data.tsvdataset.TSVDatasetStruct
126
+ params:
127
+ tsv_path: audiocaps_test_16000_struct.tsv
128
+ spec_crop_len: 624
129
+
130
+