liuhuadai committed on
Commit
db95580
1 Parent(s): fef56b2

Upload 3 files

Browse files
configs/audiolcm.yaml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 3.0e-06
3
+ target: ldm.models.diffusion.lcm_audio.LCM_audio
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.012
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: image
11
+ cond_stage_key: caption
12
+ mel_dim: 20
13
+ mel_length: 312
14
+ channels: 0
15
+ cond_stage_trainable: False
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_by_std: true
19
+ use_lcm: True
20
+ num_ddim_timesteps: 50
21
+ w_min: 4
22
+ w_max: 12
23
+ ckpt_path: ./ckpt/maa2.ckpt
24
+
25
+ use_ema: false
26
+ scheduler_config:
27
+ target: ldm.lr_scheduler.LambdaLinearScheduler
28
+ params:
29
+ warm_up_steps:
30
+ - 10000
31
+ cycle_lengths:
32
+ - 10000000000000
33
+ f_start:
34
+ - 1.0e-06
35
+ f_max:
36
+ - 1.0
37
+ f_min:
38
+ - 1.0
39
+ unet_config:
40
+ target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
41
+ params:
42
+ in_channels: 20
43
+ context_dim: 1024
44
+ hidden_size: 576
45
+ num_heads: 8
46
+ depth: 4
47
+ max_len: 1000
48
+ first_stage_config:
49
+ target: ldm.models.autoencoder1d.AutoencoderKL
50
+ params:
51
+ embed_dim: 20
52
+ monitor: val/rec_loss
53
+ ckpt_path: ./model/AutoencoderKL/epoch=000032.ckpt
54
+ ddconfig:
55
+ double_z: true
56
+ in_channels: 80
57
+ out_ch: 80
58
+ z_channels: 20
59
+ kernel_size: 5
60
+ ch: 384
61
+ ch_mult:
62
+ - 1
63
+ - 2
64
+ - 4
65
+ num_res_blocks: 2
66
+ attn_layers:
67
+ - 3
68
+ down_layers:
69
+ - 0
70
+ dropout: 0.0
71
+ lossconfig:
72
+ target: torch.nn.Identity
73
+ cond_stage_config:
74
+ target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
75
+ params:
76
+ weights_path: ./model/FrozenCLAPFLANEmbedder/CLAP_weights_2022.pth
77
+
78
+ lightning:
79
+ callbacks:
80
+ image_logger:
81
+ target: main.AudioLogger
82
+ params:
83
+ sample_rate: 16000
84
+ for_specs: true
85
+ increase_log_steps: false
86
+ batch_frequency: 5000
87
+ max_images: 8
88
+ melvmin: -5
89
+ melvmax: 1.5
90
+ vocoder_cfg:
91
+ target: vocoder.bigvgan.models.VocoderBigVGAN
92
+ params:
93
+ ckpt_vocoder: ./vocoder/logs/bigvnat16k93.5w
94
+ trainer:
95
+ benchmark: True
96
+ gradient_clip_val: 1.0
97
+ replace_sampler_ddp: false
98
+ max_epochs: 100
99
+ modelcheckpoint:
100
+ params:
101
+ monitor: epoch
102
+ mode: max
103
+ # every_n_train_steps: 2000
104
+ save_top_k: 100
105
+ every_n_epochs: 3
106
+
107
+
108
+ data:
109
+ target: main.SpectrogramDataModuleFromConfig
110
+ params:
111
+ batch_size: 8
112
+ num_workers: 32
113
+ spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
114
+ mel_num: 80
115
+ train:
116
+ target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsTrain
117
+ params:
118
+ specs_dataset_cfg:
119
+ validation:
120
+ target: ldm.data.joinaudiodataset_struct_anylen.JoinSpecsValidation
121
+ params:
122
+ specs_dataset_cfg:
123
+
124
+ test_dataset:
125
+ target: ldm.data.tsvdataset.TSVDatasetStruct
126
+ params:
127
+ tsv_path: audiocaps_test_16000_struct.tsv
128
+ spec_crop_len: 624
129
+
130
+
configs/autoencoder1d.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 4.5e-06
3
+ target: ldm.models.autoencoder1d.AutoencoderKL
4
+ params:
5
+ embed_dim: 20
6
+ monitor: val/rec_loss
7
+ ddconfig:
8
+ double_z: true
9
+ in_channels: 80
10
+ out_ch: 80
11
+ z_channels: 20
12
+ kernel_size: 5
13
+ ch: 384
14
+ ch_mult:
15
+ - 1
16
+ - 2
17
+ - 4
18
+ num_res_blocks: 2
19
+ attn_layers:
20
+ - 3
21
+ down_layers:
22
+ - 0
23
+ dropout: 0.0
24
+ lossconfig:
25
+ target: ldm.modules.losses_audio.contperceptual.LPAPSWithDiscriminator
26
+ params:
27
+ disc_start: 80001
28
+ perceptual_weight: 0.0
29
+ kl_weight: 1.0e-06
30
+ disc_weight: 0.5
31
+ disc_in_channels: 1
32
+ disc_loss: mse
33
+ disc_factor: 2
34
+ disc_conditional: false
35
+ r1_reg_weight: 3
36
+
37
+ lightning:
38
+ callbacks:
39
+ image_logger:
40
+ target: main.AudioLogger
41
+ params:
42
+ for_specs: true
43
+ increase_log_steps: false
44
+ batch_frequency: 5000
45
+ max_images: 8
46
+ rescale: false
47
+ melvmin: -5
48
+ melvmax: 1.5
49
+ vocoder_cfg:
50
+ target: vocoder.bigvgan.models.VocoderBigVGAN
51
+ params:
52
+ ckpt_vocoder: vocoder/logs/bigvnat16k93.5w
53
+ trainer:
54
+ sync_batchnorm: false # not working with r1_regularization
55
+ strategy: ddp
56
+
57
+
58
+ data:
59
+ target: main.SpectrogramDataModuleFromConfig
60
+ params:
61
+ batch_size: 4
62
+ num_workers: 16
63
+ spec_dir_path: ldm/data/tsv_dirs/full_data/V1_new
64
+ mel_num: 80
65
+ spec_len: 624
66
+ spec_crop_len: 624
67
+ train:
68
+ target: ldm.data.joinaudiodataset_624.JoinSpecsTrain
69
+ params:
70
+ specs_dataset_cfg: null
71
+ validation:
72
+ target: ldm.data.joinaudiodataset_624.JoinSpecsValidation
73
+ params:
74
+ specs_dataset_cfg: null
configs/teacher.yaml ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 3.0e-06
3
+ target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.012
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: image
11
+ cond_stage_key: caption
12
+ mel_dim: 20
13
+ mel_length: 312
14
+ channels: 0
15
+ cond_stage_trainable: True
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_by_std: true
19
+ use_ema: false
20
+ scheduler_config:
21
+ target: ldm.lr_scheduler.LambdaLinearScheduler
22
+ params:
23
+ warm_up_steps:
24
+ - 10000
25
+ cycle_lengths:
26
+ - 10000000000000
27
+ f_start:
28
+ - 1.0e-06
29
+ f_max:
30
+ - 1.0
31
+ f_min:
32
+ - 1.0
33
+ unet_config:
34
+ target: ldm.modules.diffusionmodules.concatDiT.ConcatDiT2MLP
35
+ params:
36
+ in_channels: 20
37
+ context_dim: 1024
38
+ hidden_size: 576
39
+ num_heads: 8
40
+ depth: 4
41
+ max_len: 1000
42
+ first_stage_config:
43
+ target: ldm.models.autoencoder1d.AutoencoderKL
44
+ params:
45
+ embed_dim: 20
46
+ monitor: val/rec_loss
47
+ ckpt_path: logs/trainae/ckpt/epoch=000032.ckpt
48
+ ddconfig:
49
+ double_z: true
50
+ in_channels: 80
51
+ out_ch: 80
52
+ z_channels: 20
53
+ kernel_size: 5
54
+ ch: 384
55
+ ch_mult:
56
+ - 1
57
+ - 2
58
+ - 4
59
+ num_res_blocks: 2
60
+ attn_layers:
61
+ - 3
62
+ down_layers:
63
+ - 0
64
+ dropout: 0.0
65
+ lossconfig:
66
+ target: torch.nn.Identity
67
+ cond_stage_config:
68
+ target: ldm.modules.encoders.modules.FrozenCLAPFLANEmbedder
69
+ params:
70
+ weights_path: useful_ckpts/CLAP/CLAP_weights_2022.pth
71
+
72
+ lightning:
73
+ callbacks:
74
+ image_logger:
75
+ target: main.AudioLogger
76
+ params:
77
+ sample_rate: 16000
78
+ for_specs: true
79
+ increase_log_steps: false
80
+ batch_frequency: 5000
81
+ max_images: 8
82
+ melvmin: -5
83
+ melvmax: 1.5
84
+ vocoder_cfg:
85
+ target: vocoder.bigvgan.models.VocoderBigVGAN
86
+ params:
87
+ ckpt_vocoder: vocoder/logs/bigvnat16k93.5w
88
+ trainer:
89
+ benchmark: True
90
+ gradient_clip_val: 1.0
91
+ replace_sampler_ddp: false
92
+ modelcheckpoint:
93
+ params:
94
+ monitor: epoch
95
+ mode: max
96
+ save_top_k: 10
97
+ every_n_epochs: 5
98
+
99
+ data:
100
+ target: main.SpectrogramDataModuleFromConfig
101
+ params:
102
+ batch_size: 4
103
+ num_workers: 32
104
+ main_spec_dir_path: 'ldm/data/tsv_dirs/full_data/caps_struct'
105
+ other_spec_dir_path: 'ldm/data/tsv_dirs/full_data/V2'
106
+ mel_num: 80
107
+ train:
108
+ target: ldm.data.joinaudiodataset_struct_sample_anylen.JoinSpecsTrain
109
+ params:
110
+ specs_dataset_cfg:
111
+ validation:
112
+ target: ldm.data.joinaudiodataset_struct_sample_anylen.JoinSpecsValidation
113
+ params:
114
+ specs_dataset_cfg:
115
+
116
+ test_dataset:
117
+ target: ldm.data.tsvdataset.TSVDatasetStruct
118
+ params:
119
+ tsv_path: musiccap.tsv
120
+ spec_crop_len: 624
121
+