haoheliu committed
Commit 75e7a77
1 Parent(s): b700c02

Update audioldm/utils.py

Files changed (1)
  1. audioldm/utils.py +103 -2
audioldm/utils.py CHANGED
@@ -69,5 +69,106 @@ def instantiate_from_config(config):
         raise KeyError("Expected key `target` to instantiate.")
     return get_obj_from_str(config["target"])(**config.get("params", dict()))
 
-def default_audioldm_config():
-    return {'wave_file_save_path': './output', 'id': {'version': 'v1', 'name': 'default', 'root': '/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml'}, 'model': {'device': 'cuda', 'reload_from_ckpt': '/mnt/fast/nobackup/scratch4weeks/hl01486/exps/audio_generation/stablediffusion/LDM/audioverse/2023_01_14_full_F4_B_spatial_v2_v1/checkpoints/last.ckpt', 'target': 'audioldm.pipline.LatentDiffusion', 'params': {'base_learning_rate': 5e-06, 'linear_start': 0.0015, 'linear_end': 0.0195, 'num_timesteps_cond': 1, 'log_every_t': 200, 'timesteps': 1000, 'first_stage_key': 'fbank', 'cond_stage_key': 'waveform', 'latent_t_size': 256, 'latent_f_size': 16, 'channels': 8, 'cond_stage_trainable': True, 'conditioning_key': 'film', 'monitor': 'val/loss_simple_ema', 'scale_by_std': True, 'unet_config': {'target': 'audioldm.latent_diffusion.openaimodel.UNetModel', 'params': {'image_size': 64, 'extra_film_condition_dim': 512, 'extra_film_use_concat': True, 'in_channels': 8, 'out_channels': 8, 'model_channels': 128, 'attention_resolutions': [8, 4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 3, 5], 'num_head_channels': 32, 'use_spatial_transformer': True}}, 'first_stage_config': {'base_learning_rate': 4.5e-05, 'target': 'audioldm.variational_autoencoder.autoencoder.AutoencoderKL', 'params': {'monitor': 'val/rec_loss', 'image_key': 'fbank', 'subband': 1, 'embed_dim': 8, 'time_shuffle': 1, 'ddconfig': {'double_z': True, 'z_channels': 8, 'resolution': 256, 'downsample_time': False, 'in_channels': 1, 'out_ch': 1, 'ch': 128, 'ch_mult': [1, 2, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}}}, 'cond_stage_config': {'target': 'audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2', 'params': {'key': 'waveform', 'sampling_rate': 16000, 'embed_mode': 'audio', 'unconditional_prob': 0.1}}}}}
+def default_audioldm_config(model_name="audioldm-s-full"):
+    basic_config = {
+        "wave_file_save_path": "./output",
+        "id": {
+            "version": "v1",
+            "name": "default",
+            "root": "/mnt/fast/nobackup/users/hl01486/projects/general_audio_generation/AudioLDM-python/config/default/latent_diffusion.yaml",
+        },
+        "preprocessing": {
+            "audio": {"sampling_rate": 16000, "max_wav_value": 32768},
+            "stft": {"filter_length": 1024, "hop_length": 160, "win_length": 1024},
+            "mel": {
+                "n_mel_channels": 64,
+                "mel_fmin": 0,
+                "mel_fmax": 8000,
+                "freqm": 0,
+                "timem": 0,
+                "blur": False,
+                "mean": -4.63,
+                "std": 2.74,
+                "target_length": 1024,
+            },
+        },
+        "model": {
+            "device": "cuda",
+            "target": "audioldm.pipline.LatentDiffusion",
+            "params": {
+                "base_learning_rate": 5e-06,
+                "linear_start": 0.0015,
+                "linear_end": 0.0195,
+                "num_timesteps_cond": 1,
+                "log_every_t": 200,
+                "timesteps": 1000,
+                "first_stage_key": "fbank",
+                "cond_stage_key": "waveform",
+                "latent_t_size": 256,
+                "latent_f_size": 16,
+                "channels": 8,
+                "cond_stage_trainable": True,
+                "conditioning_key": "film",
+                "monitor": "val/loss_simple_ema",
+                "scale_by_std": True,
+                "unet_config": {
+                    "target": "audioldm.latent_diffusion.openaimodel.UNetModel",
+                    "params": {
+                        "image_size": 64,
+                        "extra_film_condition_dim": 512,
+                        "extra_film_use_concat": True,
+                        "in_channels": 8,
+                        "out_channels": 8,
+                        "model_channels": 128,
+                        "attention_resolutions": [8, 4, 2],
+                        "num_res_blocks": 2,
+                        "channel_mult": [1, 2, 3, 5],
+                        "num_head_channels": 32,
+                        "use_spatial_transformer": True,
+                    },
+                },
+                "first_stage_config": {
+                    "base_learning_rate": 4.5e-05,
+                    "target": "audioldm.variational_autoencoder.autoencoder.AutoencoderKL",
+                    "params": {
+                        "monitor": "val/rec_loss",
+                        "image_key": "fbank",
+                        "subband": 1,
+                        "embed_dim": 8,
+                        "time_shuffle": 1,
+                        "ddconfig": {
+                            "double_z": True,
+                            "z_channels": 8,
+                            "resolution": 256,
+                            "downsample_time": False,
+                            "in_channels": 1,
+                            "out_ch": 1,
+                            "ch": 128,
+                            "ch_mult": [1, 2, 4],
+                            "num_res_blocks": 2,
+                            "attn_resolutions": [],
+                            "dropout": 0.0,
+                        },
+                    },
+                },
+                "cond_stage_config": {
+                    "target": "audioldm.clap.encoders.CLAPAudioEmbeddingClassifierFreev2",
+                    "params": {
+                        "key": "waveform",
+                        "sampling_rate": 16000,
+                        "embed_mode": "audio",
+                        "unconditional_prob": 0.1,
+                    },
+                },
+            },
+        },
+    }
+
+    if "-l-" in model_name:
+        basic_config["model"]["params"]["unet_config"]["params"]["model_channels"] = 256
+        basic_config["model"]["params"]["unet_config"]["params"]["num_head_channels"] = 64
+    elif "-m-" in model_name:
+        basic_config["model"]["params"]["unet_config"]["params"]["model_channels"] = 192
+        basic_config["model"]["params"]["cond_stage_config"]["params"]["amodel"] = "HTSAT-base"  # this model uses a larger HTSAT
+
+    return basic_config
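
A minimal usage sketch of the new model_name switch (not part of the commit itself): it assumes the package is importable as audioldm, and the model names are illustrative, since the helper only checks for the "-l-" / "-m-" substrings.

from audioldm.utils import default_audioldm_config

# Default ("-s-" small model): base UNet width of 128 channels.
cfg_s = default_audioldm_config("audioldm-s-full")
assert cfg_s["model"]["params"]["unet_config"]["params"]["model_channels"] == 128

# "-l-" variant: wider UNet (256 channels) with 64 head channels.
cfg_l = default_audioldm_config("audioldm-l-full")
assert cfg_l["model"]["params"]["unet_config"]["params"]["model_channels"] == 256
assert cfg_l["model"]["params"]["unet_config"]["params"]["num_head_channels"] == 64

# "-m-" variant: 192 UNet channels plus the larger HTSAT-base CLAP audio encoder.
cfg_m = default_audioldm_config("audioldm-m-full")
assert cfg_m["model"]["params"]["unet_config"]["params"]["model_channels"] == 192
assert cfg_m["model"]["params"]["cond_stage_config"]["params"]["amodel"] == "HTSAT-base"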