# Make_An_Audio/configs/text_to_audio/txt2audio_args.yaml
model:
  base_learning_rate: 1.0e-05
  target: ldm.models.diffusion.ddpm_audio.LatentDiffusion_audio
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 32 # unused
    mel_dim: 10 # 80 // 2^3
    mel_length: 78 # 624 // 2^3
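    # Shape check (a reading of the comments above, not an extra setting):
    # the first stage downsamples the 80 x 624 mel spectrogram by
    # 2^(len(ch_mult)-1) = 2^3, so diffusion runs on a
    # channels x mel_dim x mel_length = 4 x 10 x 78 latent.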
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_by_std: true
    use_ema: false
    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [10000]
        cycle_lengths: [10000000000000]
        f_start: [1.e-6]
        f_max: [1.]
        f_min: [1.]
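    # In the stock latent-diffusion LambdaLinearScheduler these f_* values are
    # multipliers on base_learning_rate: the factor ramps linearly from f_start
    # to f_max over the 10000 warm-up steps, then moves toward f_min over
    # cycle_lengths; with a cycle this long the post-warm-up decay is
    # negligible, so the effective lr stays at roughly 1.0e-05.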
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions:
        - 1
        - 2
        num_res_blocks: 2
        channel_mult: # num_down = len(ch_mult)-1
        - 1
        - 2
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 1024
        use_checkpoint: true
        legacy: false
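    # Wiring notes (derived from this config, not additional settings):
    # context_dim 1024 must match the width of the conditioner's output tokens
    # (the CLAP text embedding below), and attention_resolutions [1, 2] places
    # (cross-)attention at downsampling factors 1x and 2x of the 10 x 78 latent.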
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ckpt_path:
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 624
          in_channels: 1
          out_ch: 1
          ch: 128
          ch_mult: [1, 2, 2, 4] # num_down = len(ch_mult)-1
          num_res_blocks: 2
          attn_resolutions: [78, 156]
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
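        # torch.nn.Identity makes the autoencoder's training loss a no-op:
        # the VAE acts purely as a frozen mel encoder/decoder here. Its
        # ckpt_path above is left empty; presumably its weights arrive with
        # the full model checkpoint referenced at the bottom of this file.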
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLAPEmbedder
      params:
        weights_path: useful_ckpts/CLAP/CLAP_weights_2022.pth
    ckpt_path: useful_ckpts/maa1_caps.ckpt
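For reference, a minimal sketch of how a config in this format is typically consumed, assuming the standard latent-diffusion utilities that Make-An-Audio builds on (`OmegaConf` and `ldm.util.instantiate_from_config`); the repo's own inference scripts may load it slightly differently:

```python
import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config  # standard latent-diffusion helper

# Load this YAML and build the LatentDiffusion_audio model from its "model" block.
config = OmegaConf.load("configs/text_to_audio/txt2audio_args.yaml")
model = instantiate_from_config(config.model)

# If ckpt_path under model.params points at useful_ckpts/maa1_caps.ckpt, the
# base DDPM class loads those weights during construction (init_from_ckpt).
# Otherwise a Lightning-style checkpoint can be loaded manually:
# sd = torch.load("useful_ckpts/maa1_caps.ckpt", map_location="cpu")
# model.load_state_dict(sd.get("state_dict", sd), strict=False)

# Inference only: the config freezes both the cond stage and the first stage.
model = model.eval().to("cuda" if torch.cuda.is_available() else "cpu")
```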