model_name: EzAudio-L-Energy

model:
  mae: true
  mae_prob: 0.25
  mask_ratio: [0.25, 1.0]
  mask_span: 10
  img_size: 500
  patch_size: 1
  in_chans: 257
  out_chans: 128
  input_type: '1d'
  embed_dim: 1024
  depth: 24
  num_heads: 16
  mlp_ratio: 4.0
  qkv_bias: false
  qk_scale: null
  qk_norm: layernorm
  norm_layer: layernorm
  act_layer: geglu
  context_norm: true
  use_checkpoint: true
  time_fusion: 'ada_lora_bias'
  ada_lora_rank: 32
  ada_lora_alpha: 32
  cls_dim: null
  context_dim: 1024
  context_fusion: 'cross'
  context_max_length: null
  context_pe_method: 'none'
  pe_method: 'none'
  rope_mode: 'shared'
  use_conv: true
  skip: true
  skip_norm: true

controlnet:
  cond_in: 1
  cond_blocks: [64, 128]
  cond_mask: true
  cond_mask_prob: 0.25
  cond_mask_ratio: [0.25, 0.50]
  cond_mask_span: 10

conditioner:
  condition_type: energy
  hop_size: 240
  window_size: 1920
  padding: 'reflect'
  min_db: -60
  norm: true

autoencoder:
  name: stable_vae
  dim: 128
  sr: 24000
  latent_sr: 50
  # other studies usually set q_first to false
  q_first: true
  scale: 1.0
  shift: 0.0

# a fixed length should be set when using concat mode
# a fixed length should be set for distributed training
text_encoder:
  model: google/flan-t5-large
  max_length: 100
  cfg: 0.1

diff:
  num_train_timesteps: 1000
  beta_schedule: 'scaled_linear'
  beta_start: 0.00085
  beta_end: 0.012
  prediction_type: 'v_prediction'
  rescale_betas_zero_snr: true
  timestep_spacing: 'trailing'
  clip_sample: false
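
# Minimal loading sketch (assumption: this config is consumed by a Python
# training/inference script via PyYAML; the filename below is hypothetical):
#
#   import yaml
#
#   with open("EzAudio-L-Energy.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   model_args = cfg["model"]   # transformer backbone hyperparameters
#   diff_args = cfg["diff"]     # diffusion noise-schedule settings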