model_name: EzAudio-L-Energy

model:
  mae: true
  mae_prob: 0.25
  mask_ratio: [0.25, 1.0]
  mask_span: 10
  img_size: 500
  patch_size: 1
  in_chans: 257
  out_chans: 128
  input_type: '1d'
  embed_dim: 1024
  depth: 24
  num_heads: 16
  mlp_ratio: 4.0
  qkv_bias: false
  qk_scale: null
  qk_norm: layernorm
  norm_layer: layernorm
  act_layer: geglu
  context_norm: true
  use_checkpoint: true
  time_fusion: 'ada_lora_bias'
  ada_lora_rank: 32
  ada_lora_alpha: 32
  cls_dim: null
  context_dim: 1024
  context_fusion: 'cross'
  context_max_length: null
  context_pe_method: 'none'
  pe_method: 'none'
  rope_mode: 'shared'
  use_conv: true
  skip: true
  skip_norm: true

controlnet:
  cond_in: 1
  cond_blocks: [64, 128]
  cond_mask: true
  cond_mask_prob: 0.25
  cond_mask_ratio: [0.25, 0.50]
  cond_mask_span: 10

conditioner:
  condition_type: energy
  hop_size: 240
  window_size: 1920
  padding: 'reflect'
  min_db: -60
  norm: true

autoencoder:
  name: stable_vae
  dim: 128
  sr: 24000
  latent_sr: 50
  # other studies usually set q_first to false
  q_first: true
  scale: 1.0
  shift: 0.0

# a fixed length should be set when using concat mode
# a fixed length should be set for distributed training
text_encoder:
  model: google/flan-t5-large
  max_length: 100
  cfg: 0.1

diff:
  num_train_timesteps: 1000
  beta_schedule: 'scaled_linear'
  beta_start: 0.00085
  beta_end: 0.012
  prediction_type: 'v_prediction'
  rescale_betas_zero_snr: true
  timestep_spacing: 'trailing'
  clip_sample: false
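
# Minimal loading sketch (assumption: this config is consumed by a Python
# training/inference script via PyYAML; the filename below is hypothetical):
#
#   import yaml
#
#   with open("EzAudio-L-Energy.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   model_args = cfg["model"]   # transformer backbone hyperparameters
#   diff_args = cfg["diff"]     # diffusion noise-schedule settings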