# Extra keyword arguments grouped under `unet_additional_kwargs`.
# NOTE(review): the original indentation was lost; the nesting below is
# reconstructed from the key names (the `motion_module_*` settings and the
# `motion_module_kwargs` sub-mapping) - confirm against the code that
# consumes this file.
unet_additional_kwargs:
  use_inflated_groupnorm: true
  unet_use_cross_frame_attention: false
  unet_use_temporal_attention: false
  use_motion_module: true
  use_audio_module: true
  motion_module_resolutions:
    - 1
    - 2
    - 4
    - 8
  motion_module_mid_block: true
  motion_module_decoder_only: false
  motion_module_type: Vanilla
  # Options for the motion module selected by `motion_module_type`
  # (presumably forwarded to it as keyword arguments - TODO confirm).
  motion_module_kwargs:
    num_attention_heads: 8
    num_transformer_block: 1
    attention_block_types:
      - Temporal_Self
      - Temporal_Self
    temporal_position_encoding: true
    temporal_position_encoding_max_len: 32
    temporal_attention_dim_div: 1
  audio_attention_dim: 768
  stack_enable_blocks_name:
    - "up"
    - "down"
    - "mid"
  stack_enable_blocks_depth: [0, 1, 2, 3]
# Boolean switch; see the "Zero-SNR params" entries under
# `noise_scheduler_kwargs` for the related scheduler settings.
enable_zero_snr: true
# Settings grouped under `noise_scheduler_kwargs` (presumably passed to the
# noise scheduler as keyword arguments - confirm against the loader).
# NOTE(review): nesting reconstructed after the original indentation was lost.
noise_scheduler_kwargs:
  beta_start: 0.00085
  beta_end: 0.012
  beta_schedule: "linear"
  clip_sample: false
  steps_offset: 1
  # Zero-SNR params
  prediction_type: "v_prediction"
  rescale_betas_zero_snr: true
  timestep_spacing: "trailing"
# Name of the sampler to use, stored as a plain string.
sampler: DDIM