# Configuration for a spatio-temporal conditional UNet and the DDPM noise
# scheduler used with it.
#
# NOTE(review): this file was restored from a single collapsed line; the
# nesting below (globals / unet / noise_scheduler as top-level siblings) is
# reconstructed from the key order -- confirm against the loader.

# Shared values; other sections reference them via ${globals.<key>}.
globals:
  target_fps: 32
  target_nframes: 64

unet:
  # presumably the class the loader instantiates from this section -- verify
  _class_name: UNetSpatioTemporalConditionModel
  addition_time_embed_dim: 1
  # Each of the per-block lists in this section has four entries.
  block_out_channels:
    - 128
    - 256
    - 256
    - 512
  cross_attention_dim: 1
  down_block_types:
    - CrossAttnDownBlockSpatioTemporal
    - CrossAttnDownBlockSpatioTemporal
    - CrossAttnDownBlockSpatioTemporal
    - DownBlockSpatioTemporal
  in_channels: 8
  layers_per_block: 2
  num_attention_heads:
    - 8
    - 16
    - 16
    - 32
  # Kept in sync with globals.target_nframes through interpolation.
  num_frames: ${globals.target_nframes}
  out_channels: 4
  projection_class_embeddings_input_dim: 1
  sample_size: 14
  transformer_layers_per_block: 1
  up_block_types:
    - UpBlockSpatioTemporal
    - CrossAttnUpBlockSpatioTemporal
    - CrossAttnUpBlockSpatioTemporal
    - CrossAttnUpBlockSpatioTemporal

noise_scheduler:
  _class_name: DDPMScheduler
  num_train_timesteps: 1000
  beta_start: 0.0001
  beta_end: 0.02
  beta_schedule: linear  # linear, scaled_linear, or squaredcos_cap_v2
  # fixed_small, fixed_small_log, fixed_large, fixed_large_log, learned or
  # learned_range
  variance_type: fixed_small
  clip_sample: true
  clip_sample_range: 4.0  # default 1
  prediction_type: v_prediction  # epsilon, sample, v_prediction
  thresholding: false  # do not touch
  dynamic_thresholding_ratio: 0.995  # unused
  sample_max_value: 1.0  # unused
  timestep_spacing: "leading"
  # NOTE(review): the collapsed original does not show whether steps_offset
  # was an active key or already commented out; it is left disabled here.
  # steps_offset: 0  # unused