---
# Training / sampling configuration for an sgm `DiffusionEngine` video model.
#
# NOTE(review): this file was previously collapsed onto two physical lines,
# which is not parseable YAML (nested `key: value` pairs inside one plain
# scalar). The block structure below restores the nesting implied by the
# `target` / `params` pairing; every key and value is unchanged. Confirm the
# nesting against the consuming classes before relying on it.
#
# Convention used throughout: `target` is a dotted import path and `params`
# is presumably passed to it as keyword arguments -- TODO confirm with the
# loader that consumes this file.
model:
  base_learning_rate: 1.0e-04
  target: sgm.models.video_diffusion.DiffusionEngine
  params:
    ckpt_path: ./ckpts/V3D_512.ckpt
    scale_factor: 0.18215
    disable_first_stage_autocast: true
    input_key: latents
    log_keys: []

    # Learning-rate schedule. The f_* values are presumably multipliers on
    # base_learning_rate -- TODO confirm in LambdaLinearScheduler.
    scheduler_config:
      target: sgm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps:
          - 1
        cycle_lengths:
          - 10000000000000
        f_start:
          - 1.0e-06
        f_max:
          - 1.0
        f_min:
          - 1.0

    # Denoiser wrapper and its scaling rule.
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    # Denoising network (VideoUNet).
    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: true
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions:
          - 4
          - 2
          - 1
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 4
          - 4
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: true
        use_spatial_context: true
        merge_strategy: learned_with_images
        video_kernel_size:
          - 3
          - 1
          - 1

    # Conditioning embedders: one list entry per `input_key`.
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: false
            ucg_rate: 0.2
            input_key: cond_frames_without_noise
            target: sgm.modules.encoders.modules.IdentityEncoder
          - input_key: fps_id
            is_trainable: true
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - input_key: motion_bucket_id
            is_trainable: true
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - input_key: cond_frames
            is_trainable: false
            ucg_rate: 0.2
            target: sgm.modules.encoders.modules.IdentityEncoder
          - input_key: cond_aug
            is_trainable: true
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    # First-stage autoencoder: image Encoder paired with a VideoDecoder.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencodingEngine
      params:
        loss_config:
          target: torch.nn.Identity
        regularizer_config:
          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
        encoder_config:
          target: sgm.modules.diffusionmodules.model.Encoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult:
              - 1
              - 2
              - 4
              - 4
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
        decoder_config:
          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
          params:
            attn_type: vanilla
            double_z: true
            z_channels: 4
            resolution: 256
            in_channels: 3
            out_ch: 3
            ch: 128
            ch_mult:
              - 1
              - 2
              - 4
              - 4
            num_res_blocks: 2
            attn_resolutions: []
            dropout: 0.0
            video_kernel_size:
              - 3
              - 1
              - 1

    # Sampler: discretization and guidance settings.
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 30
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            # max_scale equals min_scale here.
            max_scale: 3.5
            min_scale: 3.5
            num_frames: 18

    # Training loss: loss weighting and sigma sampling.
    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        batch2model_keys:
          - num_video_frames
          - image_only_indicator
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
          params:
            sigma_data: 1.0
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
          params:
            p_mean: 1.5
            p_std: 2.0