# Model configuration.
#
# Every component below is described by a `target` (a dotted name) and, where
# present, a `params` mapping. Presumably the loader imports `target` and
# passes `params` as constructor arguments -- confirm against the consumer.
#
# NOTE(review): this document was rebuilt from a copy in which all line breaks
# and indentation had been lost. Keys, values and their order are unchanged;
# the nesting was inferred from the repeating `target`/`params` pattern and
# from the need to keep repeated keys (`target`, `params`, `in_channels`,
# `num_res_blocks`) in separate mappings. Verify the placement of
# `unet_config`, `first_stage_config`, `cond_stage_config` and `lossconfig`
# against the code that reads this file.
model:
  target: lvdm.models.ddpm3d.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.012
    num_timesteps_cond: 1
    timesteps: 1000
    first_stage_key: video
    cond_stage_key: caption
    cond_stage_trainable: false
    conditioning_key: crossattn
    # Two-element list; presumably height then width -- TODO confirm.
    image_size:
      - 40
      - 64
    channels: 4
    scale_by_std: false
    scale_factor: 0.18215
    use_ema: false
    uncond_type: empty_seq
    use_scale: true
    scale_b: 0.7

    # First sub-component: target is openaimodel3d.UNetModel.
    unet_config:
      target: lvdm.modules.networks.openaimodel3d.UNetModel
      params:
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions:
          - 4
          - 2
          - 1
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 4
          - 4
        num_head_channels: 64
        transformer_depth: 1
        context_dim: 1024
        use_linear: true
        use_checkpoint: true
        temporal_conv: true
        temporal_attention: true
        temporal_selfatt_only: true
        use_relative_position: false
        use_causal_attention: false
        temporal_length: 16
        addition_attention: true
        fps_cond: true

    # Second sub-component: target is autoencoder.AutoencoderKL.
    first_stage_config:
      target: lvdm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 512
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          # Explicit empty list (a bare `attn_resolutions:` would be null).
          attn_resolutions: []
          dropout: 0.0
        # Has a `target` only; no `params` are given for it.
        lossconfig:
          target: torch.nn.Identity

    # Third sub-component: target is condition.FrozenOpenCLIPEmbedder.
    cond_stage_config:
      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
      params:
        freeze: true
        layer: penultimate