File size: 4,007 Bytes
7e93a0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.18215
    disable_first_stage_autocast: True
    ckpt_path: checkpoints/svd_image_decoder.safetensors

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: True
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: True
        use_spatial_context: True
        merge_strategy: learned_with_images
        video_kernel_size: [3, 1, 1]

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: False
          input_key: cond_frames_without_noise
          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
          params:
            n_cond_frames: 1
            n_copies: 1
            open_clip_embedding_config:
              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
              params:
                freeze: True

        - input_key: fps_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: motion_bucket_id
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - input_key: cond_frames
          is_trainable: False
          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
          params:
            disable_encoder_autocast: True
            n_cond_frames: 1
            n_copies: 1
            is_ae: True
            encoder_config:
              target: sgm.models.autoencoder.AutoencoderKLModeOnly
              params:
                embed_dim: 4
                monitor: val/rec_loss
                ddconfig:
                  attn_type: vanilla-xformers
                  double_z: True
                  z_channels: 4
                  resolution: 256
                  in_channels: 3
                  out_ch: 3
                  ch: 128
                  ch_mult: [1, 2, 4, 4]
                  num_res_blocks: 2
                  attn_resolutions: []
                  dropout: 0.0
                lossconfig:
                  target: torch.nn.Identity

        - input_key: cond_aug
          is_trainable: False
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: True
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            max_scale: 2.5
            min_scale: 1.0