# NOTE(review): `target` looks like a dotted path to the class to build and
# `params` the arguments passed to it — confirm against the config loader.
target: model.ControlLDM
params:
  # NOTE(review): presumably the scale applied to VAE latents (0.18215 is the
  # value commonly used with Stable Diffusion VAEs) — confirm.
  latent_scale_factor: 0.18215
  # UNet configuration. Every value here is identical to the same-named key in
  # controlnet_cfg in this file (controlnet_cfg has hint_channels instead of
  # out_channels).
  # NOTE(review): presumably the shared keys must stay in sync — confirm.
  unet_cfg:
    use_checkpoint: true
    image_size: 32  # unused
    in_channels: 4  # same value as vae_cfg.ddconfig.z_channels
    out_channels: 4
    model_channels: 320
    attention_resolutions: [ 4, 2, 1 ]
    num_res_blocks: 2
    channel_mult: [ 1, 2, 4, 4 ]
    num_head_channels: 64  # need to fix for flash-attn
    use_spatial_transformer: true
    use_linear_in_transformer: true
    transformer_depth: 1
    context_dim: 1024  # same value as clip_cfg.embed_dim
    legacy: false
  # VAE configuration. embed_dim and ddconfig.z_channels are both 4, the same
  # value as in_channels in unet_cfg and controlnet_cfg.
  vae_cfg:
    embed_dim: 4
    ddconfig:
      double_z: true
      z_channels: 4
      resolution: 256
      in_channels: 3
      out_ch: 3
      ch: 128
      # One multiplier per level; same list as channel_mult in unet_cfg.
      ch_mult: [ 1, 2, 4, 4 ]
      num_res_blocks: 2
      attn_resolutions: []
      dropout: 0.0
  # CLIP configuration, split into vision_cfg and text_cfg sub-sections.
  # embed_dim (1024) has the same value as context_dim in unet_cfg and
  # controlnet_cfg.
  # NOTE(review): presumably the CLIP embedding is what the UNet attends to —
  # confirm against the model code.
  clip_cfg:
    embed_dim: 1024
    vision_cfg:
      image_size: 224
      layers: 32
      width: 1280
      head_width: 80
      patch_size: 14
    text_cfg:
      context_length: 77
      vocab_size: 49408
      width: 1024
      heads: 16
      layers: 24
    # NOTE(review): presumably selects which encoder layer's output is used —
    # confirm the accepted values in the CLIP wrapper.
    layer: "penultimate"
  # ControlNet configuration. Every key it shares with unet_cfg has the same
  # value in this file; it has hint_channels where unet_cfg has out_channels.
  # NOTE(review): presumably the shared keys must stay in sync — confirm.
  controlnet_cfg:
    use_checkpoint: true
    image_size: 32  # unused
    in_channels: 4  # same value as vae_cfg.ddconfig.z_channels
    hint_channels: 4
    model_channels: 320
    attention_resolutions: [ 4, 2, 1 ]
    num_res_blocks: 2
    channel_mult: [ 1, 2, 4, 4 ]
    num_head_channels: 64  # need to fix for flash-attn
    use_spatial_transformer: true
    use_linear_in_transformer: true
    transformer_depth: 1
    context_dim: 1024  # same value as clip_cfg.embed_dim
    legacy: false