---
# Model configuration referencing a VQGAN checkpoint (`vqgan.ckpt`) and a
# CLIP ViT-L/14 (336 px) encoder.
# NOTE(review): the code that consumes this file is not visible from here;
# notes marked "presumably" / "TODO confirm" should be verified against it.

# Presumably keeps this module's weights fixed during training - TODO confirm.
freeze: true

# 578 = 24 * 24 + 2: a 24x24 grid plus 2 image delimiter tokens (the opening
# <img> tag and its closing tag).
# Corresponds to model_config.max_vision_token_length and
# dataset_config.image_size; presumably these must be changed together.
max_vision_token_length: 578

params:
  embed_dim: 1024  # marked "debug" by the original author - TODO confirm
  ckpt_path: vqgan.ckpt
  codebook_size: 512
  num_codebook: 2
  ddconfig:
    # Disabled option, kept for reference:
    # only_auto_encoder: true
    encoder_name: openai-clip-vit-large-patch14-336
    select_layer: [2, 10, 18, 22]
    double_z: false
    z_channels: 1024  # same value as embed_dim above
    # 336 / 14 = 24 if the encoder uses 14-pixel patches, as its name
    # suggests; that would match initial_resolution below - TODO confirm.
    resolution: 336
    in_channels: 3
    out_ch: 3
    ch: 128
    ch_mult: [1, 1, 2, 4, 8]  # num_down = len(ch_mult) - 1
    num_res_blocks: 2
    attn_resolutions: [24]
    dropout: 0.0
    initial_resolution: 24
    num_attn_head: 8