# Vision tokenizer settings (apparently a VQGAN checkpoint paired with a CLIP
# ViT encoder -- inferred from key names; confirm against the loader).
# NOTE(review): this content had been collapsed onto a single line, so the
# nesting below is reconstructed from key order; verify it against the
# consumer's schema.
freeze: true

# 578 = 24*24 (resolution) + 2 (presumably the <img> and <\img> boundary
# tokens; the opening tag seems to have been lost from the original comment).
# Corresponds to model_config.max_vision_token_length and
# dataset_config.image_size.
max_vision_token_length: 578

params:
  # NOTE(review): the original carried a bare "debug" remark between embed_dim
  # and ckpt_path; it is unclear whether it annotates embed_dim or marks
  # ckpt_path as a debug-only setting -- confirm.
  embed_dim: 1024  # debug
  ckpt_path: vqgan.ckpt
  codebook_size: 512
  num_codebook: 2

  ddconfig:
    # only_auto_encoder: true
    encoder_name: openai-clip-vit-large-patch14-336
    select_layer: [2, 10, 18, 22]
    double_z: false
    z_channels: 1024
    resolution: 336
    in_channels: 3
    out_ch: 3
    ch: 128
    # num_down = len(ch_mult) - 1
    ch_mult: [1, 1, 2, 4, 8]
    num_res_blocks: 2
    attn_resolutions: [24]
    dropout: 0.0
    # NOTE(review): the two keys below are assumed to belong to ddconfig
    # because they followed it directly; they may instead live under params.
    initial_resolution: 24
    num_attn_head: 8