freeze: true
max_vision_token_length: 578  # 24*24 (resolution) + 2 (<img> and </img>); corresponding to model_config.max_vision_token_length, dataset_config.image_size
params:
  embed_dim: 1024  # debug
  ckpt_path: vqgan.ckpt
  codebook_size: 512
  num_codebook: 2
  ddconfig:
    # only_auto_encoder: true
    encoder_name: openai-clip-vit-large-patch14-336
    select_layer: [2, 10, 18, 22]
    double_z: false
    z_channels: 1024
    resolution: 336  # 336
    in_channels: 3
    out_ch: 3
    ch: 128
    ch_mult: [1, 1, 2, 4, 8]  # num_down = len(ch_mult)-1
    num_res_blocks: 2
    attn_resolutions: [24]
    dropout: 0.0
    initial_resolution: 24
    num_attn_head: 8