model:
  base_learning_rate: 4.5e-06
  target: taming.models.cond_transformer.Net2NetTransformer
  params:
    cond_stage_key: objects_bbox
    transformer_config:
      target: taming.modules.transformer.mingpt.GPT
      params:
        vocab_size: 8192
        block_size: 348  # = 256 + 92 = dim(vqgan_latent_space, 16x16) + dim(conditional_builder.embedding_dim)
        n_layer: 40
        n_head: 16
        n_embd: 1408
        embd_pdrop: 0.1
        resid_pdrop: 0.1
        attn_pdrop: 0.1
    first_stage_config:
      target: taming.models.vqgan.VQModel
      params:
        ckpt_path: /path/to/coco_epoch117.ckpt  # https://heibox.uni-heidelberg.de/f/78dea9589974474c97c1/
        embed_dim: 256
        n_embed: 8192
        ddconfig:
          double_z: false
          z_channels: 256
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 1
          - 2
          - 2
          - 4
          num_res_blocks: 2
          attn_resolutions:
          - 16
          dropout: 0.0
        lossconfig:
          target: taming.modules.losses.DummyLoss
    cond_stage_config:
      target: taming.models.dummy_cond_stage.DummyCondStage
      params:
        conditional_key: objects_bbox

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 6
    train:
      target: taming.data.annotated_objects_coco.AnnotatedObjectsCoco
      params:
        data_path: data/coco_annotations_100  # substitute with path to full dataset
        split: train
        keys: [image, objects_bbox, file_name, annotations]
        no_tokens: 8192
        target_image_size: 256
        min_object_area: 0.00001
        min_objects_per_image: 2
        max_objects_per_image: 30
        crop_method: random-1d
        random_flip: true
        use_group_parameter: true
        encode_crop: true
    validation:
      target: taming.data.annotated_objects_coco.AnnotatedObjectsCoco
      params:
        data_path: data/coco_annotations_100  # substitute with path to full dataset
        split: validation
        keys: [image, objects_bbox, file_name, annotations]
        no_tokens: 8192
        target_image_size: 256
        min_object_area: 0.00001
        min_objects_per_image: 2
        max_objects_per_image: 30
        crop_method: center
        random_flip: false
        use_group_parameter: true
        encode_crop: true
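
# Usage sketch (assumptions: the standard taming-transformers main.py entry point,
# and that this file is saved as configs/coco_scene_images_transformer.yaml).
# main.py reads the config with OmegaConf and instantiates each `target` class with
# its `params`; training could then be launched with, e.g.:
#   python main.py --base configs/coco_scene_images_transformer.yaml -t True --gpus 0,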