# -----------------------------------------------------------------------------
# Base config: VirTex pretraining for our "base" bicaptioning model:
# ResNet-50 + (L = 1, H = 1024) transformer trained for 500K iterations.
# -----------------------------------------------------------------------------

# Global run settings.
# NOTE(review): AMP presumably toggles automatic mixed precision and the
# CUDNN_* keys presumably map to the cuDNN backend flags - confirm against
# the code that consumes this config.
RANDOM_SEED: 0
AMP: true
CUDNN_BENCHMARK: true
CUDNN_DETERMINISTIC: false

# Dataset location, tokenizer, and image/caption preprocessing.
DATA:
  ROOT: "datasets/coco"
  TOKENIZER_MODEL: "datasets/vocab/coco_10k.model"
  VOCAB_SIZE: 10000

  # Indices reserved for special tokens.
  # NOTE(review): presumably positions in the tokenizer vocabulary - confirm
  # they agree with the tokenizer model referenced above.
  UNK_INDEX: 0
  SOS_INDEX: 1
  EOS_INDEX: 2
  MASK_INDEX: 3

  IMAGE_CROP_SIZE: 224
  MAX_CAPTION_LENGTH: 30

  # Image transforms are referenced by name.
  # NOTE(review): presumably applied in the order listed - confirm.
  IMAGE_TRANSFORM_TRAIN:
    - "random_resized_crop"
    - "horizontal_flip"
    - "color_jitter"
    - "normalize"

  IMAGE_TRANSFORM_VAL:
    - "smallest_resize"
    - "center_crop"
    - "normalize"

  USE_PERCENTAGE: 100.0
  USE_SINGLE_CAPTION: false

# Model architecture: visual backbone plus textual head.
MODEL:
  NAME: "virtex"

  # ResNet-50 backbone (see header); PRETRAINED and FROZEN are both off.
  VISUAL:
    NAME: "torchvision::resnet50"
    PRETRAINED: false
    FROZEN: false

  # The L1 / H1024 parts of the name correspond to the (L = 1, H = 1024)
  # transformer described in the header.
  TEXTUAL:
    NAME: "transdec_postnorm::L1_H1024_A16_F4096"
    DROPOUT: 0.1

# Optimizer and learning-rate schedule.
OPTIM:
  OPTIMIZER_NAME: "sgd"
  SGD_MOMENTUM: 0.9
  WEIGHT_DECAY: 0.0001

  LOOKAHEAD:
    USE: true
    ALPHA: 0.5
    STEPS: 5

  BATCH_SIZE: 256
  # Two learning rates are configured.
  # NOTE(review): CNN_LR presumably applies to the visual backbone and LR to
  # the remaining parameters - confirm against the optimizer factory.
  CNN_LR: 0.2
  LR: 0.001
  # 500K iterations, as stated in the header.
  NUM_ITERATIONS: 500000

  WARMUP_STEPS: 10000
  LR_DECAY_NAME: "cosine"

  # Regular-expression string.
  # NOTE(review): presumably matched against parameter names to exempt them
  # from weight decay - confirm against the consumer.
  NO_DECAY: ".*textual.(embedding|transformer).*(norm.*|bias)"
  CLIP_GRAD_NORM: 10.0