# Training / evaluation configuration for a "GeneralizedVLRCNN" model with a
# "VLDYHEAD" RPN head. Trains on `refcoco+_train` and evaluates on
# `refcoco+_val` with the "grounding" eval task.
#
# NOTE(review): this file was restored from a whitespace-collapsed copy in
# which all keys sat on a single line. Key names and values are unchanged;
# the nesting (in particular FUSE_CONFIG under MODEL.DYHEAD and
# CLIP_GRADIENTS under SOLVER) was reconstructed and should be confirmed
# against the consuming project's config schema.
#
# Values written as `(a, b, ...)` are plain YAML strings; presumably the
# config loader evaluates them into tuples -- verify against the loader.

MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  # Initial weights file; path is presumably resolved by the training
  # script -- TODO confirm.
  WEIGHT: "swin_base_patch4_window12_384_22k.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  ATSS:
    PRE_NMS_TOP_N: 3000
    DETECTIONS_PER_IMG: 100
    INFERENCE_TH: 0.0

  # Vision backbone hyper-parameters.
  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  BACKBONE:
    FUSION_VERSION: "v3"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256
    USE_CHECKPOINT: True
    FREEZE_CONV_BODY_AT: -1

  # Text encoder settings.
  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    TOKENIZER_TYPE: "roberta-base"
    LANG_DIM: 768
    MASK_SPECIAL: False
    USE_CHECKPOINT: False

  RPN:
    USE_FPN: True
    # One anchor size per stride entry (five of each).
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    USE_CHECKPOINT: True

    FUSE_CONFIG:
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

# use for grounding model
DATASETS:
  TRAIN: ("refcoco+_train", )
  TEST: ("refcoco+_val",)
  DISABLE_SHUFFLE: True

INPUT:
  # Per-channel normalisation constants (three values each).
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
  # Horizontal flipping is disabled (probability 0.0).
  FLIP_PROB_TRAIN: 0.0  # Important for refcoco esp

DATALOADER:
  SIZE_DIVISIBILITY: 32

SOLVER:
  OPTIMIZER: ADAMW
  # Same learning rate for the base parameters and the language parameters.
  BASE_LR: 0.00001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  # Fractional values; presumably interpreted relative to the total
  # schedule length -- TODO confirm against the solver implementation.
  STEPS: (0.67, 0.89)
  MAX_EPOCH: 20
  IMS_PER_BATCH: 16
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False
  USE_AMP: True
  MODEL_EMA: 0.999

  # Gradient clipping is switched off; the remaining keys only matter if
  # ENABLED is flipped to True.
  CLIP_GRADIENTS:
    ENABLED: False
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0

TEST:
  DURING_TRAINING: True
  EVAL_TASK: "grounding"
  IMS_PER_BATCH: 16