# Training / evaluation configuration: model, datasets, input preprocessing,
# augmentation, data loading, solver and test settings.
#
# NOTE(review): this file's block structure was reconstructed from a copy in
# which all keys had been collapsed onto two lines. Key order and values are
# unchanged; the nesting below is inferred from the section-header keys and
# should be verified against the schema of the code that loads this file.
#
# Values written in parentheses, e.g. (64, 128, 256), are plain string scalars
# as far as YAML is concerned; presumably the config loader converts them to
# tuples -- confirm before changing their syntax.

MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    FUSION_VERSION: "v2"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  # NOTE(review): placed as a sibling of BACKBONE under MODEL -- verify.
  SWINT:
    VERSION: "fusion"

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-tiny"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9  # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    USE_CHECKPOINT: False

    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

DATASETS:
  TRAIN: ("flickr30k_train", )
  TEST: ("coco_2017_val", )
  ADD_DET_PROMPT: False
  ADD_DET_PROMPT_ADVANCED: False
  ALTERNATIVE_TRAINING: False
  BOX_THRESHOLD: 0.1
  CAPTION_CONF: 0.9
  CAPTION_FORMAT_VERSION: "v2"
  CAPTION_MIN_BOX: 1
  CAPTION_NMS: 0.9
  CLASS_AGNOSTIC: False
  CLASS_CONCAT: False
  COCO_COPY: 1
  # Alternative setting kept for reference; the active value is the next line.
  #CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  DISABLE_CLIP_TO_IMAGE: False
  DISABLE_SHUFFLE: False
  FEW_SHOT: 0
  FLICKR_COPY: 1
  FLICKR_GT_TYPE: "separate"
  FULL_QUESTION_PROB: 0.5
  FURTHER_SCREEN: False
  GENERAL_COPY: -1
  GENERAL_COPY_TEST: -1
  INFERENCE_CAPTION: False
  IN_COPY: 1
  LOCAL_DEBUG: False
  LVIS_COPY: 1
  LVIS_USE_NORMAL_AP: False
  MAX_BOX: -1
  MIXED_COPY: 1
  MULTISTAGE_TRAINING: False
  NEG_QUESTION_PROB: 0.8
  NO_MINUS_ONE_FOR_ONE_HOT: False
  OBJECT365_COPY: 1
  OI_COPY: 1
  ONE_HOT: False
  PACK_RANDOM_CAPTION_NUMBER: 0
  POS_QUESTION_PROB: 0.6
  PREDOWNLOAD_BING: False
  PREDOWNLOAD_WITH_AZCOPY: False
  PROMPT_LIMIT_NEG: -1
  RANDOM_SAMPLE_NEG: 85
  REPLACE_CLEAN_LABEL: False
  SAFEGUARD_POSITIVE_CAPTION: True
  # The trailing space inside the quotes is part of the value.
  SEPARATION_TOKENS: ". "
  SHUFFLE_SEED: 0
  TEST_DATASETNAME_SUFFIX: ""
  TRAIN_DATASETNAME_SUFFIX: ""
  USE_CAPTION_PROMPT: False
  USE_COCO_FORMAT: False
  USE_CROWD: False
  USE_OD_AUG: False
  USE_OVERRIDE_CATEGORY: False
  USE_SUPRESS_QUERY: False
  VG_COPY: 1

INPUT:
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

# NOTE(review): treated as a top-level section rather than a child of INPUT --
# verify against the consuming schema.
AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)

DATALOADER:
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False

SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # NOTE(review): fractional values; presumably interpreted relative to
  # MAX_ITER by the scheduler -- confirm.
  STEPS: (0.67, 0.89)
  MAX_ITER: 800000
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: True
  USE_AMP: True
  MODEL_EMA: 0.999
  CHECKPOINT_PERIOD: 2500

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0

TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64