# File: desco/configs/pretrain/mixed_nococo_flickr_objects365_refexpclean.yaml
# Web-page residue kept for provenance (not configuration): zdou0830 / desco / 749745d
# Model definition: top-level architecture settings followed by one sub-section
# per component (BACKBONE, SWINT, LANGUAGE_BACKBONE, RPN, DYHEAD).
# NOTE(review): parent/child grouping is reconstructed from the order of the
# section headers — confirm against the schema the config loader expects.
# NOTE(review): parenthesised values such as (2, 2, 18, 2) are plain YAML
# strings; presumably the loader evaluates them into tuples — confirm before
# quoting them or converting them to YAML lists.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  # NOTE(review): the checkpoint name says "swin_large", while SWINT below uses
  # EMBED_DIM 128 and NUM_HEADS (4, 8, 16, 32) — confirm the checkpoint matches
  # these dimensions (or is overridden at launch time).
  WEIGHT: "swin_large_patch4_window12_384_22k.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  # Visual backbone body; OUT_CHANNELS here is a single integer.
  BACKBONE:
    FUSION_VERSION: "v3"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  # Swin transformer settings; OUT_CHANNELS here is a 4-tuple, distinct from
  # BACKBONE.OUT_CHANNELS above.
  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  # Text encoder settings; USE_CHECKPOINT is disabled here but enabled for
  # DYHEAD below, so the two keys must stay in separate sub-sections.
  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  # Anchor configuration: five sizes paired with five strides, one aspect ratio.
  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  # Detection head settings, including the nested fusion/loss configuration.
  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9  # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    USE_CHECKPOINT: True
    # Fusion type is "NONE" and EARLY_FUSE_ON is False in this config; of the
    # loss switches only USE_DOT_PRODUCT_TOKEN_LOSS is enabled.
    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True
# Dataset selection (TRAIN / TEST) followed by dataset options listed in
# alphabetical order. DATASETS.TEST is a dataset tuple and is unrelated to the
# top-level TEST section of this file.
# NOTE(review): parenthesised values are plain YAML strings; presumably the
# loader evaluates them into tuples — confirm before reformatting them.
DATASETS:
  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train" )
  TEST: ("coco_2017_val", )

  ADD_DET_PROMPT: False
  ADD_DET_PROMPT_ADVANCED: False
  ALTERNATIVE_TRAINING: False
  BOX_THRESHOLD: 0.1
  CAPTION_CONF: 0.9
  CAPTION_FORMAT_VERSION: "v2"
  CAPTION_MIN_BOX: 1
  CAPTION_NMS: 0.9
  CLASS_AGNOSTIC: False
  CLASS_CONCAT: False
  COCO_COPY: 1
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  DISABLE_CLIP_TO_IMAGE: False
  DISABLE_SHUFFLE: False
  FEW_SHOT: 0
  FLICKR_COPY: 1
  FLICKR_GT_TYPE: "separate"
  FULL_QUESTION_PROB: 0.5
  FURTHER_SCREEN: False
  GENERAL_COPY: -1
  GENERAL_COPY_TEST: -1
  INFERENCE_CAPTION: False
  IN_COPY: 1
  LOCAL_DEBUG: False
  LVIS_COPY: 1
  LVIS_USE_NORMAL_AP: False
  MAX_BOX: -1
  MIXED_COPY: 1
  MULTISTAGE_TRAINING: False
  NEG_QUESTION_PROB: 0.8
  NO_MINUS_ONE_FOR_ONE_HOT: False
  OBJECT365_COPY: 1
  OI_COPY: 1
  ONE_HOT: False
  PACK_RANDOM_CAPTION_NUMBER: 0
  POS_QUESTION_PROB: 0.6
  PREDOWNLOAD_BING: False
  PREDOWNLOAD_WITH_AZCOPY: False
  PROMPT_LIMIT_NEG: -1
  RANDOM_SAMPLE_NEG: 85
  REPLACE_CLEAN_LABEL: False
  SAFEGUARD_POSITIVE_CAPTION: True
  # Quoted on purpose: the value is a period followed by a space.
  SEPARATION_TOKENS: ". "
  SHUFFLE_SEED: 0
  TEST_DATASETNAME_SUFFIX: ""
  TRAIN_DATASETNAME_SUFFIX: ""
  USE_CAPTION_PROMPT: False
  USE_COCO_FORMAT: False
  USE_CROWD: False
  USE_OD_AUG: False
  USE_OVERRIDE_CATEGORY: False
  # Key spelling ("SUPRESS") is kept as-is because the consumer looks it up by name.
  USE_SUPRESS_QUERY: False
  VG_COPY: 1
# Input settings: three-value pixel mean/std plus min/max image sizes for
# training and testing (the same 800 / 1333 bounds are used for both).
INPUT:
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
# Augmentation settings: a set of candidate minimum training sizes.
# NOTE(review): the parenthesised value is a plain YAML string; presumably the
# loader evaluates it into a tuple — confirm before reformatting it.
AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
# Data-loader settings.
DATALOADER:
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False
# Optimisation settings: optimizer, learning rates, schedule, batch size,
# warm-up, checkpointing, and nested gradient-clipping options.
# SOLVER.IMS_PER_BATCH is separate from TEST.IMS_PER_BATCH at the end of the file.
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  # LANG_LR is ten times smaller than BASE_LR.
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # NOTE(review): STEPS holds fractions rather than iteration counts —
  # presumably interpreted relative to MAX_ITER; confirm in the scheduler.
  STEPS: (0.67, 0.89)
  MAX_ITER: 800000
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 5000
  WARMUP_FACTOR: 0.001

  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False
  USE_AMP: True
  MODEL_EMA: 0.999
  CHECKPOINT_PERIOD: 2500

  # Gradient clipping is enabled with CLIP_VALUE 1.0 and NORM_TYPE 2.0.
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
# Evaluation settings. This top-level TEST section is distinct from
# DATASETS.TEST, and its IMS_PER_BATCH is distinct from SOLVER.IMS_PER_BATCH.
TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64