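# Spaces: Sleeping
# NOTE(review): the line above appears to be page-header residue captured with
# this config rather than a config key; kept as a comment so the file parses.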
# Model definition: meta-architecture, initial weights, vision and language
# backbones, RPN anchors, and the DyHead / fusion settings.
# NOTE(review): nesting was reconstructed from a flattened copy (duplicate keys
# such as USE_CHECKPOINT and OUT_CHANNELS only make sense in separate
# sub-sections) — confirm against the consuming config schema.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "swin_base_patch4_window12_384_22k.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  ATSS:
    PRE_NMS_TOP_N: 3000
    DETECTIONS_PER_IMG: 100
    INFERENCE_TH: 0.0

  # Vision backbone hyper-parameters (one entry per stage in the tuples).
  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  BACKBONE:
    FUSION_VERSION: "v3"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256
    USE_CHECKPOINT: True
    FREEZE_CONV_BODY_AT: -1

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    TOKENIZER_TYPE: "roberta-base"
    LANG_DIM: 768
    MASK_SPECIAL: False
    USE_CHECKPOINT: False

  # Five anchor sizes paired with five strides; a single aspect ratio.
  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    USE_CHECKPOINT: True
    FUSE_CONFIG:
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # one of "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      # Numerical-stability clamps, all enabled for this run.
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True
# Used for the grounding model: RefCOCO+ train split for training, val split
# for evaluation. The tuple values are kept as Python-literal strings.
DATASETS:
  TRAIN: ("refcoco+_train", )
  TEST: ("refcoco+_val",)
  DISABLE_SHUFFLE: True
# Input preprocessing: per-channel normalization statistics and the min/max
# image sizes used for training and testing.
# NOTE(review): channel order of PIXEL_MEAN / PIXEL_STD is not stated here —
# confirm against the data loader.
INPUT:
  PIXEL_MEAN: [103.530, 116.280, 123.675]
  PIXEL_STD: [57.375, 57.120, 58.395]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
# Training-time augmentation settings.
AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
  # Flip probability of 0.0 — marked by the original author as important,
  # especially for RefCOCO.
  # NOTE(review): presumably because referring expressions can depend on
  # left/right position — confirm.
  FLIP_PROB_TRAIN: 0.0
# Data-loader settings.
DATALOADER:
  SIZE_DIVISIBILITY: 32
# Optimization settings: optimizer, learning rates, schedule, batch size,
# warm-up, mixed precision, EMA, and gradient clipping.
# NOTE(review): nesting was reconstructed from a flattened copy; CLIP_GRADIENTS
# is assumed to be a sub-section of SOLVER — confirm against the config schema.
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.00001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  # NOTE(review): values below 1 look like fractions of the schedule rather
  # than absolute iteration counts — confirm.
  STEPS: (0.67, 0.89)
  MAX_EPOCH: 20
  IMS_PER_BATCH: 16
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001

  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False
  USE_AMP: True
  MODEL_EMA: 0.999

  # Clipping is disabled; the remaining values only matter if ENABLED is set.
  CLIP_GRADIENTS:
    ENABLED: False
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
# Evaluation settings: evaluate during training on the grounding task.
TEST:
  DURING_TRAINING: True
  EVAL_TASK: "grounding"
  IMS_PER_BATCH: 16