# NOTE: page-banner residue captured with this file ("Spaces: Sleeping Sleeping"); it is not part of the configuration.
# Model section: meta-architecture, initial weights, vision/language backbones,
# RPN anchors and the detection head with its fusion options.
# NOTE(review): nesting below was reconstructed from a flattened copy of this
# file (indentation had been lost) — confirm against the project's config defaults.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  # NOTE(review): the checkpoint name says "large", while EMBED_DIM 128 and
  # NUM_HEADS (4, 8, 16, 32) below match the published Swin-B sizes — verify
  # that the intended backbone and checkpoint agree.
  WEIGHT: "swin_large_patch4_window12_384_22k.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  # Visual backbone body and the channel width it hands to the head.
  BACKBONE:
    FUSION_VERSION: "v3"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  # Swin Transformer hyper-parameters (per-stage tuples have four entries).
  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  # Text encoder and tokenizer selection.
  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  # Anchor layout: one size/stride pair per pyramid level, single aspect ratio.
  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  # Detection head settings.
  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9 # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    USE_CHECKPOINT: True
    # Vision-language fusion / alignment-loss switches for the head.
    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      EARLY_FUSE_ON: False
      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True
# Dataset section: which splits are used for training/testing plus the
# prompt/caption construction options applied to them.
# NOTE(review): indentation was reconstructed from a flattened copy of this
# file — confirm every key below belongs under DATASETS.
DATASETS:
  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train" )
  TEST: ("coco_2017_val", )
  ADD_DET_PROMPT: False
  ADD_DET_PROMPT_ADVANCED: False
  ALTERNATIVE_TRAINING: False
  BOX_THRESHOLD: 0.1
  CAPTION_CONF: 0.9
  CAPTION_FORMAT_VERSION: "v2"
  CAPTION_MIN_BOX: 1
  CAPTION_NMS: 0.9
  CLASS_AGNOSTIC: False
  CLASS_CONCAT: False
  COCO_COPY: 1
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  DISABLE_CLIP_TO_IMAGE: False
  DISABLE_SHUFFLE: False
  FEW_SHOT: 0
  FLICKR_COPY: 1
  FLICKR_GT_TYPE: "separate"
  FULL_QUESTION_PROB: 0.5
  FURTHER_SCREEN: False
  GENERAL_COPY: -1
  GENERAL_COPY_TEST: -1
  INFERENCE_CAPTION: False
  IN_COPY: 1
  LOCAL_DEBUG: False
  LVIS_COPY: 1
  LVIS_USE_NORMAL_AP: False
  MAX_BOX: -1
  MIXED_COPY: 1
  MULTISTAGE_TRAINING: False
  NEG_QUESTION_PROB: 0.8
  NO_MINUS_ONE_FOR_ONE_HOT: False
  OBJECT365_COPY: 1
  OI_COPY: 1
  ONE_HOT: False
  PACK_RANDOM_CAPTION_NUMBER: 0
  POS_QUESTION_PROB: 0.6
  PREDOWNLOAD_BING: False
  PREDOWNLOAD_WITH_AZCOPY: False
  PROMPT_LIMIT_NEG: -1
  RANDOM_SAMPLE_NEG: 85
  REPLACE_CLEAN_LABEL: False
  SAFEGUARD_POSITIVE_CAPTION: True
  # Quoted on purpose: the value ends with a space that must be preserved.
  SEPARATION_TOKENS: ". "
  SHUFFLE_SEED: 0
  TEST_DATASETNAME_SUFFIX: ""
  TRAIN_DATASETNAME_SUFFIX: ""
  USE_CAPTION_PROMPT: False
  USE_COCO_FORMAT: False
  USE_CROWD: False
  USE_OD_AUG: False
  USE_OVERRIDE_CATEGORY: False
  USE_SUPRESS_QUERY: False
  VG_COPY: 1
# Input section: per-channel normalisation constants and train/test resize bounds.
# NOTE(review): indentation was reconstructed from a flattened copy of this file.
INPUT:
  # Three values each — one per image channel; channel order is not stated
  # here, presumably BGR given the values — TODO confirm.
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
# Augmentation section: candidate training sizes for multi-scale resizing.
# NOTE(review): indentation was reconstructed from a flattened copy of this file.
AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
# Data-loader section.
# NOTE(review): indentation was reconstructed from a flattened copy of this file.
DATALOADER:
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False
# Solver section: optimizer, learning rates, schedule, batch size, mixed
# precision, EMA, checkpointing and gradient clipping.
# NOTE(review): nesting was reconstructed from a flattened copy of this file —
# confirm CLIP_GRADIENTS is the only nested mapping under SOLVER.
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # Fractional values — presumably fractions of MAX_ITER; verify against the scheduler.
  STEPS: (0.67, 0.89)
  MAX_ITER: 800000
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 5000
  WARMUP_FACTOR: 0.001

  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False

  USE_AMP: True
  MODEL_EMA: 0.999
  CHECKPOINT_PERIOD: 2500

  # Gradient clipping options.
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
# Evaluation section.
# NOTE(review): indentation was reconstructed from a flattened copy of this file.
TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64