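# Training config for a GLIP-style grounded detection model: a
# "GeneralizedVLRCNN" pairing a Swin-Tiny visual backbone with a BERT
# language backbone, fused in a vision-language dynamic head ("VLDYHEAD").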
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "swin_tiny_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256
    FREEZE_CONV_BODY_AT: -1

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
    MASK_SPECIAL: False
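  # RetinaNet-style FPN anchors: with ASPECT_RATIOS (1.0,) and one scale per
  # octave, each pyramid level carries a single square anchor per location,
  # the usual ATSS/DyHead setup.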
  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9 # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
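    # Early image-text fusion with bi-directional multi-head attention
    # ("MHA-B"); grounding is supervised through the dot-product token loss.
    # The CLAMP_* switches bound attention and dot-product logits so that
    # mixed-precision training (SOLVER.USE_AMP) neither underflows nor
    # overflows.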
    FUSE_CONFIG:
      EARLY_FUSE_ON: True
      TYPE: "MHA-B"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_FUSED_FEATURES_DOT_PRODUCT: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True
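    # Activation checkpointing: recompute intermediate activations in the
    # backward pass, trading extra compute for lower GPU memory.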
    USE_CHECKPOINT: True
TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64
# used for the grounding model
DATASETS:
  REGISTER:
    mixed_train_no_coco_noun:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
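  # The training mixture combines detection data (Objects365) with grounding
  # data (the MDETR mixed set without COCO images, plus Flickr30k entities);
  # COCO 2017 val is held out for evaluation.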
TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", ) | |
TEST: ("coco_2017_val", ) | |
DISABLE_SHUFFLE: False | |
ADD_DET_PROMPT: False | |
RANDOM_SAMPLE_NEG: 85 | |
CONTROL_PROB: (0.0, 0.0, 0.5, 0.0) | |
OD_TO_GROUNDING_VERSION: "description.gpt.v2.allow_zero" | |
CAPTION_AUGMENTATION_VERSION: "v3.v1" | |
CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.json" | |
DESCRIPTION_FILE: "tools/files/o365.description.v1.json" | |
SEPARATION_TOKENS: ". " | |
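# Caffe-style BGR ImageNet mean/std. Multi-scale training picks the shorter
# side from AUGMENT.MULT_MIN_SIZE_TRAIN; the longer side is capped at 1333.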
INPUT:
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
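# Batches are padded so image height and width are divisible by 32, keeping
# the strided FPN feature maps consistently aligned.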
DATALOADER:
  SIZE_DIVISIBILITY: 32
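# AdamW with a 10x lower learning rate for the language backbone (LANG_LR).
# The fractional STEPS appear to be interpreted relative to the full schedule,
# i.e. LR decay at roughly 67% and 89% of the 30 epochs; gradients are clipped
# by global L2 norm at 1.0 and an EMA of the weights (decay 0.999) is kept.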
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  STEPS: (0.67, 0.89)
  MAX_EPOCH: 30
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  USE_AMP: True
  MODEL_EMA: 0.999
  FIND_UNUSED_PARAMETERS: False
  MAX_NEG_PER_BATCH: 1.0

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
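# A minimal launch sketch (an assumption, not part of this file): GLIP-style
# codebases expose a tools/train_net.py entry point that takes --config-file
# plus key-value overrides. Adjust paths and GPU count to your setup.
#
#   python -m torch.distributed.launch --nproc_per_node=8 \
#       tools/train_net.py --config-file path/to/this_config.yaml \
#       SOLVER.IMS_PER_BATCH 64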