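# NOTE(review): the text "Spaces: / Sleeping / Sleeping" appeared here; it looks like page-header residue from extraction rather than configuration -- confirm and remove.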
# Model section: meta-architecture name, initial weight file, and one
# sub-mapping per component (BACKBONE, SWINT, LANGUAGE_BACKBONE, RPN, DYHEAD).
# NOTE(review): this copy had lost its indentation and carried a stray " | |"
# after every line. The nesting below is reconstructed from the section
# headers and from keys that repeat (OUT_CHANNELS, USE_CHECKPOINT) and so must
# belong to different mappings. FUSE_CONFIG is placed under DYHEAD -- confirm
# the whole layout against the consumer's config schema.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    FUSION_VERSION: "v2"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    # Four-element tuples below: presumably one entry per stage -- TODO confirm.
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  RPN:
    USE_FPN: True
    # ANCHOR_SIZES and ANCHOR_STRIDE both list five values.
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9  # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    USE_CHECKPOINT: True

    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True
# Dataset section: registration of the bing_caption_train dataset, the
# train/test dataset tuples, and caption/prompt sampling options.
# NOTE(review): this copy had lost its indentation and carried a stray " | |"
# after every line. The nesting below (REGISTER -> bing_caption_train -> yaml_*)
# is reconstructed from the section headers -- confirm against the consumer's
# config schema.
DATASETS:
  REGISTER:
    bing_caption_train:
      yaml_path: "GCC/CC3M/yamls"
      yaml_name: "tiny.noun.harsh"
      yaml_name_no_coco: "tiny.noun.harsh"

  # PREDOWNLOAD_BING : True
  # PREDOWNLOAD_WITH_AZCOPY : True

  CAPTION_CONF: 0.4
  CAPTION_AUGMENTATION_VERSION: "v3.v1"
  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"

  # Active training mix; the commented-out line below is an alternative mix
  # left in place by the original author.
  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", "bing_caption_train_no_coco")
  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
  TEST: ("coco_2017_val", )
  BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  # BING_INDEX_LIST: [ 0, 1, ]
  ONE_HOT: False
  FLICKR_COPY: 2
  MIXED_COPY: 2
  OBJECT365_COPY: 2
  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85
  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
  FURTHER_SCREEN: True

  CAPTION_NMS: -1.0
  CAPTION_MIN_BOX: 1

  # The value is a period followed by a space; the quotes keep the trailing
  # space significant.
  SEPARATION_TOKENS: ". "

  PACK_RANDOM_CAPTION_NUMBER: 20
  NO_RANDOM_PACK_PROBABILITY: 0.4
  RANDOM_PACK_PROB: 0.5
  CAPTION_FORMAT_VERSION: "v2"
# Input section: per-channel pixel mean/std and the train/test size limits.
# NOTE(review): this copy had lost its indentation and carried a stray " | |"
# after every line; keys are re-nested under INPUT -- confirm against the
# consumer's config schema.
INPUT:
  # Three values each; presumably one per image channel -- channel order is
  # not stated here, verify against the loader.
  PIXEL_MEAN: [103.530, 116.280, 123.675]
  PIXEL_STD: [57.375, 57.120, 58.395]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
# Augmentation section: the set of candidate minimum sizes for training.
# NOTE(review): this copy had lost its indentation and carried a stray " | |"
# after every line; the key is re-nested under AUGMENT -- confirm against the
# consumer's config schema.
AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
# Data-loader section.
# NOTE(review): this copy had lost its indentation and carried a stray " | |"
# after every line; keys are re-nested under DATALOADER -- confirm against the
# consumer's config schema.
DATALOADER:
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False
# Solver section: optimizer choice, learning rates, schedule, batch size,
# checkpointing, and gradient clipping.
# NOTE(review): this copy had lost its indentation and carried a stray " | |"
# after every line. The nesting below (CLIP_GRADIENTS as a sub-mapping at the
# end of SOLVER) is reconstructed from the section headers -- confirm against
# the consumer's config schema.
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # Fractional values; presumably fractions of MAX_ITER -- TODO confirm.
  STEPS: (0.67, 0.89)
  MAX_ITER: 235026
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001

  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False

  USE_AMP: True
  MODEL_EMA: 0.999
  CHECKPOINT_PERIOD: 2500

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
# Test section: evaluation toggle and test batch size.
# NOTE(review): this copy had lost its indentation and carried a stray " | |"
# after every line; keys are re-nested under TEST -- confirm against the
# consumer's config schema.
TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64