Spaces:
Sleeping
Sleeping
File size: 3,544 Bytes
749745d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# Model definition: architecture selection, vision/language backbones,
# RPN anchors and the detection head with its fusion settings.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped (flat, the repeated OUT_CHANNELS and
# USE_CHECKPOINT keys would collide). Confirm each sub-section's parent
# against the consuming config schema.
# Parenthesised values such as (2, 2, 18, 2) are plain YAML scalars
# (strings); presumably the config loader converts them to tuples.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    FUSION_VERSION: "v2"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9  # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    USE_CHECKPOINT: True
    # NOTE(review): FUSE_CONFIG is assumed to belong to DYHEAD — confirm.
    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True
# Dataset selection plus caption/prompt construction options.
# NOTE(review): nesting reconstructed from a copy whose indentation had
# been stripped; the lower-case yaml_* keys are assumed to belong to the
# registered "bing_caption_train" entry — confirm against the loader.
# TRAIN/TEST/CONTROL_PROB are plain YAML scalars (strings) written in
# tuple notation; presumably the config loader converts them.
DATASETS:
  REGISTER:
    bing_caption_train:
      yaml_path: "GCC/CC3M/yamls"
      yaml_name: "tiny.noun.harsh"
      yaml_name_no_coco: "tiny.noun.harsh"

  # PREDOWNLOAD_BING : True
  # PREDOWNLOAD_WITH_AZCOPY : True

  CAPTION_CONF: 0.4
  CAPTION_AUGMENTATION_VERSION: "v3.v1"
  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"

  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", "bing_caption_train_no_coco")
  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
  TEST: ("coco_2017_val", )
  BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  # BING_INDEX_LIST: [ 0, 1, ]
  ONE_HOT: False
  FLICKR_COPY: 2
  MIXED_COPY: 2
  OBJECT365_COPY: 2
  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85
  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
  FURTHER_SCREEN: True

  CAPTION_NMS: -1.0
  CAPTION_MIN_BOX: 1

  # The separator value includes a trailing space; keep it quoted.
  SEPARATION_TOKENS: ". "

  PACK_RANDOM_CAPTION_NUMBER: 20
  NO_RANDOM_PACK_PROBABILITY: 0.4
  RANDOM_PACK_PROB: 0.5
  CAPTION_FORMAT_VERSION: "v2"
# Input preprocessing: per-channel normalisation constants and the
# train/test resize bounds.
# NOTE(review): child indentation restored from a flattened copy.
INPUT:
  PIXEL_MEAN: [103.530, 116.280, 123.675]
  PIXEL_STD: [57.375, 57.120, 58.395]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
# Training-time augmentation options.
# NOTE(review): child indentation restored from a flattened copy.
# The value is a plain YAML scalar (string) in tuple notation;
# presumably the config loader converts it — TODO confirm.
AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
# Data loader options.
# NOTE(review): child indentation restored from a flattened copy.
DATALOADER:
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False
# Optimisation settings: optimizer, learning rates, schedule, batch size,
# warm-up, mixed precision, EMA, checkpointing and gradient clipping.
# NOTE(review): nesting reconstructed from a copy whose indentation had
# been stripped; CLIP_GRADIENTS is assumed to be a sub-section of SOLVER
# — confirm against the consuming config schema.
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # Plain YAML scalar in tuple notation. NOTE(review): the values look
  # like fractions of MAX_ITER rather than iteration counts — confirm.
  STEPS: (0.67, 0.89)
  MAX_ITER: 235026
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001

  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False

  USE_AMP: True
  MODEL_EMA: 0.999
  CHECKPOINT_PERIOD: 2500

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
# Evaluation settings.
# NOTE(review): child indentation restored from a flattened copy, so that
# this IMS_PER_BATCH is scoped to TEST and no longer collides with the
# SOLVER key of the same name.
TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64
|