# Residue from the web file listing this config was copied from
# (Spaces status: Sleeping; file size: 3,782 bytes; identifier: 749745d).
# The viewer's line-number gutter (1-163) has been removed.
# Model section: architecture selection, initial weight file, and the
# BACKBONE / SWINT / LANGUAGE_BACKBONE / RPN / DYHEAD sub-sections.
# NOTE(review): the original indentation was lost, so the nesting below is
# reconstructed from key order; confirm it against the schema of the code
# that loads this file.
# Parenthesised values such as (2, 2, 18, 2) are plain YAML strings and are
# kept byte-for-byte; presumably the loader evaluates them - TODO confirm.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    FUSION_VERSION: "v2"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    # Distinct from BACKBONE.OUT_CHANNELS above; the two only collided
    # because the file had been flattened.
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9  # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    # Separate setting from LANGUAGE_BACKBONE.USE_CHECKPOINT (False above).
    USE_CHECKPOINT: True

    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True
# Dataset section: the train/test dataset name tuples followed by the
# per-dataset copy counts and caption/prompt options.
# Every key below is a direct child of DATASETS. Nesting them keeps
# DATASETS.TEST from colliding with the separate top-level TEST section.
# NOTE(review): indentation was reconstructed; values are unchanged.
DATASETS:
  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train" )
  TEST: ("coco_2017_val", )
  ADD_DET_PROMPT: False
  ADD_DET_PROMPT_ADVANCED: False
  ALTERNATIVE_TRAINING: False
  BOX_THRESHOLD: 0.1
  CAPTION_CONF: 0.9
  CAPTION_FORMAT_VERSION: "v2"
  CAPTION_MIN_BOX: 1
  CAPTION_NMS: 0.9
  CLASS_AGNOSTIC: False
  CLASS_CONCAT: False
  COCO_COPY: 1
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  DISABLE_CLIP_TO_IMAGE: False
  DISABLE_SHUFFLE: False
  FEW_SHOT: 0
  FLICKR_COPY: 1
  FLICKR_GT_TYPE: "separate"
  FULL_QUESTION_PROB: 0.5
  FURTHER_SCREEN: False
  GENERAL_COPY: -1
  GENERAL_COPY_TEST: -1
  INFERENCE_CAPTION: False
  IN_COPY: 1
  LOCAL_DEBUG: False
  LVIS_COPY: 1
  LVIS_USE_NORMAL_AP: False
  MAX_BOX: -1
  MIXED_COPY: 1
  MULTISTAGE_TRAINING: False
  NEG_QUESTION_PROB: 0.8
  NO_MINUS_ONE_FOR_ONE_HOT: False
  OBJECT365_COPY: 1
  OI_COPY: 1
  ONE_HOT: False
  PACK_RANDOM_CAPTION_NUMBER: 0
  POS_QUESTION_PROB: 0.6
  PREDOWNLOAD_BING: False
  PREDOWNLOAD_WITH_AZCOPY: False
  PROMPT_LIMIT_NEG: -1
  RANDOM_SAMPLE_NEG: 85
  REPLACE_CLEAN_LABEL: False
  SAFEGUARD_POSITIVE_CAPTION: True
  # Quoted so the trailing space is preserved as part of the value.
  SEPARATION_TOKENS: ". "
  SHUFFLE_SEED: 0
  TEST_DATASETNAME_SUFFIX: ""
  TRAIN_DATASETNAME_SUFFIX: ""
  USE_CAPTION_PROMPT: False
  USE_COCO_FORMAT: False
  USE_CROWD: False
  USE_OD_AUG: False
  USE_OVERRIDE_CATEGORY: False
  USE_SUPRESS_QUERY: False
  VG_COPY: 1
# Input section: per-channel pixel mean/std lists and the min/max image
# sizes used for training and testing.
# NOTE(review): indentation was reconstructed; values are unchanged.
INPUT:
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
# Augmentation section: the candidate minimum training sizes.
# The parenthesised value is a plain YAML string, kept byte-for-byte.
# NOTE(review): indentation was reconstructed; the value is unchanged.
AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
# Data-loader section.
# NOTE(review): indentation was reconstructed; values are unchanged.
DATALOADER:
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False
# Solver section: optimizer choice, learning rates, weight decay, schedule,
# batch size, warm-up, AMP/EMA switches, checkpoint period, and gradient
# clipping.
# NOTE(review): indentation was reconstructed from key order; the four keys
# after CLIP_GRADIENTS are placed under it - confirm against the schema.
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # Fractions rather than iteration counts; presumably relative to
  # MAX_ITER - TODO confirm with the scheduler code.
  STEPS: (0.67, 0.89)
  MAX_ITER: 800000
  # SOLVER.IMS_PER_BATCH is separate from TEST.IMS_PER_BATCH.
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False
  USE_AMP: True
  MODEL_EMA: 0.999
  CHECKPOINT_PERIOD: 2500

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
# Test section. This is the top-level TEST mapping, distinct from the
# DATASETS.TEST dataset tuple.
# NOTE(review): indentation was reconstructed; the stray trailing "|" left
# over from the copied listing has been dropped because it is not valid
# YAML here.
TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64