---
# Vision-language grounding training configuration.
# (Scraped web-page header — hosting-page status, file size, commit hash,
# and a column ruler — removed; it was not part of the config.)
# ---------------------------------------------------------------------------
# Model: grounded detector with a Swin-B visual backbone, a RoBERTa language
# backbone, and a vision-language dynamic head (VLDYHEAD).
# ---------------------------------------------------------------------------
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"  # pretrained Swin-B checkpoint to initialize from
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    FUSION_VERSION: "v2"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  # Swin Transformer (base variant) settings. Parenthesized tuples are plain
  # YAML strings parsed by the config library (yacs-style), not by YAML.
  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  LANGUAGE_BACKBONE:
    FREEZE: False  # language backbone is trained (see SOLVER.LANG_LR)
    MODEL_TYPE: "roberta-fused-v2"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9  # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    USE_CHECKPOINT: True  # gradient checkpointing in the head to save memory

    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      # Clamp flags: presumably guard fp16 under/overflow since SOLVER.USE_AMP
      # is enabled — confirm against the fusion-layer implementation.
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True
# ---------------------------------------------------------------------------
# Datasets: registrations plus caption/grounding augmentation settings.
# ---------------------------------------------------------------------------
DATASETS:
  # Each REGISTER entry maps a dataset name to its image directories and
  # annotation file (paths relative to the data root).
  REGISTER:
    bing_caption_train:
      yaml_path: "GCC/CC3M/yamls"
      yaml_name: "tiny.noun.harsh"
      yaml_name_no_coco: "tiny.noun.harsh"
    mixed_train_no_coco_noun:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
    mixed_train_no_coco_gpt:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_gpt.v1.new.json"
    flickr30k_train_gpt:
      img_folder: "flickr30k/flickr30k_images/train"
      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.v1.json"
      is_train: True
    mixed_train_no_coco_noun_gpt:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.v1.json"
    mixed_train_no_coco_noun_gpt_0425:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
    flickr30k_train_gpt_0425:
      img_folder: "flickr30k/flickr30k_images/train"
      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
      is_train: True

  # Caption augmentation / OD-to-grounding conversion settings.
  CAPTION_CONF: 0.4
  OD_TO_GROUNDING_VERSION: "description.gpt.v10.mixed.allow_zero.v1"
  CAPTION_AUGMENTATION_VERSION: "mixed.v4.8-2.drop_positive.control_pos.grouping.v1"
  CC_CAPTION_AUGMENTATION_VERSION: "mixed.v3-v4.9-1.drop_positive.control_pos.grouping.v1"
  CAPTION_VOCAB_FILE: "tools/files/joint_vocab.merged.v1.tmp0.davincci.json"
  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"

  TRAIN: ("mixed_train_no_coco_noun_gpt_0425", "flickr30k_train_gpt_0425", "object365_dt_train", ) # bing_caption_train_no_coco
  TEST: ("coco_2017_val", )
  # NOTE(review): ADD_DET_PROMPT was listed twice in this mapping (both False);
  # duplicate keys are invalid YAML, so the redundant second entry was removed.
  ADD_DET_PROMPT: False
  ADD_DET_PROMPT_ADVANCED: False
  ALTERNATIVE_TRAINING: False
  BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  ONE_HOT: False
  # Oversampling factors (dataset repeat counts) — verify against the sampler.
  FLICKR_COPY: 2
  MIXED_COPY: 2
  OBJECT365_COPY: 2
  DISABLE_SHUFFLE: False
  RANDOM_SAMPLE_NEG: 85  # presumably max negative categories sampled per prompt — confirm in dataloader
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  FURTHER_SCREEN: True
  CAPTION_NMS: -1.0  # negative value presumably disables caption-box NMS — confirm
  CAPTION_MIN_BOX: 1
  SEPARATION_TOKENS: ". "  # delimiter joining category phrases in the prompt
  PACK_RANDOM_CAPTION_NUMBER: 20
  NO_RANDOM_PACK_PROBABILITY: 0.4
  RANDOM_PACK_PROB: 0.5
  CAPTION_FORMAT_VERSION: "v2"
# ---------------------------------------------------------------------------
# Input preprocessing.
# ---------------------------------------------------------------------------
INPUT:
  # Per-channel pixel normalization (Detectron-default values; channel order
  # presumably BGR — confirm against the transform code).
  PIXEL_MEAN: [103.530, 116.280, 123.675]
  PIXEL_STD: [57.375, 57.120, 58.395]
  # Resize bounds: shorter side / longer-side cap, in pixels.
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
AUGMENT:
  # Multi-scale training: candidate shorter-side sizes (tuple string parsed by
  # the config library).
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
DATALOADER:
  SIZE_DIVISIBILITY: 32  # pad batched images so H and W are multiples of 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False
# ---------------------------------------------------------------------------
# Optimization.
# ---------------------------------------------------------------------------
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001  # separate, lower LR for the language backbone
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  STEPS: (0.67, 0.89)  # presumably LR-decay milestones as fractions of MAX_ITER — confirm
  MAX_ITER: 800000
  IMS_PER_BATCH: 64  # total batch size across all GPUs
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False
  USE_AMP: True  # automatic mixed precision
  MODEL_EMA: 0.999  # exponential-moving-average decay for model weights
  CHECKPOINT_PERIOD: 2500
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
# ---------------------------------------------------------------------------
# Evaluation.
# ---------------------------------------------------------------------------
TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64