# Config for the final GLIP-Tiny model, pre-trained from scratch
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"
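
  # Visual backbone: Swin-Tiny with an FPN and RetinaNet-style extra levels
  # (inferred from the CONV_BODY name); every level outputs 256-channel maps.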
  BACKBONE:
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256
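
  # Text encoder: BERT-base (uncased), fine-tuned jointly with the detector
  # rather than kept frozen (FREEZE: False).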
  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
    MASK_SPECIAL: False
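
  # Anchor setup: one square anchor per FPN level (a single scale per octave,
  # aspect ratio 1.0) over strides 8-128, RetinaNet-style.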
  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1
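
  # DyHead: 6 dynamic-head conv blocks with group norm, DyReLU, deformable
  # convolution, and dynamic feature fusion; USE_CHECKPOINT enables gradient
  # checkpointing in the head, trading compute for activation memory.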
  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9 # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    USE_CHECKPOINT: True
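
    # Deep early fusion of image and text features inside the head; "MHA-B"
    # is the bi-directional multi-head cross-attention variant. Token-level
    # grounding is supervised via the dot-product token loss, and the CLAMP_*
    # switches bound logits/attention values to avoid fp16 under/overflow
    # when training with AMP.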
    FUSE_CONFIG:
      EARLY_FUSE_ON: True
      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_FUSED_FEATURES_DOT_PRODUCT: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

# Used for the grounding model.
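# REGISTER below declares the custom dataset entries referenced by the TRAIN
# tuple; paths are resolved against the repo's data root (an assumption based
# on maskrcnn_benchmark conventions).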
DATASETS:
  REGISTER:
    bing_caption_train:
      yaml_path: "GCC/CC3M/yamls"
      yaml_name: "tiny.noun.harsh"
      yaml_name_no_coco: "tiny.noun.harsh"
    mixed_train_no_coco_noun_gpt_0425:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
    flickr30k_train_gpt_0425:
      img_folder: "flickr30k/flickr30k_images/train"
      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
      is_train: True
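
  # Caption-augmentation controls; the interpretations here are inferred from
  # the key names and the inline comments. CAPTION_CONF drops pseudo-grounded
  # phrases below confidence 0.4, and the *_VERSION strings select the
  # CaptionAugmentation recipe applied to each data source.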
  CAPTION_CONF: 0.4
  CAPTION_AUGMENTATION_VERSION: "mixed.v4-v3.5-4-1.drop_positive.control_pos.grouping.v1" # for GoldG data; used by CaptionAugmentation to determine how to perform the augmentation
  OD_TO_GROUNDING_VERSION: "description.gpt.v10.mixed.allow_zero.v1" # for detection (OD) data; selects how detection annotations are converted into grounding prompts
  CC_CAPTION_AUGMENTATION_VERSION: "mixed.v3.8-2.drop_positive.control_pos.grouping.v1" # for CC data; used by CaptionAugmentation to determine how to perform the augmentation
  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
  TRAIN: ("mixed_train_no_coco_noun_gpt_0425", "flickr30k_train_gpt_0425", "object365_dt_train", ) # bing_caption_train_no_coco
  TEST: ("coco_2017_val", )
  BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  # BING_INDEX_LIST: [ 0, 1, ]
  ONE_HOT: False
  FLICKR_COPY: 2
  MIXED_COPY: 2
  OBJECT365_COPY: 1
  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  FURTHER_SCREEN: True
  CAPTION_NMS: -1.0
  CAPTION_MIN_BOX: 1
  SEPARATION_TOKENS: ". "
  PACK_RANDOM_CAPTION_NUMBER: 20
  NO_RANDOM_PACK_PROBABILITY: 0.4
  RANDOM_PACK_PROB: 0.5
  CAPTION_FORMAT_VERSION: "v2"
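
# Standard maskrcnn-benchmark input pipeline: BGR pixel mean/std normalization
# and resizing with the long side capped at 1333. MULT_MIN_SIZE_TRAIN appears
# to enable multi-scale training by sampling the short side from the tuple.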
INPUT:
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480, 560, 640, 720, 800)
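
# Pad batched images so height and width are multiples of 32, keeping feature
# maps aligned across FPN levels.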
DATALOADER:
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False
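
# AdamW with a 10x lower learning rate for the language backbone (LANG_LR).
# STEPS are LR-decay milestones given as fractions of MAX_ITER (i.e. at 67%
# and 89% of 300k iterations). Mixed-precision training (USE_AMP) with model
# EMA at decay 0.999.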
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  STEPS: (0.67, 0.89)
  # MAX_EPOCH: 12
  MAX_ITER: 300000
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  USE_AMP: True
  MODEL_EMA: 0.999
  FIND_UNUSED_PARAMETERS: False
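
  # Clip the gradient of the full model at a global L2 norm of 1.0.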
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0
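
# Sketch of a typical launch for GLIP-style configs; the entry point and flags
# below are an assumption based on the GLIP codebase, not stated in this file:
#   python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py \
#       --config-file path/to/this_config.yaml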