_BASE_: "./Base-RCNN-C4.yaml"
MODEL:
  META_ARCHITECTURE: "CLIPFastRCNN"  # "CLIPRCNN" # "GeneralizedRCNN"
  BACKBONE:
    NAME: "build_clip_swin"  # "build_resnet_fpn_backbone"
    FREEZE_AT: 2
  TEXT_BACKBONE:
    NAME: "build_clip_swin_text_backbone"
  SPEC:
    EMBED_DIM: 512
    VISION:
      # Swin-Base settings (embed dim 128, depths 2/2/18/2, heads 4/8/16/32)
      PATCH_SIZE: 4
      IN_CHANS: 3
      EMBED_DIM: 128
      DEPTHS: [ 2, 2, 18, 2 ]
      NUM_HEADS: [ 4, 8, 16, 32 ]
      WINDOW_SIZE: 7
      MLP_RATIO: 4.
      QKV_BIAS: True
      APE: False
      PATCH_NORM: True
      DROP_RATE: 0.0
      DROP_PATH_RATE: 0.2
      OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
    TEXT:
      NAME: 'transformer'
      TOKENIZER: clip
      CONTEXT_LENGTH: 77
      WIDTH: 512
      HEADS: 8
      LAYERS: 12
  WEIGHTS: ""  # "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
  MASK_ON: True
  RPN:
    HEAD_NAME: StandardRPNHead
    IN_FEATURES: ["stage4"]
  ROI_HEADS:
    NAME: "CLIPSwinROIHeads"  # "Res5ROIHeads" # "StandardROIHeads"
    IN_FEATURES: ["stage4"]
    NUM_CLASSES: 1203  # LVIS v1 has 1203 categories
    SCORE_THRESH_TEST: 0.0001
  ROI_BOX_HEAD:
    NAME: ""
    NUM_FC: 0
    POOLER_RESOLUTION: 14
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 0
    POOLER_RESOLUTION: 14
  PIXEL_MEAN: [0.485, 0.456, 0.406]
  PIXEL_STD: [0.229, 0.224, 0.225]
DATASETS:
  TRAIN: ("lvis_v1_train",)
  TEST: ("lvis_v1_val",)
TEST:
  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
  EVAL_PERIOD: 25000
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (120000, 160000)
  MAX_ITER: 180000  # 180000 * 16 / 100000 ~ 28.8 epochs
DATALOADER:
  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
  REPEAT_THRESHOLD: 0.001
INPUT:
  MIN_SIZE_TRAIN_SAMPLING: choice
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
  FORMAT: "RGB"