# Experiment config layered on top of Base-RCNN-C4.yaml.
# Selects the CLIPFastRCNN meta-architecture with a CLIP ResNet visual
# backbone and a CLIP language encoder, classifying regions against
# pre-computed text embeddings for the 1203 LVIS v1 categories.
# Trains on lvis_v1_train and evaluates on lvis_v1_val.
#
# NOTE: parenthesized values such as (640, 672) are plain YAML strings;
# presumably the config loader converts them to tuples - confirm before
# changing their syntax.
_BASE_: "./Base-RCNN-C4.yaml"

MODEL:
  META_ARCHITECTURE: "CLIPFastRCNN"  # "CLIPRCNN" # "GeneralizedRCNN"
  MASK_ON: False
  WEIGHTS: "./regionclip_r50x4.pth"
  BACKBONE:
    NAME: "build_clip_resnet_backbone"  # "build_resnet_fpn_backbone"
    FREEZE_AT: 2
  TEXT_BACKBONE:
    NAME: "build_clip_language_encoder"
  CLIP:
    # Region proposals come from an offline RPN described by its own config.
    CROP_REGION_TYPE: "RPN"
    OFFLINE_RPN_CONFIG: "./configs/mask_rcnn_R_50_FPN_1x.yaml"
    USE_TEXT_EMB_CLASSIFIER: True
    TEXT_EMB_PATH: "./lvis_1203_cls_emb_notnorm_rn50x4.pth"
    NO_BOX_DELTA: True
    OFFLINE_RPN_NMS_THRESH: 0.7
    CLSS_TEMP: 0.01
    MULTIPLY_RPN_SCORE: True
    TEXT_EMB_DIM: 640
  RESNETS:
    # NOTE(review): 200 is not a standard ResNet depth; presumably it is the
    # project's identifier for the R50x4 variant named in WEIGHTS - confirm.
    DEPTH: 200
    OUT_FEATURES: ["res4"]
    NORM: FrozenBN
    STEM_OUT_CHANNELS: 64
    RES2_OUT_CHANNELS: 256
  RPN:
    HEAD_NAME: StandardRPNHead
    IN_FEATURES: ["res4"]
    POST_NMS_TOPK_TEST: 500
    # Was a bare `NMS_THRESH:` (loads as null). Set explicitly to 0.7, the
    # detectron2 default, which also matches OFFLINE_RPN_NMS_THRESH above.
    NMS_THRESH: 0.7
  ROI_HEADS:
    NAME: "CLIPRes5ROIHeads"  # "Res5ROIHeads" # "StandardROIHeads"
    IN_FEATURES: ["res4"]
    NUM_CLASSES: 1203
    NMS_THRESH_TEST: 0.3
    SCORE_THRESH_TEST: 0.0
  ROI_BOX_HEAD:
    NAME: ""
    NUM_FC: 0
    CLS_AGNOSTIC_BBOX_REG: True
    POOLER_RESOLUTION: 18
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 0
    POOLER_RESOLUTION: 14
  # Per-channel normalization statistics on a 0-1 scale.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]

# Single INPUT mapping: the file previously declared INPUT twice, which is a
# duplicate key (most parsers silently keep only the last occurrence).
INPUT:
  MIN_SIZE_TRAIN_SAMPLING: choice
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
  FORMAT: "RGB"

DATASETS:
  TRAIN: ("lvis_v1_train",)
  TEST: ("lvis_v1_val",)

TEST:
  DETECTIONS_PER_IMAGE: 300  # LVIS allows up to 300
  EVAL_PERIOD: 25000

SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (120000, 160000)
  MAX_ITER: 180000  # 180000 * 16 / 100000 ~ 28.8 epochs

DATALOADER:
  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
  REPEAT_THRESHOLD: 0.001