# regionclip-demo/configs/CLIP_fast_rcnn_R_50_C4.yaml
# Last change by jw2yang: "Update configs/CLIP_fast_rcnn_R_50_C4.yaml"
# (commit 1caa1f7)
# Parent config this file builds on; the sections below override or extend
# the values it provides (path is relative to this file).
_BASE_: "./Base-RCNN-C4.yaml"
# Model definition: CLIP-based Fast R-CNN on a C4 backbone, classifying
# regions against precomputed text embeddings.
MODEL:
  META_ARCHITECTURE: "CLIPFastRCNN"  # "CLIPRCNN" # "GeneralizedRCNN"
  MASK_ON: False
  WEIGHTS: "./regionclip_r50x4.pth"
  # Visual encoder.
  BACKBONE:
    NAME: "build_clip_resnet_backbone"  # "build_resnet_fpn_backbone"
    FREEZE_AT: 2
  # Language encoder.
  TEXT_BACKBONE:
    NAME: "build_clip_language_encoder"
  # CLIP-specific options: where region crops come from and how region
  # features are scored against the text embeddings.
  CLIP:
    CROP_REGION_TYPE: "RPN"
    # Separate config used for the offline RPN.
    OFFLINE_RPN_CONFIG: "./configs/mask_rcnn_R_50_FPN_1x.yaml"
    USE_TEXT_EMB_CLASSIFIER: True
    TEXT_EMB_PATH: "./lvis_1203_cls_emb_notnorm_rn50x4.pth"
    NO_BOX_DELTA: True
    OFFLINE_RPN_NMS_THRESH: 0.7
    CLSS_TEMP: 0.01
    MULTIPLY_RPN_SCORE: True
    TEXT_EMB_DIM: 640
  RESNETS:
    DEPTH: 200
    OUT_FEATURES: ["res4"]
    NORM: FrozenBN
    STEM_OUT_CHANNELS: 64
    RES2_OUT_CHANNELS: 256
  RPN:
    HEAD_NAME: StandardRPNHead
    IN_FEATURES: ["res4"]
    POST_NMS_TOPK_TEST: 500
    # An empty value here parses as null and would replace the numeric
    # default with None. 0.7 is the detectron2 default and equals
    # OFFLINE_RPN_NMS_THRESH above -- TODO confirm the intended value.
    NMS_THRESH: 0.7
  ROI_HEADS:
    NAME: "CLIPRes5ROIHeads"  # "Res5ROIHeads" # "StandardROIHeads"
    IN_FEATURES: ["res4"]
    # Same count as the "lvis_1203" class embeddings in TEXT_EMB_PATH.
    NUM_CLASSES: 1203
    NMS_THRESH_TEST: 0.3
    SCORE_THRESH_TEST: 0.0
  ROI_BOX_HEAD:
    NAME: ""
    NUM_FC: 0
    CLS_AGNOSTIC_BBOX_REG: True
    POOLER_RESOLUTION: 18
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 0
    POOLER_RESOLUTION: 14
  # Per-channel normalization statistics; the values are on a 0-1 scale.
  PIXEL_MEAN: [0.48145466, 0.4578275, 0.40821073]
  PIXEL_STD: [0.26862954, 0.26130258, 0.27577711]
# The INPUT section (including MIN_SIZE_TRAIN) is defined once, further down
# in this file. A second top-level INPUT key is not repeated here: duplicate
# mapping keys are invalid YAML and the later section would override it.
# Dataset names used for training and for evaluation.
# The tuple literals are kept as plain scalars exactly as written.
DATASETS:
  TRAIN: ("lvis_v1_train",)
  TEST: ("lvis_v1_val",)
# Evaluation settings.
TEST:
  DETECTIONS_PER_IMAGE: 10  # capped at 10 here; LVIS allows up to 300
  EVAL_PERIOD: 25000
# Optimization schedule.
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  # Iterations listed for the learning-rate schedule; both fall before MAX_ITER.
  STEPS: (120000, 160000)
  MAX_ITER: 180000  # 180000 * 16 / 100000 ~ 28.8 epochs
# Training-time sampling strategy.
DATALOADER:
  SAMPLER_TRAIN: "RepeatFactorTrainingSampler"
  # Threshold passed to the sampler named above.
  REPEAT_THRESHOLD: 0.001
# Input image sizing and channel order.
INPUT:
  # "choice" with a list of sizes: presumably one MIN_SIZE_TRAIN entry is
  # picked per training image -- confirm against the loader.
  MIN_SIZE_TRAIN_SAMPLING: choice
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333
  FORMAT: "RGB"