_BASE_: "Base_OVCOCO_C4_1x.yaml"
MODEL:
  WEIGHTS: "models/BoxSup_OVCOCO_CLIP_R50_1x.pth"
  WITH_CAPTION: True
  SYNC_CAPTION_BATCH: True
  ROI_BOX_HEAD:
    WS_NUM_PROPS: 1
    ADD_IMAGE_BOX: True
    NEG_CAP_WEIGHT: 1.0
SOLVER:
  IMS_PER_BATCH: 16
  BASE_LR: 0.02
  STEPS: (60000, 80000)
  MAX_ITER: 90000
DATASETS:
  # First entry provides box annotations, second provides caption/image-label supervision.
  TRAIN: ("coco_zeroshot_train_oriorder", "coco_caption_train_tags")
INPUT:
  CUSTOM_AUG: ResizeShortestEdge
  MIN_SIZE_TRAIN_SAMPLING: range
  MIN_SIZE_TRAIN: (800, 800)
DATALOADER:
  SAMPLER_TRAIN: "MultiDatasetSampler"
  # Sampling ratio between the box-labeled and caption datasets.
  DATASET_RATIO: [1, 4]
  # Use per-dataset batch sizes and input resolutions.
  USE_DIFF_BS_SIZE: True
  DATASET_BS: [2, 8]
  USE_RFS: [False, False]
  DATASET_MIN_SIZES: [[800, 800], [400, 400]]
  DATASET_MAX_SIZES: [1333, 667]
  FILTER_EMPTY_ANNOTATIONS: False
  MULTI_DATASET_GROUPING: True
  # Annotation type supplied by each training dataset.
  DATASET_ANN: ['box', 'caption']
  NUM_WORKERS: 8
WITH_IMAGE_LABELS: True