# Experiment config layered on top of the file named in _BASE_.
# NOTE(review): the original was collapsed onto a single line; the nesting
# below is reconstructed from the key names and the Detectron2-style config
# schema. Confirm the section each key belongs to against the project's
# config defaults.
_BASE_: "Base-C2_L_R5021k_640b64_4x.yaml"

MODEL:
  ROI_BOX_HEAD:
    USE_ZEROSHOT_CLS: True
    IMAGE_LABEL_LOSS: 'max_size'
  BACKBONE:
    NAME: build_swintransformer_fpn_backbone
  # NOTE(review): SWIN, FPN and WEIGHTS are assumed to sit directly under
  # MODEL (siblings of BACKBONE) - confirm.
  SWIN:
    SIZE: B-22k
  FPN:
    IN_FEATURES: ["swin1", "swin2", "swin3"]
  # Checkpoint path used for initial weights.
  WEIGHTS: "models/BoxSup-C2_Lbase_CLIP_SwinB_896b32_4x.pth"

SOLVER:
  MAX_ITER: 180000
  IMS_PER_BATCH: 32
  BASE_LR: 0.0001
  WARMUP_ITERS: 1000
  WARMUP_FACTOR: 0.001

DATASETS:
  # Kept as a plain scalar exactly as written; presumably evaluated into a
  # tuple by the config loader - do not re-quote or convert to a YAML list
  # without checking the consumer.
  TRAIN: ("lvis_v1_train_norare","imagenet_lvis_v1")

DATALOADER:
  SAMPLER_TRAIN: "MultiDatasetSampler"
  # The two-element lists below presumably pair positionally with the two
  # entries of DATASETS.TRAIN - verify against the sampler implementation.
  DATASET_RATIO: [1, 4]
  USE_DIFF_BS_SIZE: True
  DATASET_BS: [4, 16]
  DATASET_INPUT_SIZE: [896, 448]
  USE_RFS: [True, False]
  DATASET_INPUT_SCALE: [[0.1, 2.0], [0.5, 1.5]]
  FILTER_EMPTY_ANNOTATIONS: False
  MULTI_DATASET_GROUPING: True
  DATASET_ANN: ['box', 'image']
  NUM_WORKERS: 8

# NOTE(review): assumed to be a top-level key rather than part of
# DATALOADER - confirm.
WITH_IMAGE_LABELS: True