# ----------------------------------------------------------------------------
# Train a Mask R-CNN with a ResNet-50 and FPN backbone. This config follows
# the Detectron2 format and is unrelated to our VirTex configs. Parameters
# here replicate the evaluation protocol of MoCo
# (https://arxiv.org/abs/1911.05722).
# ----------------------------------------------------------------------------

# Input image settings: channel order and MIN_SIZE values for train and test.
INPUT:
  # Input format will always be RGB, consistent with torchvision.
  FORMAT: "RGB"
  # Six candidate MIN_SIZE values for training (640 to 800 in steps of 32).
  # NOTE(review): to YAML itself the parenthesized value is a plain string;
  # presumably the Detectron2 config loader evaluates it into a tuple, so keep
  # the parentheses as written -- confirm against the loader.
  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
  # A single MIN_SIZE for testing; equal to the largest training value above.
  MIN_SIZE_TEST: 800

# Model definition: meta-architecture, backbone (ResNet + FPN), anchor
# generator, RPN, ROI heads and input normalization statistics.
MODEL:
  META_ARCHITECTURE: "GeneralizedRCNN"

  # Train all layers end-to-end by default.
  # NOTE(review): FREEZE_AT: 0 presumably means "freeze no backbone stage";
  # confirm against the Detectron2 config reference.
  BACKBONE:
    NAME: "build_resnet_fpn_backbone"
    FREEZE_AT: 0

  # Fine-tune with SyncBN.
  # STRIDE_IN_1X1 is False for torchvision-like models.
  RESNETS:
    DEPTH: 50
    NORM: "SyncBN"
    STRIDE_IN_1X1: False
    # The same four stages are listed in FPN.IN_FEATURES below; keep the two
    # lists in sync when editing either one.
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]

  FPN:
    # Identical to RESNETS.OUT_FEATURES above.
    IN_FEATURES: ["res2", "res3", "res4", "res5"]

  ANCHOR_GENERATOR:
    # One size for each in feature map
    # (five entries here, matching the five levels in RPN.IN_FEATURES below).
    SIZES: [[32], [64], [128], [256], [512]]
    # Three aspect ratios (same for all in feature maps)
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]

  RPN:
    # Five pyramid levels; "p6" appears here but not in ROI_HEADS.IN_FEATURES.
    IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
    # Top-k limits before NMS: 2000 when training, 1000 when testing.
    # NOTE(review): whether these apply per FPN level or per image is not
    # visible in this file -- confirm in the Detectron2 config reference.
    PRE_NMS_TOPK_TRAIN: 2000
    PRE_NMS_TOPK_TEST: 1000

    # Top-k limits after NMS: 1000 for both training and testing.
    POST_NMS_TOPK_TRAIN: 1000
    POST_NMS_TOPK_TEST: 1000

  ROI_HEADS:
    NAME: "StandardROIHeads"
    # Four pyramid levels only (p2-p5); RPN above additionally uses "p6".
    IN_FEATURES: ["p2", "p3", "p4", "p5"]

  # Box head: NUM_FC of 2 with a pooler resolution of 7.
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7

  # Mask head: NUM_CONV of 4 with a pooler resolution of 14, i.e. twice the
  # pooler resolution of the box head above.
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14

  # ImageNet color mean for torchvision-like models (RGB order).
  # These are in [0-255] range as expected by Detectron2. Rest of our codebase
  # uses [0-1] range; but both are equivalent and consistent.
  # Each value is the corresponding [0-1] statistic multiplied by 255, e.g.
  # 0.485 * 255 = 123.675 for the mean and 0.229 * 255 = 58.395 for the std.
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]

# Optimizer settings: batch size and base learning rate.
SOLVER:
  # This is for 8 GPUs, apply linear scaling for 4 GPUs.
  # Linear scaling means changing both values by the same factor, i.e. for
  # 4 GPUs use IMS_PER_BATCH: 8 and BASE_LR: 0.01.
  # NOTE(review): this assumes IMS_PER_BATCH is the total batch size across
  # all GPUs rather than a per-GPU value -- confirm.
  IMS_PER_BATCH: 16
  BASE_LR: 0.02

# Evaluation-time settings.
TEST:
  # Precise-BN is switched on for evaluation.
  # NOTE(review): presumably this re-estimates BatchNorm statistics before
  # testing, which would pair with the SyncBN fine-tuning configured under
  # MODEL.RESNETS -- confirm in the Detectron2 documentation.
  PRECISE_BN:
    ENABLED: True

# NOTE(review): presumably the Detectron2 config schema version that the keys
# in this file are written against -- confirm before changing.
VERSION: 2