# Combined detectron2 LazyConfig script. It stitches together, in order:
#   coco_loader_lsj.py                  -> defines `dataloader`
#   mask_rcnn_vitdet_b_100ep.py         -> defines `model`, `train`, `lr_multiplier`, `optimizer`
#   cascade_mask_rcnn_vitdet_b_100ep.py -> switches `model.roi_heads` to CascadeROIHeads
#   cascade_mask_rcnn_vitdet_h_75ep.py  -> larger backbone + 75-epoch schedule overrides
# Later sections mutate the objects created by earlier ones, so statement
# order between sections matters.

from functools import partial

import detectron2.data.transforms as T
from detectron2 import model_zoo
from detectron2.config import LazyCall as L
from detectron2.layers import ShapeSpec
from detectron2.modeling.backbone.vit import get_vit_lr_decay_rate
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.matcher import Matcher
from detectron2.modeling.roi_heads import (
    CascadeROIHeads,
    FastRCNNConvFCHead,
    FastRCNNOutputLayers,
)
from detectron2.solver import WarmupParamScheduler
from fvcore.common.param_scheduler import MultiStepParamScheduler

## coco_loader_lsj.py

# Data using LSJ
image_size = 1024
dataloader = model_zoo.get_config("common/data/coco.py").dataloader
dataloader.train.mapper.augmentations = [
    L(T.RandomFlip)(horizontal=True),  # flip first
    L(T.ResizeScale)(
        min_scale=0.1,
        max_scale=2.0,
        target_height=image_size,
        target_width=image_size,
    ),
    L(T.FixedSizeCrop)(crop_size=(image_size, image_size), pad=False),
]
dataloader.train.mapper.image_format = "RGB"
dataloader.train.total_batch_size = 64
# recompute boxes due to cropping
dataloader.train.mapper.recompute_boxes = True

dataloader.test.mapper.augmentations = [
    L(T.ResizeShortestEdge)(short_edge_length=image_size, max_size=image_size),
]

# mask_rcnn_vitdet_b_100ep.py

model = model_zoo.get_config("common/models/mask_rcnn_vitdet.py").model

# Initialization and trainer settings
train = model_zoo.get_config("common/train.py").train
train.amp.enabled = True
train.ddp.fp16_compression = True
train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth"

# Schedule
# 100 ep = 184375 iters * 64 images/iter / 118000 images/ep
train.max_iter = 184375

lr_multiplier = L(WarmupParamScheduler)(
    scheduler=L(MultiStepParamScheduler)(
        values=[1.0, 0.1, 0.01],
        milestones=[163889, 177546],
        num_updates=train.max_iter,
    ),
    # Evaluated eagerly here with max_iter == 184375; the 75ep section below
    # rescales max_iter and the milestones but does not touch this value.
    # NOTE(review): confirm that leaving warmup_length unchanged is intended.
    warmup_length=250 / train.max_iter,
    warmup_factor=0.001,
)

# Optimizer
optimizer = model_zoo.get_config("common/optim.py").AdamW
optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, num_layers=12, lr_decay_rate=0.7)
optimizer.params.overrides = {"pos_embed": {"weight_decay": 0.0}}

# cascade_mask_rcnn_vitdet_b_100ep.py

# arguments that don't exist for Cascade R-CNN
for _stale_key in ("box_head", "box_predictor", "proposal_matcher"):
    model.roi_heads.pop(_stale_key)
del _stale_key  # keep the loop variable out of this config module's namespace

# Three cascade stages: one box head, one box predictor and one matcher per
# stage, with per-stage box-transform weights and matcher thresholds.
model.roi_heads.update(
    _target_=CascadeROIHeads,
    box_heads=[
        L(FastRCNNConvFCHead)(
            input_shape=ShapeSpec(channels=256, height=7, width=7),
            conv_dims=[256, 256, 256, 256],
            fc_dims=[1024],
            conv_norm="LN",
        )
        for _ in range(3)
    ],
    box_predictors=[
        L(FastRCNNOutputLayers)(
            input_shape=ShapeSpec(channels=1024),
            test_score_thresh=0.05,
            box2box_transform=L(Box2BoxTransform)(weights=(w1, w1, w2, w2)),
            cls_agnostic_bbox_reg=True,
            # Interpolation string resolved by the config system, not by Python;
            # presumably points at roi_heads.num_classes -- TODO confirm.
            num_classes="${...num_classes}",
        )
        for (w1, w2) in [(10, 5), (20, 10), (30, 15)]
    ],
    proposal_matchers=[
        L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False)
        for th in [0.5, 0.6, 0.7]
    ],
)

# cascade_mask_rcnn_vitdet_h_75ep.py

train.init_checkpoint = "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth"

model.backbone.net.embed_dim = 1280
model.backbone.net.depth = 32
model.backbone.net.num_heads = 16
model.backbone.net.drop_path_rate = 0.5
# 7, 15, 23, 31 for global attention
# (every block index in 0..31 except those four is listed as a window block)
model.backbone.net.window_block_indexes = (
    list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31))
)

# Replaces the 12-layer / 0.7 decay settings assigned in the 100ep section.
optimizer.params.lr_factor_func = partial(get_vit_lr_decay_rate, lr_decay_rate=0.9, num_layers=32)
optimizer.params.overrides = {}
optimizer.params.weight_decay_norm = None

train.max_iter = train.max_iter * 3 // 4  # 100ep -> 75ep (184375 -> 138281)
# Scale the LR milestones by the same 3/4 factor (integer floor division).
lr_multiplier.scheduler.milestones = [
    milestone * 3 // 4 for milestone in lr_multiplier.scheduler.milestones
]
lr_multiplier.scheduler.num_updates = train.max_iter