# Segmentation model configuration: an 'EncoderDecoder' wrapping a
# 'CloudAdapterDinoVisionTransformer' backbone and a 'Mask2FormerHead'
# decode head. Component classes are referenced by their registry `type`
# strings only; nothing is imported or instantiated in this file.

# Spatial size (two equal dimensions of 512) reused by the data preprocessor.
crop_size = (512, 512)

model = dict(
    backbone=dict(
        # One adapter index per backbone block: 0..23, matching depth=24.
        adapter_index=[
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
        ],
        block_chunks=0,
        depth=24,
        embed_dim=1024,
        ffn_bias=True,
        ffn_layer='mlp',
        has_cat=False,
        img_size=512,
        init_values=1e-05,
        mlp_ratio=4,
        num_heads=16,
        cloud_adapter_config=dict(
            cnn_type='pmaa',
            context_dim=64,
            depth=4,
            # NOTE(review): key is spelled `emd_dim` (not `embed_dim`);
            # kept verbatim -- presumably the adapter's own argument name,
            # confirm against the 'CloudAdapter' implementation.
            emd_dim=1024,
            global_groups=1,
            hidden_channels=64,
            int_type='convnext',
            local_groups=1,
            # Same value as the backbone `depth` above.
            num_layers=24,
            rank_dim=16,
            return_last_feature=False,
            return_multi_feats=False,
            type='CloudAdapter',
        ),
        patch_size=16,
        proj_bias=True,
        qkv_bias=True,
        type='CloudAdapterDinoVisionTransformer',
    ),
    data_preprocessor=dict(
        bgr_to_rgb=True,
        mean=[
            123.675,
            116.28,
            103.53,
        ],
        pad_val=0,
        seg_pad_val=255,
        # Reuse `crop_size` instead of repeating the (512, 512) literal so
        # the two values cannot drift apart.
        size=crop_size,
        std=[
            58.395,
            57.12,
            57.375,
        ],
        type='SegDataPreProcessor',
    ),
    decode_head=dict(
        align_corners=False,
        enforce_decoder_input_project=False,
        feat_channels=256,
        # Four input feature maps, each with the backbone's 1024 channels;
        # same length as `strides` below.
        in_channels=[
            1024,
            1024,
            1024,
            1024,
        ],
        loss_cls=dict(
            # Five weights for num_classes=4: presumably the trailing 0.1
            # is the "no object" weight -- confirm against the
            # 'Mask2FormerHead' documentation.
            class_weight=[
                1.0,
                1.0,
                1.0,
                1.0,
                0.1,
            ],
            loss_weight=2.0,
            reduction='mean',
            type='mmdet.CrossEntropyLoss',
            use_sigmoid=False,
        ),
        loss_dice=dict(
            activate=True,
            eps=1.0,
            loss_weight=5.0,
            naive_dice=True,
            reduction='mean',
            type='mmdet.DiceLoss',
            use_sigmoid=True,
        ),
        loss_mask=dict(
            loss_weight=5.0,
            reduction='mean',
            type='mmdet.CrossEntropyLoss',
            use_sigmoid=True,
        ),
        num_classes=4,
        num_queries=100,
        num_transformer_feat_level=3,
        out_channels=256,
        pixel_decoder=dict(
            act_cfg=dict(type='ReLU'),
            encoder=dict(
                init_cfg=None,
                layer_cfg=dict(
                    ffn_cfg=dict(
                        act_cfg=dict(inplace=True, type='ReLU'),
                        embed_dims=256,
                        feedforward_channels=1024,
                        ffn_drop=0.0,
                        num_fcs=2,
                    ),
                    self_attn_cfg=dict(
                        batch_first=True,
                        dropout=0.0,
                        embed_dims=256,
                        im2col_step=64,
                        init_cfg=None,
                        norm_cfg=None,
                        num_heads=8,
                        # Equals num_outs / num_transformer_feat_level (3).
                        num_levels=3,
                        num_points=4,
                    ),
                ),
                num_layers=6,
            ),
            init_cfg=None,
            norm_cfg=dict(num_groups=32, type='GN'),
            num_outs=3,
            positional_encoding=dict(normalize=True, num_feats=128),
            type='mmdet.MSDeformAttnPixelDecoder',
        ),
        positional_encoding=dict(normalize=True, num_feats=128),
        strides=[
            4,
            8,
            16,
            32,
        ],
        train_cfg=dict(
            assigner=dict(
                # Cost weights (2.0 / 5.0 / 5.0) equal the loss_weight
                # values of loss_cls / loss_mask / loss_dice above.
                match_costs=[
                    dict(type='mmdet.ClassificationCost', weight=2.0),
                    dict(
                        type='mmdet.CrossEntropyLossCost',
                        use_sigmoid=True,
                        weight=5.0,
                    ),
                    dict(
                        eps=1.0,
                        pred_act=True,
                        type='mmdet.DiceCost',
                        weight=5.0,
                    ),
                ],
                type='mmdet.HungarianAssigner',
            ),
            importance_sample_ratio=0.75,
            num_points=12544,
            oversample_ratio=3.0,
            sampler=dict(type='mmdet.MaskPseudoSampler'),
        ),
        transformer_decoder=dict(
            init_cfg=None,
            layer_cfg=dict(
                cross_attn_cfg=dict(
                    attn_drop=0.0,
                    batch_first=True,
                    dropout_layer=None,
                    embed_dims=256,
                    num_heads=8,
                    proj_drop=0.0,
                ),
                ffn_cfg=dict(
                    act_cfg=dict(inplace=True, type='ReLU'),
                    add_identity=True,
                    dropout_layer=None,
                    embed_dims=256,
                    feedforward_channels=2048,
                    ffn_drop=0.0,
                    num_fcs=2,
                ),
                self_attn_cfg=dict(
                    attn_drop=0.0,
                    batch_first=True,
                    dropout_layer=None,
                    embed_dims=256,
                    num_heads=8,
                    proj_drop=0.0,
                ),
            ),
            num_layers=9,
            return_intermediate=True,
        ),
        type='Mask2FormerHead',
    ),
    test_cfg=dict(mode='whole'),
    train_cfg=dict(),
    type='EncoderDecoder',
)