from torch.nn import GroupNorm, ReLU

from mmdet.models import MSDeformAttnPixelDecoder, CrossEntropyLoss, DiceLoss, FocalLoss
from mmdet.models.task_modules.assigners import HungarianAssigner, ClassificationCost, CrossEntropyLossCost, DiceCost
from mmdet.models.task_modules.samplers import MaskPseudoSampler

from seg.models.detectors import Mask2formerVideo
from seg.models.fusion_head import OMGFusionHead
from seg.models.heads import Mask2FormerVideoHead
from seg.models.backbones import OpenCLIPBackbone

num_things_classes = 80
num_stuff_classes = 53

ov_model_name = 'convnext_large_d_320'
ov_datasets_name = 'CocoPanopticOVDataset'

model = dict(
    type=Mask2formerVideo,
    data_preprocessor=None,  # to fill
    backbone=dict(
        type=OpenCLIPBackbone,
        model_name='convnext_large_d_320',
        fix=True,
        init_cfg=dict(
            type='clip_pretrain',
            checkpoint='laion2b_s29b_b131k_ft_soup'
        )
    ),
    panoptic_head=dict(
        init_cfg=dict(
            type='Pretrained',
            checkpoint='./models/omg_seg_convl.pth',
            prefix='panoptic_head.'
        ),
        type=Mask2FormerVideoHead,
        sphere_cls=True,
        ov_classifier_name=f'{ov_model_name}_{ov_datasets_name}',
        logit=None,
        enable_box_query=True,
        in_channels=[192, 384, 768, 1536],  # passed to pixel_decoder inside
        strides=[4, 8, 16, 32],
        feat_channels=256,
        out_channels=256,
        num_things_classes=num_things_classes,
        num_stuff_classes=num_stuff_classes,
        num_queries=300,
        num_transformer_feat_level=3,
        pixel_decoder=dict(
            type=MSDeformAttnPixelDecoder,
            num_outs=3,
            norm_cfg=dict(type=GroupNorm, num_groups=32),
            act_cfg=dict(type=ReLU),
            encoder=dict(  # DeformableDetrTransformerEncoder
                num_layers=6,
                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                        embed_dims=256,
                        num_heads=8,
                        num_levels=3,
                        num_points=4,
                        dropout=0.0,
                        batch_first=True),
                    ffn_cfg=dict(
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.0,
                        act_cfg=dict(type=ReLU, inplace=True)))),
            positional_encoding=dict(num_feats=128, normalize=True)),
        enforce_decoder_input_project=False,
        positional_encoding=dict(num_feats=128, normalize=True),
        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
            return_intermediate=True,
            num_layers=9,
            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.0,
                    batch_first=True),
                cross_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    dropout=0.0,
                    batch_first=True),
                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    ffn_drop=0.0,
                    act_cfg=dict(type=ReLU, inplace=True))),
            init_cfg=None),
        loss_cls=dict(
            type=CrossEntropyLoss,
            use_sigmoid=False,
            loss_weight=2.0,
            reduction='mean',
            class_weight=None  # [1.0] * num_classes + [0.1]
        ),
        loss_mask=dict(
            type=CrossEntropyLoss,
            use_sigmoid=True,
            reduction='mean',
            loss_weight=5.0),
        loss_dice=dict(
            type=DiceLoss,
            use_sigmoid=True,
            activate=True,
            reduction='mean',
            naive_dice=True,
            eps=1.0,
            loss_weight=5.0),
        loss_iou=dict(
            type=FocalLoss,
            use_sigmoid=True,
            loss_weight=2.0,
            reduction='mean'
        )
    ),
    panoptic_fusion_head=dict(
        type=OMGFusionHead,
        num_things_classes=num_things_classes,
        num_stuff_classes=num_stuff_classes,
        loss_panoptic=None,
        init_cfg=None
    ),
    train_cfg=dict(
        num_points=12544,
        oversample_ratio=3.0,
        importance_sample_ratio=0.75,
        assigner=dict(
            type=HungarianAssigner,
            match_costs=[
                dict(type=ClassificationCost, weight=2.0),
                dict(
                    type=CrossEntropyLossCost,
                    weight=5.0,
                    use_sigmoid=True),
                dict(type=DiceCost, weight=5.0, pred_act=True, eps=1.0)
            ]),
        sampler=dict(type=MaskPseudoSampler)),
    test_cfg=dict(
        panoptic_on=True,
        semantic_on=False,
        instance_on=True,
        # max_per_image is for instance segmentation.
        max_per_image=100,
        iou_thr=0.8,
        # In Mask2Former's panoptic post-processing, mask regions whose
        # score is below 0.5 are filtered out.
        filter_low_score=True,
        object_mask_thr=0.0,
    ),
    init_cfg=None
)
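
# ---------------------------------------------------------------------------
# Usage sketch (not part of the config itself): a minimal illustration of how
# an mmengine-style config like this one is typically loaded and built. The
# file path 'configs/omg_seg/omg_convl_coco.py' is hypothetical, and the
# sketch assumes `data_preprocessor` above has been filled in; exact loading
# behavior for class-valued `type=` fields can differ across mmengine
# versions, so treat this as a starting point rather than the canonical API.
#
#   from mmengine.config import Config
#   from mmdet.registry import MODELS
#   from mmdet.utils import register_all_modules
#
#   register_all_modules()  # register mmdet components in the default scope
#   cfg = Config.fromfile('configs/omg_seg/omg_convl_coco.py')
#   detector = MODELS.build(cfg.model)  # builds Mask2formerVideo recursively
#   detector.eval()
# ---------------------------------------------------------------------------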