# Model configuration for the 'SamMasa' model type.
#
# `prompt_embed_dim` is the single channel width referenced by the backbone
# output (`out_chans`), the mask decoder (`transformer_dim`) and the prompt
# encoder (`embed_dim`), so those three values stay equal by construction.
prompt_embed_dim = 256

model = dict(
    type='SamMasa',
    # Image encoder settings.
    # NOTE(review): depth=12 / embed_dim=768 / num_heads=12 look like a
    # ViT-B sized encoder -- confirm against the checkpoint being loaded.
    backbone=dict(
        type='ImageEncoderViT',
        depth=12,
        embed_dim=768,
        img_size=1024,
        mlp_ratio=4,
        num_heads=12,
        patch_size=16,
        qkv_bias=True,
        use_rel_pos=True,
        # Same block indices are used for global attention and for the
        # returned outputs.
        global_attn_indexes=[2, 5, 8, 11],
        window_size=14,
        out_chans=prompt_embed_dim,
        out_indices=[2, 5, 8, 11],
    ),
    # Mask decoder settings.
    mask_decoder=dict(
        type='MaskDecoder',
        num_multimask_outputs=3,
        transformer_dim=prompt_embed_dim,
        iou_head_depth=3,
        # Hard-coded, not tied to `prompt_embed_dim` (currently the same
        # value, 256).
        iou_head_hidden_dim=256,
    ),
    # Prompt encoder settings.
    prompt_encoder=dict(
        type='PromptEncoder',
        embed_dim=prompt_embed_dim,
        # 64 == img_size / patch_size (1024 / 16) for the backbone above;
        # presumably must be updated together with those values -- verify.
        image_embedding_size=(64, 64),
        input_image_size=(1024, 1024),
        mask_in_chans=16,
    ),
)