# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
from functools import partial

from segment_anything.modeling import (
    ImageEncoderViT,
    MaskDecoder,
    PromptEncoder,
    Sam,
    TwoWayTransformer,
)
from EdgeSAM.rep_vit import RepViT

# Shared SAM geometry: a 1024x1024 input is split into 16x16 patches,
# giving a 64x64 grid of 256-dim image embeddings.
prompt_embed_dim = 256
image_size = 1024
vit_patch_size = 16
image_embedding_size = image_size // vit_patch_size


def build_edge_sam(checkpoint=None, upsample_mode="bicubic"):
    """Build EdgeSAM with a RepViT-M1 image encoder."""
    image_encoder = RepViT(
        arch="m1",
        img_size=image_size,
        upsample_mode=upsample_mode,
    )
    return _build_sam(image_encoder, checkpoint)


sam_model_registry = {
    "default": build_edge_sam,
    "edge_sam": build_edge_sam,
}


def _build_sam_encoder(
    encoder_embed_dim,
    encoder_depth,
    encoder_num_heads,
    encoder_global_attn_indexes,
):
    # ViT encoder builder retained from the original SAM codebase; it is not
    # referenced by the EdgeSAM build functions in this file.
    image_encoder = ImageEncoderViT(
        depth=encoder_depth,
        embed_dim=encoder_embed_dim,
        img_size=image_size,
        mlp_ratio=4,
        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
        num_heads=encoder_num_heads,
        patch_size=vit_patch_size,
        qkv_bias=True,
        use_rel_pos=True,
        global_attn_indexes=encoder_global_attn_indexes,
        window_size=14,
        out_chans=prompt_embed_dim,
    )
    return image_encoder


def _build_sam(
    image_encoder,
    checkpoint=None,
):
    # Assemble the full SAM model: the given image encoder plus the standard
    # SAM prompt encoder and two-way-transformer mask decoder.
    sam = Sam(
        image_encoder=image_encoder,
        prompt_encoder=PromptEncoder(
            embed_dim=prompt_embed_dim,
            image_embedding_size=(image_embedding_size, image_embedding_size),
            input_image_size=(image_size, image_size),
            mask_in_chans=16,
        ),
        mask_decoder=MaskDecoder(
            num_multimask_outputs=3,
            transformer=TwoWayTransformer(
                depth=2,
                embedding_dim=prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
        ),
        # ImageNet mean/std used to normalize input pixels.
        pixel_mean=[123.675, 116.28, 103.53],
        pixel_std=[58.395, 57.12, 57.375],
    )
    sam.eval()
    if checkpoint is not None:
        # Load weights onto CPU first so building the model does not require a GPU.
        with open(checkpoint, "rb") as f:
            state_dict = torch.load(f, map_location="cpu")
        sam.load_state_dict(state_dict)
    return sam
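

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): one way a caller might
# construct an EdgeSAM model via the registry and run it through the standard
# segment_anything SamPredictor. The checkpoint argument and the input image
# below are placeholders for illustration; substitute a real checkpoint path
# and a real RGB image in your setup.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np
    from segment_anything import SamPredictor

    # Build the model; pass checkpoint="path/to/edge_sam.pth" to load weights.
    sam = sam_model_registry["edge_sam"](checkpoint=None)

    # Wrap it in the predictor and segment from a single foreground point.
    predictor = SamPredictor(sam)
    image = np.zeros((1024, 1024, 3), dtype=np.uint8)  # placeholder RGB image
    predictor.set_image(image)
    masks, scores, _ = predictor.predict(
        point_coords=np.array([[512, 512]]),
        point_labels=np.array([1]),  # 1 marks a foreground point
        multimask_output=True,  # yields num_multimask_outputs=3 candidate masks
    )
    print(masks.shape, scores)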