JeffLiang committed
Commit: 8c62972
Parent: ba09e2c

change sam_vit_h to sam_vit_l to save memory

app.py CHANGED
@@ -45,7 +45,7 @@ def inference(class_names, proposal_gen, granularity, input_img):
     if proposal_gen == 'MaskFormer':
         demo = VisualizationDemo(cfg)
     elif proposal_gen == 'Segment_Anything':
-        demo = SAMVisualizationDemo(cfg, granularity, './sam_vit_h_4b8939.pth', './ovseg_clip_l_9a1909.pth')
+        demo = SAMVisualizationDemo(cfg, granularity, './sam_vit_l_0b3195.pth', './ovseg_clip_l_9a1909.pth')
     class_names = class_names.split(',')
     img = read_image(input_img, format="BGR")
     _, visualized_output = demo.run_on_image(img, class_names)
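With this change the Gradio demo expects ./sam_vit_l_0b3195.pth next to app.py in place of the larger ViT-H weights. A minimal fetch sketch, assuming the checkpoint is still served from the public Segment Anything release (the URL below is inferred from the filename and should be checked against the segment-anything README):

```python
import urllib.request
from pathlib import Path

# Assumed URL: inferred from the checkpoint filename used in this commit;
# verify it against the official segment-anything release list before relying on it.
SAM_VIT_L_URL = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth"

ckpt = Path("./sam_vit_l_0b3195.pth")
if not ckpt.exists():
    urllib.request.urlretrieve(SAM_VIT_L_URL, str(ckpt))
print(f"{ckpt}: {ckpt.stat().st_size / 2**30:.2f} GiB on disk")
```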
open_vocab_seg/utils/predictor.py CHANGED
@@ -150,7 +150,7 @@ class SAMVisualizationDemo(object):
 
         self.parallel = parallel
         self.granularity = granularity
-        sam = sam_model_registry["vit_h"](checkpoint=sam_path).cuda()
+        sam = sam_model_registry["vit_l"](checkpoint=sam_path).cuda()
         self.predictor = SamAutomaticMaskGenerator(sam, points_per_batch=16)
         self.clip_model, _, _ = open_clip.create_model_and_transforms('ViT-L-14', pretrained=ovsegclip_path)
         self.clip_model.cuda()
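The first hunk swaps the SAM backbone from ViT-H to ViT-L, which roughly halves the checkpoint size and the GPU memory the mask generator needs. A self-contained sketch of loading that backbone with the segment-anything API; the blank test image is a hypothetical stand-in for a real RGB frame:

```python
import numpy as np
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator

# "vit_l" builds the mid-sized SAM backbone; "vit_h" would load the larger one.
sam = sam_model_registry["vit_l"](checkpoint="./sam_vit_l_0b3195.pth")
sam.to("cuda")

# points_per_batch also bounds peak memory: it limits how many prompt points
# are pushed through the mask decoder at once.
mask_generator = SamAutomaticMaskGenerator(sam, points_per_batch=16)

image = np.zeros((512, 512, 3), dtype=np.uint8)  # stand-in HxWx3 uint8 RGB image
masks = mask_generator.generate(image)           # list of dicts: 'segmentation', 'area', 'bbox', ...
```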
 
@@ -189,12 +189,17 @@ class SAMVisualizationDemo(object):
         txts = [f'a photo of {cls_name}' for cls_name in class_names]
         text = open_clip.tokenize(txts)
 
+        img_batches = torch.split(imgs, 32, dim=0)
+
         with torch.no_grad(), torch.cuda.amp.autocast():
-            image_features = self.clip_model.encode_image(imgs.cuda().half())
             text_features = self.clip_model.encode_text(text.cuda())
-            image_features /= image_features.norm(dim=-1, keepdim=True)
             text_features /= text_features.norm(dim=-1, keepdim=True)
-
+            image_features = []
+            for img_batch in img_batches:
+                image_feat = self.clip_model.encode_image(img_batch.cuda().half())
+                image_feat /= image_feat.norm(dim=-1, keepdim=True)
+                image_features.append(image_feat.detach())
+            image_features = torch.cat(image_features, dim=0)
             class_preds = (100.0 * image_features @ text_features.T).softmax(dim=-1)
             select_cls = torch.zeros_like(class_preds)
 
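The second hunk replaces a single encode_image call over every SAM proposal crop with chunks of 32, so peak activation memory is bounded by the chunk size rather than by the number of masks. A rough way to confirm that kind of saving is PyTorch's allocator counters; in the sketch below, clip_model and imgs are hypothetical stand-ins for the open_clip model and the stacked mask crops built in SAMVisualizationDemo:

```python
import torch

def peak_cuda_mib(fn) -> float:
    """Reset the CUDA allocator's high-water mark, run `fn` once, and
    return the peak memory it allocated, in MiB."""
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    with torch.no_grad(), torch.cuda.amp.autocast():
        fn()
    torch.cuda.synchronize()
    return torch.cuda.max_memory_allocated() / 2**20

def compare_peak_memory(clip_model, imgs, chunk=32):
    # clip_model / imgs are stand-ins for the objects used above: an open_clip
    # ViT-L/14 model on the GPU and an N x 3 x 224 x 224 tensor of mask crops.
    one_shot = peak_cuda_mib(lambda: clip_model.encode_image(imgs.cuda().half()))
    chunked = peak_cuda_mib(lambda: [clip_model.encode_image(b.cuda().half())
                                     for b in torch.split(imgs, chunk, dim=0)])
    print(f"one-shot: {one_shot:.0f} MiB   chunks of {chunk}: {chunked:.0f} MiB")
```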
 
sam_vit_l_0b3195.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3adcc4315b642a4d2101128f611684e8734c41232a17c648ed1693702a49a622
+size 1249524607
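Because the checkpoint is tracked with Git LFS, a fresh clone only holds this three-line pointer until `git lfs pull` fetches the real weights. A small check of a local copy against the size and sha256 recorded in the pointer:

```python
import hashlib
from pathlib import Path

# Values copied from the LFS pointer committed above.
EXPECTED_SHA256 = "3adcc4315b642a4d2101128f611684e8734c41232a17c648ed1693702a49a622"
EXPECTED_SIZE = 1249524607  # bytes (~1.25 GB)

ckpt = Path("sam_vit_l_0b3195.pth")
assert ckpt.stat().st_size == EXPECTED_SIZE, "size mismatch: did `git lfs pull` run?"

sha = hashlib.sha256()
with ckpt.open("rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):
        sha.update(block)
assert sha.hexdigest() == EXPECTED_SHA256, "sha256 mismatch: corrupt or partial download"
print("sam_vit_l_0b3195.pth matches its LFS pointer")
```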