hsshin98 committed
Commit aff8d56
1 Parent(s): f98e690
app.py CHANGED
@@ -41,7 +41,6 @@ def setup_cfg(args):
     add_cat_seg_config(cfg)
     cfg.merge_from_file(args.config_file)
     cfg.merge_from_list(args.opts)
-    cfg.MODEL.DEVICE = "cpu"
     cfg.freeze()
     return cfg
 
@@ -67,7 +66,10 @@ def get_parser():
             "MODEL.SEM_SEG_HEAD.TRAIN_CLASS_JSON", "datasets/voc20.json",
             "MODEL.SEM_SEG_HEAD.TEST_CLASS_JSON", "datasets/voc20.json",
             "TEST.SLIDING_WINDOW", "True",
-            "MODEL.SEM_SEG_HEAD.POOLING_SIZES", "[1,1]"],
+            "MODEL.SEM_SEG_HEAD.POOLING_SIZES", "[1,1]",
+            "MODEL.DEVICE", "cpu",
+            "MODEL.PROMPT_ENSEMBLE_TYPE", "single"
+        ],
         nargs=argparse.REMAINDER,
     )
     return parser
@@ -75,7 +77,7 @@ def get_parser():
 def save_masks(preds, text):
     preds = preds['sem_seg'].argmax(dim=0).cpu().numpy() # C H W
     for i, t in enumerate(text):
-        dir = f"masks/mask_{t}.png"
+        dir = f"mask_{t}.png"
         mask = preds == i
         cv2.imwrite(dir, mask * 255)
 
@@ -84,7 +86,7 @@ def predict(image, text):
     cfg = setup_cfg(args)
     demo = VisualizationDemo(cfg, text=text)
     predictions, visualized_output = demo.run_on_image(image)
-    # save_masks(predictions, text.split(','))
+    #save_masks(predictions, text.split(','))
     canvas = fc(visualized_output.fig)
     canvas.draw()
     out = np.frombuffer(canvas.tostring_rgb(), dtype='uint8').reshape(canvas.get_width_height()[::-1] + (3,))
@@ -97,7 +99,12 @@ if __name__ == "__main__":
 
     iface = gr.Interface(
         fn=predict,
-        inputs=[gr.Image(), gr.Textbox(placeholder="Classes to segment")],
+        inputs=[gr.Image(), gr.Textbox(placeholder='cat, person, background')],
         outputs="image",
-    )
+        description="""## CAT-Seg Demo
+Welcome to the CAT-Seg Demo! Here, we present the CAT-Seg with ViT-L model for open-vocabulary semantic segmentation.
+
+Please note that this is an optimized version of the full model, and as such, its performance may be limited compared to the full model.
+
+To get started, simply upload an image and a comma-separated list of categories, and let the model work its magic!""")
     iface.launch()
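For context on the `--opts` change above: the hard-coded `cfg.MODEL.DEVICE = "cpu"` override moves into the parser's default opts list, together with the new `MODEL.PROMPT_ENSEMBLE_TYPE` setting, so everything now flows through `cfg.merge_from_list(args.opts)` before `cfg.freeze()`. A minimal sketch of that mechanism, using a bare yacs `CfgNode` as a stand-in for the CAT-Seg config (in the real app the extra keys are registered by `add_cat_seg_config`, and the default values shown here are assumptions):

from yacs.config import CfgNode as CN

# Stand-in config; the real one comes from get_cfg() + add_cat_seg_config(cfg).
cfg = CN()
cfg.MODEL = CN()
cfg.MODEL.DEVICE = "cuda"                    # assumed default before the override
cfg.MODEL.PROMPT_ENSEMBLE_TYPE = "imagenet"  # hypothetical default value

# merge_from_list consumes the same flat [KEY, VALUE, KEY, VALUE, ...] list
# that get_parser() now builds as the default for --opts.
cfg.merge_from_list([
    "MODEL.DEVICE", "cpu",
    "MODEL.PROMPT_ENSEMBLE_TYPE", "single",
])
cfg.freeze()  # as in setup_cfg: any later write raises an error

assert cfg.MODEL.DEVICE == "cpu"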
cat_seg/modeling/transformer/cat_seg_predictor.py CHANGED
@@ -50,13 +50,13 @@ class CATSegPredictor(nn.Module):
 
         import json
         # use class_texts in train_forward, and test_class_texts in test_forward
-        with open(train_class_json, 'r') as f_in:
-            self.class_texts = json.load(f_in)
-        with open(test_class_json, 'r') as f_in:
-            self.test_class_texts = json.load(f_in)
-        assert self.class_texts != None
-        if self.test_class_texts == None:
-            self.test_class_texts = self.class_texts
+        #with open(train_class_json, 'r') as f_in:
+        #    self.class_texts = json.load(f_in)
+        #with open(test_class_json, 'r') as f_in:
+        #    self.test_class_texts = json.load(f_in)
+        #assert self.class_texts != None
+        #if self.test_class_texts == None:
+        #    self.test_class_texts = self.class_texts
         device = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = device
         self.tokenizer = None
@@ -84,12 +84,12 @@ class CATSegPredictor(nn.Module):
             prompt_templates = ['A photo of a {} in the scene',]
         else:
             raise NotImplementedError
+
+        #self.text_features = self.class_embeddings(self.class_texts, prompt_templates, clip_model).permute(1, 0, 2).float()
+        #self.text_features_test = self.class_embeddings(self.test_class_texts, prompt_templates, clip_model).permute(1, 0, 2).float()
 
         self.clip_model = clip_model.float()
         self.clip_preprocess = clip_preprocess
-
-        self.text_features = self.class_embeddings(self.class_texts, prompt_templates, clip_model).permute(1, 0, 2).float()
-        self.text_features_test = self.class_embeddings(self.test_class_texts, prompt_templates, clip_model).permute(1, 0, 2).float()
 
         transformer = Aggregator(
             text_guidance_dim=text_guidance_dim,
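With this change the predictor no longer loads class lists from the JSON files or precomputes text features at construction time; the demo injects the user's classes at runtime and calls `class_embeddings` itself (see `demo/predictor.py` below). An illustrative sketch of what a `class_embeddings`-style helper computes, with names and shapes inferred from the call sites in this diff rather than copied from the repository:

import clip
import torch

def class_embeddings(classnames, templates, clip_model):
    """Embed every (template, class) prompt with CLIP's text encoder."""
    with torch.no_grad():
        per_class = []
        for name in classnames:
            texts = [t.format(name) for t in templates]       # e.g. 'A photo of a cat in the scene'
            tokens = clip.tokenize(texts)                      # (T, 77) token ids
            feats = clip_model.encode_text(tokens)             # (T, D) text embeddings
            feats = feats / feats.norm(dim=-1, keepdim=True)   # unit-normalize each prompt
            per_class.append(feats)
        # Stack classes along dim=1 -> (T, C, D); the call sites in this
        # commit then .permute(1, 0, 2) to get (C, T, D).
        return torch.stack(per_class, dim=1)

Here `clip_model` would come from something like `clip.load("ViT-L/14", device="cpu")`; the per-template normalization and stacking order are assumptions consistent with the `.permute(1, 0, 2)` in the diff.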
demo/predictor.py CHANGED
@@ -43,8 +43,9 @@ class VisualizationDemo(object):
         pred = self.predictor.model.sem_seg_head.predictor
         pred.test_class_texts = text.split(',')
         pred.text_features_test = pred.class_embeddings(pred.test_class_texts,
-                                                        imagenet_templates.IMAGENET_TEMPLATES,
-                                                        pred.clip_model).permute(1, 0, 2).float()
+                                                        #imagenet_templates.IMAGENET_TEMPLATES,
+                                                        ['A photo of a {} in the scene',],
+                                                        pred.clip_model).permute(1, 0, 2).float().repeat(1, 80, 1)
         self.metadata = ns()
         self.metadata.stuff_classes = pred.test_class_texts
 
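The notable detail here is `.repeat(1, 80, 1)`: with a single prompt template the permuted features have only one entry along the template axis, and tiling it 80 times restores the ensemble width the pretrained aggregator presumably expects from the 80-entry `IMAGENET_TEMPLATES` list, while paying for just one text-encoder pass per class. A shape check, with the (C, T, D) layout inferred from the `permute(1, 0, 2)` call (an assumption, not taken from the repository):

import torch

num_classes, dim = 3, 768                    # e.g. 'cat,person,background'; 768 = CLIP ViT-L text width
single = torch.randn(num_classes, 1, dim)    # (C, templates=1, D) after permute(1, 0, 2)
ensembled = single.repeat(1, 80, 1)          # tile the lone template embedding 80x
assert ensembled.shape == (num_classes, 80, dim)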