nick_93 committed on
Commit c6fb6c8
1 Parent(s): 7ca6aff
app.py CHANGED
@@ -2,6 +2,7 @@ import os
 import sys
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'refer')))
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'stable-diffusion')))
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 'taming-transformers')))
 
@@ -10,8 +11,8 @@ os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), 'depth')))
 import cv2
 import numpy as np
 import torch
-import torch.backends.cudnn as cudnn
 from depth.models_depth.model import EVPDepth
+from models_refer.model import EVPRefer
 from depth.configs.train_options import TrainOptions
 from depth.configs.test_options import TestOptions
 import glob
@@ -22,6 +23,7 @@ from PIL import Image
 import torch.nn.functional as F
 import gradio as gr
 import tempfile
+from transformers import CLIPTokenizer
 
 
 css = """
@@ -37,7 +39,7 @@ css = """
 
 """
 
-def create_demo(model, device):
+def create_depth_demo(model, device):
     gr.Markdown("### Depth Prediction demo")
     with gr.Row():
         input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
@@ -65,24 +67,60 @@ def create_demo(model, device):
            return [colored_depth, tmp.name]
 
    submit.click(on_submit, inputs=[input_image], outputs=[depth_image, raw_file])
-    examples = gr.Examples(examples=["imgs/test_img1.jpg", "imgs/test_img2.jpg", "imgs/test_img3.jpg", "imgs/test_img4.jpg"],
+    examples = gr.Examples(examples=["imgs/test_img1.jpg", "imgs/test_img2.jpg", "imgs/test_img3.jpg", "imgs/test_img4.jpg", "imgs/test_img5.jpg"],
                           inputs=[input_image])
 
 
+def create_refseg_demo(model, tokenizer, device):
+    gr.Markdown("### Referring Segmentation demo")
+    with gr.Row():
+        input_image = gr.Image(label="Input Image", type='pil', elem_id='img-display-input')
+        refseg_image = gr.Image(label="Output Mask", elem_id='img-display-output')
+    input_text = gr.Textbox(label='Prompt', placeholder='Please upload your image first', lines=2)
+    submit = gr.Button("Submit")
+
+    def on_submit(image, text):
+        image = np.array(image)
+        image_t = transforms.ToTensor()(image).unsqueeze(0).to(device)
+        image_t = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])(image_t)
+        shape = image_t.shape
+        image_t = torch.nn.functional.interpolate(image_t, (512, 512), mode='bilinear', align_corners=True)
+        input_ids = tokenizer(text=text, truncation=True, max_length=40, return_length=True,
+                              return_overflowing_tokens=False, padding="max_length", return_tensors="pt")['input_ids'].to(device)
+
+        with torch.no_grad():
+            pred = model(image_t, input_ids)
+
+        pred = torch.nn.functional.interpolate(pred, shape[2:], mode='bilinear', align_corners=True)
+        output_mask = pred.cpu().argmax(1).data.numpy().squeeze()
+        alpha = 0.65
+        image[output_mask == 0] = (image[output_mask == 0]*alpha).astype(np.uint8)
+        contours, _ = cv2.findContours(output_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+        cv2.drawContours(image, contours, -1, (0, 255, 0), 2)
+        return Image.fromarray(image)
+
+    submit.click(on_submit, inputs=[input_image, input_text], outputs=refseg_image)
+    examples = gr.Examples(examples=[["imgs/test_img2.jpg", "green plant"], ["imgs/test_img3.jpg", "chair"], ["imgs/test_img4.jpg", "left green plant"], ["imgs/test_img5.jpg", "man walking on foot"], ["imgs/test_img5.jpg", "the rightest camel"]],
+                           inputs=[input_image, input_text])
+
+
 def main():
     opt = TestOptions().initialize()
     args = opt.parse_args()
-    args.ckpt_dir = 'best_model_nyu.ckpt'
 
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model = EVPDepth(args=args, caption_aggregation=True)
-    cudnn.benchmark = True
     model.to(device)
-    model_weight = torch.load(args.ckpt_dir, map_location=device)['model']
-    if 'module' in next(iter(model_weight.items()))[0]:
-        model_weight = OrderedDict((k[7:], v) for k, v in model_weight.items())
+    model_weight = torch.load('best_model_nyu.ckpt', map_location=device)['model']
     model.load_state_dict(model_weight, strict=False)
     model.eval()
+    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+    model_refseg = EVPRefer()
+    model_refseg.to(device)
+    model_weight = torch.load('best_model_refcoco.pth', map_location=device)['model']
+    model_refseg.load_state_dict(model_weight, strict=False)
+    model_refseg.eval()
+
 
     title = "# EVP"
     description = """Official demo for **EVP: Enhanced Visual Perception using Inverse Multi-Attentive Feature
@@ -94,7 +132,9 @@ def main():
         gr.Markdown(title)
         gr.Markdown(description)
         with gr.Tab("Depth Prediction"):
-            create_demo(model, device)
+            create_depth_demo(model, device)
+        with gr.Tab("Referring Segmentation"):
+            create_refseg_demo(model_refseg, tokenizer, device)
         gr.HTML('''<br><br><br><center>You can duplicate this Space to skip the queue:<a href="https://huggingface.co/spaces/MykolaL/evp?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a><br>
         <p><img src="https://visitor-badge.glitch.me/badge?page_id=MykolaL/evp" alt="visitors"></p></center>''')
 
depth/imgs/test_img5.jpg ADDED
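Note: the prompt encoding that the new create_refseg_demo in app.py feeds to EVPRefer can be reproduced on its own. The snippet below is a minimal sketch, not part of the commit; it mirrors the tokenizer call in on_submit, uses one of the demo's example prompts, and prints the shape of the input_ids tensor that EVPRefer receives.

from transformers import CLIPTokenizer

# Same tokenizer that main() loads for the referring-segmentation tab.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

# Mirrors the call in on_submit(); "green plant" is one of the demo example prompts.
input_ids = tokenizer(text="green plant", truncation=True, max_length=40,
                      return_length=True, return_overflowing_tokens=False,
                      padding="max_length", return_tensors="pt")['input_ids']
print(input_ids.shape)  # torch.Size([1, 40]): one prompt, padded to 40 tokens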
refer/models_refer/model.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
+import os
 import sys
 from ldm.util import instantiate_from_config
 from transformers.models.clip.modeling_clip import CLIPTextModel
@@ -258,7 +258,11 @@ class EVPRefer(nn.Module):
                  **args):
         super().__init__()
         config = OmegaConf.load('./v1-inference.yaml')
-        config.model.params.ckpt_path = f'{sd_path}'
+        if os.path.exists(f'{sd_path}'):
+            config.model.params.ckpt_path = f'{sd_path}'
+        else:
+            config.model.params.ckpt_path = None
+
         sd_model = instantiate_from_config(config.model)
         self.encoder_vq = sd_model.first_stage_model
         self.unet = UNetWrapper(sd_model.model, base_size=base_size)
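Note on the EVPRefer change above: the new guard points the Stable Diffusion config at sd_path only when that checkpoint file actually exists; otherwise ckpt_path is cleared so instantiate_from_config can still build the model without trying to load missing weights. A standalone sketch of the same pattern follows, not part of the commit; the checkpoint filename is hypothetical.

import os
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

config = OmegaConf.load('./v1-inference.yaml')
sd_path = 'v1-5-pruned-emaonly.ckpt'  # hypothetical checkpoint location

# Point the config at the checkpoint only if it is present on disk; with
# ckpt_path=None the model is constructed without loading pretrained SD weights.
config.model.params.ckpt_path = sd_path if os.path.exists(sd_path) else None
sd_model = instantiate_from_config(config.model)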