liuyizhang committed
Commit
72fe59d
1 Parent(s): 1f8f331

update app.py

GroundingDINO/demo/inference_on_a_image.py CHANGED
@@ -143,7 +143,7 @@ if __name__ == "__main__":
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
- text_threshold = args.box_threshold
+ text_threshold = args.text_threshold

# make dir
os.makedirs(output_dir, exist_ok=True)
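This hunk fixes a copy-paste slip: the demo previously assigned args.box_threshold to text_threshold, so the --text_threshold flag was silently ignored. A minimal sketch of the argument wiring the fix assumes (flag names match the attributes read above; the parser name, defaults and help strings are illustrative, not taken from the script):

import argparse

parser = argparse.ArgumentParser("Grounding DINO inference example", add_help=True)
# assumed flag definitions; the real script defines its own defaults and help text
parser.add_argument("--box_threshold", type=float, default=0.3,
                    help="score threshold for keeping predicted boxes")
parser.add_argument("--text_threshold", type=float, default=0.25,
                    help="threshold for matching predicted tokens to the text prompt")
args = parser.parse_args()

box_threshold = args.box_threshold
text_threshold = args.text_threshold  # previously mis-read from args.box_threshold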
app.py CHANGED
@@ -1,10 +1,11 @@

- import subprocess, os, sys, time

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

- result = subprocess.run(['pip', 'install', '-e', 'GroundingDINO'], check=True)
- print(f'pip install GroundingDINO = {result}')

result = subprocess.run(['pip', 'list'], check=True)
print(f'pip list = {result}')
@@ -12,6 +13,7 @@ print(f'pip list = {result}')
sys.path.insert(0, './GroundingDINO')

if not os.path.exists('./sam_vit_h_4b8939.pth'):
result = subprocess.run(['wget', 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'], check=True)
print(f'wget sam_vit_h_4b8939.pth result = {result}')

@@ -19,10 +21,11 @@ import gradio as gr

import argparse
import copy

import numpy as np
import torch
- from PIL import Image, ImageDraw, ImageFont

# Grounding DINO
import GroundingDINO.groundingdino.datasets.transforms as T
@@ -31,12 +34,14 @@ from GroundingDINO.groundingdino.util import box_ops
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap

- # segment anything
- from segment_anything import build_sam, SamPredictor
import cv2
import numpy as np
import matplotlib.pyplot as plt


# diffusers
import PIL
@@ -108,8 +113,10 @@ def plot_boxes_to_image(image_pil, tgt):

def load_image(image_path):
# # load image
- # image_pil = Image.open(image_path).convert("RGB") # load image
- image_pil = image_path

transform = T.Compose(
[
@@ -181,6 +188,38 @@ def show_box(box, ax, label):
ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
ax.text(x0, y0, label)

config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filenmae = "groundingdino_swint_ogc.pth"
@@ -189,53 +228,157 @@ output_dir = "outputs"
device = "cuda"

device = get_device()
-
print(f'device={device}')

# initialize groundingdino model
groundingdino_model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)

# initialize SAM
sam_predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))

# initialize stable-diffusion-inpainting
- sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
- "runwayml/stable-diffusion-inpainting",
- torch_dtype=torch.float16
)
- sd_pipe = sd_pipe.to(device)

- def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold):
- assert text_prompt, 'text_prompt is not found!'

# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
- image_pil, image = load_image(image_path.convert("RGB"))

file_temp = int(time.time())

# visualize raw image
# image_pil.save(os.path.join(output_dir, f"raw_image_{file_temp}.jpg"))

# run grounding dino model
- groundingdino_device = 'cpu'
- if device != 'cpu':
- try:
- from groundingdino import _C
- groundingdino_device = 'cuda:0'
- except:
- warnings.warn("Failed to load custom C++ ops. Running on CPU mode Only in groundingdino!")
-
- groundingdino_device = 'cpu'
- boxes_filt, pred_phrases = get_grounding_output(
- groundingdino_model, image, text_prompt, box_threshold, text_threshold, device=groundingdino_device
- )

- size = image_pil.size

- if task_type == 'segment' or task_type == 'inpainting':
- image = np.array(image_path)
sam_predictor.set_image(image)

H, W = size[1], size[0]
@@ -253,25 +396,8 @@ def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_thr
boxes = transformed_boxes,
multimask_output = False,
)
-
- # masks: [1, 1, 512, 512]
-
- if task_type == 'detection':
- pred_dict = {
- "boxes": boxes_filt,
- "size": [size[1], size[0]], # H,W
- "labels": pred_phrases,
- }
- # import ipdb; ipdb.set_trace()
- image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
- image_path = os.path.join(output_dir, f"grounding_dino_output_{file_temp}.jpg")
- image_with_box.save(image_path)
- image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
- os.remove(image_path)
- return image_result
- elif task_type == 'segment':
assert sam_checkpoint, 'sam_checkpoint is not found!'
-
# draw output image
plt.figure(figsize=(10, 10))
plt.imshow(image)
@@ -282,39 +408,106 @@ def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_thr
plt.axis('off')
image_path = os.path.join(output_dir, f"grounding_seg_output_{file_temp}.jpg")
plt.savefig(image_path, bbox_inches="tight")
image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
os.remove(image_path)
- return image_result
- elif task_type == 'inpainting':
- assert inpaint_prompt, 'inpaint_prompt is not found!'
- # inpainting pipeline
- mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
- mask_pil = Image.fromarray(mask)
- image_pil = Image.fromarray(image)
- # image_inpainting = sd_pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
-
- # resize for inpaint
- image_source_for_inpaint = image_pil.resize((512, 512))
- image_mask_for_inpaint = mask_pil.resize((512, 512))
- image_inpainting = sd_pipe(prompt=inpaint_prompt, image=image_source_for_inpaint, mask_image=image_mask_for_inpaint).images[0]
image_inpainting = image_inpainting.resize((image_pil.size[0], image_pil.size[1]))

image_path = os.path.join(output_dir, f"grounded_sam_inpainting_output_{file_temp}.jpg")
image_inpainting.save(image_path)
image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
os.remove(image_path)
- return image_result
else:
- print("task_type:{} error!".format(task_type))
-
- def change_task_type(task_type):
if task_type == "inpainting":
- return gr.Textbox.update(visible=True)
- else:
- return gr.Textbox.update(visible=False)

if __name__ == "__main__":
-
parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
parser.add_argument("--debug", action="store_true", help="using debug mode")
parser.add_argument("--share", action="store_true", help="share the app")
@@ -326,11 +519,14 @@ if __name__ == "__main__":
with block:
with gr.Row():
with gr.Column():
- input_image = gr.Image(source='upload', type="pil")
- task_type = gr.Radio(["detection", "segment", "inpainting"], value="detection",
label='Task type',interactive=True, visible=True)
text_prompt = gr.Textbox(label="Detection Prompt", placeholder="Cannot be empty")
- inpaint_prompt = gr.Textbox(label="Inpaint Prompt", visible=True)
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
box_threshold = gr.Slider(
@@ -339,18 +535,28 @@ if __name__ == "__main__":
text_threshold = gr.Slider(
label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
)

with gr.Column():
- gallery = gr.outputs.Image(
- type="pil",
- ).style(full_width=True, full_height=True)

run_button.click(fn=run_grounded_sam, inputs=[
- input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold], outputs=[gallery])
- # task_type.change(fn=change_task_type, inputs=[task_type], outputs=[inpaint_prompt])

DESCRIPTION = '### This demo from [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything). Thanks for their excellent work.'
DESCRIPTION += f'<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/yizhangliu/Grounded-Segment-Anything?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
gr.Markdown(DESCRIPTION)

- block.launch(server_name='0.0.0.0', debug=args.debug, share=args.share)

+ import subprocess, io, os, sys, time

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

+ if os.environ.get('IS_MY_DEBUG') is None:
+ result = subprocess.run(['pip', 'install', '-e', 'GroundingDINO'], check=True)
+ print(f'pip install GroundingDINO = {result}')

result = subprocess.run(['pip', 'list'], check=True)
print(f'pip list = {result}')
sys.path.insert(0, './GroundingDINO')

if not os.path.exists('./sam_vit_h_4b8939.pth'):
+ logger.info(f"get sam_vit_h_4b8939.pth...")
result = subprocess.run(['wget', 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'], check=True)
print(f'wget sam_vit_h_4b8939.pth result = {result}')


import argparse
import copy
+ from loguru import logger

import numpy as np
import torch
+ from PIL import Image, ImageDraw, ImageFont, ImageOps

# Grounding DINO
import GroundingDINO.groundingdino.datasets.transforms as T
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap

import cv2
import numpy as np
import matplotlib.pyplot as plt
+ from lama_cleaner.model_manager import ModelManager
+ from lama_cleaner.schema import Config

+ # segment anything
+ from segment_anything import build_sam, SamPredictor

# diffusers
import PIL

def load_image(image_path):
# # load image
+ if isinstance(image_path, PIL.Image.Image):
+ image_pil = image_path
+ else:
+ image_pil = Image.open(image_path).convert("RGB") # load image

transform = T.Compose(
[
ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
ax.text(x0, y0, label)

+ def xywh_to_xyxy(box, sizeW, sizeH):
+ if isinstance(box, list):
+ box = torch.Tensor(box)
+ box = box * torch.Tensor([sizeW, sizeH, sizeW, sizeH])
+ box[:2] -= box[2:] / 2
+ box[2:] += box[:2]
+ box = box.numpy()
+ return box
+
+ def mask_extend(img, box, extend_pixels=10, useRectangle=True):
+ box[0] = int(box[0])
+ box[1] = int(box[1])
+ box[2] = int(box[2])
+ box[3] = int(box[3])
+ region = img.crop(tuple(box))
+ new_width = box[2] - box[0] + 2*extend_pixels
+ new_height = box[3] - box[1] + 2*extend_pixels
+
+ region_BILINEAR = region.resize((int(new_width), int(new_height)))
+ if useRectangle:
+ region_draw = ImageDraw.Draw(region_BILINEAR)
+ region_draw.rectangle((0, 0, new_width, new_height), fill=(255, 255, 255))
+ img.paste(region_BILINEAR, (int(box[0]-extend_pixels), int(box[1]-extend_pixels)))
+ return img
+
+ def mix_masks(imgs):
+ re_img = 1 - np.asarray(imgs[0].convert("1"))
+ for i in range(len(imgs)-1):
+ re_img = np.multiply(re_img, 1 - np.asarray(imgs[i+1].convert("1")))
+ re_img = 1 - re_img
+ return Image.fromarray(np.uint8(255*re_img))
+
config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filenmae = "groundingdino_swint_ogc.pth"
device = "cuda"

device = get_device()
print(f'device={device}')

# initialize groundingdino model
+ logger.info(f"initialize groundingdino model...")
groundingdino_model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)

# initialize SAM
+ logger.info(f"initialize SAM model...")
sam_predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))

# initialize stable-diffusion-inpainting
+ logger.info(f"initialize stable-diffusion-inpainting...")
+ sd_pipe = None
+ if os.environ.get('IS_MY_DEBUG') is None:
+ sd_pipe = StableDiffusionInpaintPipeline.from_pretrained(
+ "runwayml/stable-diffusion-inpainting",
+ torch_dtype=torch.float16
+ )
+ sd_pipe = sd_pipe.to(device)
+
+ # initialize lama_cleaner
+ logger.info(f"initialize lama_cleaner...")
+ from lama_cleaner.helper import (
+ load_img,
+ numpy_to_bytes,
+ resize_max_size,
)

+ lama_cleaner_model = ModelManager(
+ name='lama',
+ device=device,
+ )
+
+ def lama_cleaner_process(image, mask):
+ ori_image = image
+ if mask.shape[0] == image.shape[1] and mask.shape[1] == image.shape[0] and mask.shape[0] != mask.shape[1]:
+ # rotate image
+ ori_image = np.transpose(image[::-1, ...][:, ::-1], axes=(1, 0, 2))[::-1, ...]
+ image = ori_image
+
+ original_shape = ori_image.shape
+ interpolation = cv2.INTER_CUBIC
+
+ size_limit = 1080
+ if size_limit == "Original":
+ size_limit = max(image.shape)
+ else:
+ size_limit = int(size_limit)
+
+ config = Config(
+ ldm_steps=25,
+ ldm_sampler='plms',
+ zits_wireframe=True,
+ hd_strategy='Original',
+ hd_strategy_crop_margin=196,
+ hd_strategy_crop_trigger_size=1280,
+ hd_strategy_resize_limit=2048,
+ prompt='',
+ use_croper=False,
+ croper_x=0,
+ croper_y=0,
+ croper_height=512,
+ croper_width=512,
+ sd_mask_blur=5,
+ sd_strength=0.75,
+ sd_steps=50,
+ sd_guidance_scale=7.5,
+ sd_sampler='ddim',
+ sd_seed=42,
+ cv2_flag='INPAINT_NS',
+ cv2_radius=5,
+ )
+
+ if config.sd_seed == -1:
+ config.sd_seed = random.randint(1, 999999999)
+
+ # logger.info(f"Origin image shape_0_: {original_shape} / {size_limit}")
+ image = resize_max_size(image, size_limit=size_limit, interpolation=interpolation)
+ # logger.info(f"Resized image shape_1_: {image.shape}")
+
+ # logger.info(f"mask image shape_0_: {mask.shape} / {type(mask)}")
+ mask = resize_max_size(mask, size_limit=size_limit, interpolation=interpolation)
+ # logger.info(f"mask image shape_1_: {mask.shape} / {type(mask)}")
+
+ res_np_img = lama_cleaner_model(image, mask, config)
+ torch.cuda.empty_cache()
+
+ image = Image.open(io.BytesIO(numpy_to_bytes(res_np_img, 'png')))
+ return image
+
+ mask_source_draw = "draw a mask on input image"
+ mask_source_segment = "type what to detect below"
+
+ def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold,
+ iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend):
+ if (task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_draw:
+ pass
+ else:
+ assert text_prompt, 'text_prompt is not found!'
+
+ logger.info(f'run_grounded_sam_1_')

# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
+ input_mask_pil = input_image['mask']
+ input_mask = np.array(input_mask_pil.convert("L"))
+
+ image_pil, image = load_image(input_image['image'].convert("RGB"))

file_temp = int(time.time())

# visualize raw image
# image_pil.save(os.path.join(output_dir, f"raw_image_{file_temp}.jpg"))
+
+ size = image_pil.size

+ output_images = []
# run grounding dino model
+ if (task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_draw:
+ pass
+ else:
+ groundingdino_device = 'cpu'
+ if device != 'cpu':
+ try:
+ from groundingdino import _C
+ groundingdino_device = 'cuda:0'
+ except:
+ warnings.warn("Failed to load custom C++ ops. Running on CPU mode Only in groundingdino!")
+
+ groundingdino_device = 'cpu'
+ boxes_filt, pred_phrases = get_grounding_output(
+ groundingdino_model, image, text_prompt, box_threshold, text_threshold, device=groundingdino_device
+ )
+ boxes_filt_ori = copy.deepcopy(boxes_filt)

+ pred_dict = {
+ "boxes": boxes_filt,
+ "size": [size[1], size[0]], # H,W
+ "labels": pred_phrases,
+ }
+ image_with_box = plot_boxes_to_image(copy.deepcopy(image_pil), pred_dict)[0]
+ image_path = os.path.join(output_dir, f"grounding_dino_output_{file_temp}.jpg")
+ image_with_box.save(image_path)
+ detection_image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
+ os.remove(image_path)
+ output_images.append(detection_image_result)

+ logger.info(f'run_grounded_sam_2_')
+ if task_type == 'segment' or ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_segment):
+ image = np.array(input_image['image'])
sam_predictor.set_image(image)

H, W = size[1], size[0]
boxes = transformed_boxes,
multimask_output = False,
)
+ # masks: [9, 1, 512, 512]
assert sam_checkpoint, 'sam_checkpoint is not found!'
# draw output image
plt.figure(figsize=(10, 10))
plt.imshow(image)
plt.axis('off')
image_path = os.path.join(output_dir, f"grounding_seg_output_{file_temp}.jpg")
plt.savefig(image_path, bbox_inches="tight")
+ segment_image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
+ os.remove(image_path)
+ output_images.append(segment_image_result)
+
+ logger.info(f'run_grounded_sam_3_')
+ if task_type == 'detection' or task_type == 'segment':
+ logger.info(f'run_grounded_sam_9_{task_type}_')
+ return output_images
+ elif task_type == 'inpainting' or task_type == 'remove':
+ if inpaint_prompt.strip() == '' and mask_source_radio == mask_source_segment:
+ task_type = 'remove'
+
+ logger.info(f'run_grounded_sam_4_{task_type}_')
+ if mask_source_radio == mask_source_draw:
+ mask_pil = input_mask_pil
+ mask = input_mask
+ else:
+ if inpaint_mode == 'merge':
+ masks = torch.sum(masks, dim=0).unsqueeze(0)
+ masks = torch.where(masks > 0, True, False)
+ mask = masks[0][0].cpu().numpy()
+ mask_pil = Image.fromarray(mask)
+
+ image_path = os.path.join(output_dir, f"image_mask_{file_temp}.jpg")
+ mask_pil.convert("RGB").save(image_path)
image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
os.remove(image_path)
+ output_images.append(image_result)
+
+ if task_type == 'inpainting':
+ # inpainting pipeline
+ image_source_for_inpaint = image_pil.resize((512, 512))
+ image_mask_for_inpaint = mask_pil.resize((512, 512))
+ image_inpainting = sd_pipe(prompt=inpaint_prompt, image=image_source_for_inpaint, mask_image=image_mask_for_inpaint).images[0]
+ else:
+ # remove from mask
+ if mask_source_radio == mask_source_segment:
+ mask_imgs = []
+ masks_shape = masks.shape
+ boxes_filt_ori_array = boxes_filt_ori.numpy()
+ if inpaint_mode == 'merge':
+ extend_shape_0 = masks_shape[0]
+ extend_shape_1 = masks_shape[1]
+ else:
+ extend_shape_0 = 1
+ extend_shape_1 = 1
+ for i in range(extend_shape_0):
+ for j in range(extend_shape_1):
+ mask = masks[i][j].cpu().numpy()
+ mask_pil = Image.fromarray(mask)
+
+ if remove_mode == 'segment':
+ useRectangle = False
+ else:
+ useRectangle = True
+
+ try:
+ remove_mask_extend = int(remove_mask_extend)
+ except:
+ remove_mask_extend = 10
+ mask_pil_exp = mask_extend(copy.deepcopy(mask_pil).convert("RGB"),
+ xywh_to_xyxy(torch.tensor(boxes_filt_ori_array[i]), size[0], size[1]),
+ extend_pixels=remove_mask_extend, useRectangle=useRectangle)
+ mask_imgs.append(mask_pil_exp)
+ mask_pil = mix_masks(mask_imgs)
+
+ image_path = os.path.join(output_dir, f"image_mask_{file_temp}.jpg")
+ mask_pil.convert("RGB").save(image_path)
+ image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
+ os.remove(image_path)
+ output_images.append(image_result)
+ image_inpainting = lama_cleaner_process(np.array(image_pil), np.array(mask_pil.convert("L")))
+
image_inpainting = image_inpainting.resize((image_pil.size[0], image_pil.size[1]))

image_path = os.path.join(output_dir, f"grounded_sam_inpainting_output_{file_temp}.jpg")
image_inpainting.save(image_path)
image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
os.remove(image_path)
+ logger.info(f'run_grounded_sam_9_{task_type}_')
+ output_images.append(image_result)
+ return output_images
else:
+ logger.info(f"task_type:{task_type} error!")
+ logger.info(f'run_grounded_sam_9_9_')
+ return output_images
+
+ def change_radio_display(task_type, mask_source_radio):
+ text_prompt_visible = True
+ inpaint_prompt_visible = False
+ mask_source_radio_visible = False
if task_type == "inpainting":
+ inpaint_prompt_visible = True
+ if task_type == "inpainting" or task_type == "remove":
+ mask_source_radio_visible = True
+ if mask_source_radio == mask_source_draw:
+ text_prompt_visible = False
+ return gr.Textbox.update(visible=text_prompt_visible), gr.Textbox.update(visible=inpaint_prompt_visible), gr.Radio.update(visible=mask_source_radio_visible)

if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
parser.add_argument("--debug", action="store_true", help="using debug mode")
parser.add_argument("--share", action="store_true", help="share the app")
with block:
with gr.Row():
with gr.Column():
+ input_image = gr.Image(source='upload', elem_id="image_upload", tool='sketch', type='pil', label="Upload")
+ task_type = gr.Radio(["detection", "segment", "inpainting", "remove"], value="detection",
label='Task type',interactive=True, visible=True)
+ mask_source_radio = gr.Radio([mask_source_draw, mask_source_segment],
+ value=mask_source_segment, label="Mask from",
+ interactive=True, visible=False)
text_prompt = gr.Textbox(label="Detection Prompt", placeholder="Cannot be empty")
+ inpaint_prompt = gr.Textbox(label="Inpaint Prompt (if this is empty, then remove)", visible=False)
run_button = gr.Button(label="Run")
with gr.Accordion("Advanced options", open=False):
box_threshold = gr.Slider(
text_threshold = gr.Slider(
label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
)
+ iou_threshold = gr.Slider(
+ label="IOU Threshold", minimum=0.0, maximum=1.0, value=0.5, step=0.001
+ )
+ inpaint_mode = gr.Radio(["merge", "first"], value="merge", label="inpaint_mode")
+ with gr.Row():
+ with gr.Column(scale=1):
+ remove_mode = gr.Radio(["segment", "rectangle"], value="segment", label='remove mode')
+ with gr.Column(scale=1):
+ remove_mask_extend = gr.Textbox(label="remove_mask_extend", value='10')

with gr.Column():
+ gallery = gr.Gallery(
+ label="Generated images", show_label=False, elem_id="gallery"
+ ).style(grid=[2], full_width=True, full_height=True)

run_button.click(fn=run_grounded_sam, inputs=[
+ input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend], outputs=[gallery])
+ task_type.change(fn=change_radio_display, inputs=[task_type, mask_source_radio], outputs=[text_prompt, inpaint_prompt, mask_source_radio])
+ mask_source_radio.change(fn=change_radio_display, inputs=[task_type, mask_source_radio], outputs=[text_prompt, inpaint_prompt, mask_source_radio])

DESCRIPTION = '### This demo from [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything). Thanks for their excellent work.'
DESCRIPTION += f'<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/yizhangliu/Grounded-Segment-Anything?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
gr.Markdown(DESCRIPTION)

+ block.launch(server_name='0.0.0.0', debug=args.debug, share=args.share)
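The reworked run_grounded_sam above returns a gallery of intermediate results and adds a "remove" task: GroundingDINO boxes are converted to pixel-space corners, the per-box SAM masks are optionally dilated, merged into a single mask, and handed to lama-cleaner. A standalone sketch of the box conversion and mask union those helpers perform (it mirrors xywh_to_xyxy and mix_masks from the listing above; the toy masks and box values below are illustrative, not taken from the Space):

import numpy as np
import torch
from PIL import Image

def xywh_to_xyxy(box, sizeW, sizeH):
    # normalized (cx, cy, w, h) -> absolute (x0, y0, x1, y1), as in the helper above
    box = torch.as_tensor(box, dtype=torch.float32) * torch.tensor([sizeW, sizeH, sizeW, sizeH])
    box[:2] -= box[2:] / 2
    box[2:] += box[:2]
    return box.numpy()

def mix_masks(imgs):
    # union of binary PIL masks: complement each, intersect, complement back
    inv = 1 - np.asarray(imgs[0].convert("1")).astype(np.uint8)
    for img in imgs[1:]:
        inv = inv * (1 - np.asarray(img.convert("1")).astype(np.uint8))
    return Image.fromarray(np.uint8(255 * (1 - inv)))

# toy example: two 100x100 masks covering opposite quadrants
a = Image.fromarray(np.uint8(np.pad(np.full((50, 50), 255), ((0, 50), (0, 50)))))
b = Image.fromarray(np.uint8(np.pad(np.full((50, 50), 255), ((50, 0), (50, 0)))))
merged = mix_masks([a, b])  # white wherever either input mask was white
print(xywh_to_xyxy([0.5, 0.5, 0.2, 0.4], sizeW=100, sizeH=100))  # [40. 30. 60. 70.]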
automatic_label_demo.py CHANGED
@@ -224,7 +224,7 @@ if __name__ == "__main__":
openai_proxy = args.openai_proxy
output_dir = args.output_dir
box_threshold = args.box_threshold
- text_threshold = args.box_threshold
+ text_threshold = args.text_threshold
iou_threshold = args.iou_threshold
device = args.device

@@ -264,7 +266,9 @@ if __name__ == "__main__":
)

# initialize SAM
- predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
+ sam = build_sam(checkpoint=sam_checkpoint)
+ sam.to(device=device)
+ predictor = SamPredictor(sam)
image = cv2.imread(image_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
predictor.set_image(image)
@@ -286,7 +288,7 @@ if __name__ == "__main__":
caption = check_caption(caption, pred_phrases)
print(f"Revise caption with number: {caption}")

- transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
+ transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(device)

masks, _, _ = predictor.predict_torch(
point_coords = None,
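Both hunks here (and the matching notebook edit further below) are device-placement fixes: the same change also corrects text_threshold to read its own flag, and SAM is now moved to the target device before being wrapped in SamPredictor, with the box prompts moved to that same device before predict_torch runs. A self-contained sketch of the pattern, using a stand-in image and one hand-written box (the checkpoint path, image size and box values are placeholders):

import numpy as np
import torch
from segment_anything import build_sam, SamPredictor

device = "cuda" if torch.cuda.is_available() else "cpu"

# build SAM once, move the weights to the device, then wrap it in a predictor
sam = build_sam(checkpoint="sam_vit_h_4b8939.pth")  # placeholder checkpoint path
sam.to(device=device)
predictor = SamPredictor(sam)

image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)   # stand-in HxWx3 RGB image
boxes_filt = torch.tensor([[100.0, 80.0, 300.0, 260.0]])           # one xyxy box prompt

predictor.set_image(image)
# prompts must live on the same device as the model, hence the trailing .to(device)
transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(device)
masks, _, _ = predictor.predict_torch(
    point_coords=None,
    point_labels=None,
    boxes=transformed_boxes,
    multimask_output=False,
)
print(masks.shape)  # torch.Size([1, 1, 480, 640]); one mask per box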
gradio_app.py DELETED
@@ -1,345 +0,0 @@
1
- import os
2
- # os.system('pip install v0.1.0-alpha2.tar.gz')
3
- import gradio as gr
4
-
5
- import argparse
6
- import copy
7
-
8
- import numpy as np
9
- import torch
10
- import torchvision
11
- from PIL import Image, ImageDraw, ImageFont
12
-
13
- # Grounding DINO
14
- import GroundingDINO.groundingdino.datasets.transforms as T
15
- from GroundingDINO.groundingdino.models import build_model
16
- from GroundingDINO.groundingdino.util import box_ops
17
- from GroundingDINO.groundingdino.util.slconfig import SLConfig
18
- from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
19
-
20
- # segment anything
21
- from segment_anything import build_sam, SamPredictor
22
- import cv2
23
- import numpy as np
24
- import matplotlib.pyplot as plt
25
-
26
-
27
- # diffusers
28
- import PIL
29
- import requests
30
- import torch
31
- from io import BytesIO
32
- from diffusers import StableDiffusionInpaintPipeline
33
- from huggingface_hub import hf_hub_download
34
-
35
- # BLIP
36
- from transformers import BlipProcessor, BlipForConditionalGeneration
37
-
38
-
39
- def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
40
- args = SLConfig.fromfile(model_config_path)
41
- model = build_model(args)
42
- args.device = device
43
-
44
- cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
45
- checkpoint = torch.load(cache_file, map_location='cpu')
46
- log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
47
- print("Model loaded from {} \n => {}".format(cache_file, log))
48
- _ = model.eval()
49
- return model
50
-
51
- def generate_caption(processor, blip_model, raw_image):
52
- # unconditional image captioning
53
- inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
54
- out = blip_model.generate(**inputs)
55
- caption = processor.decode(out[0], skip_special_tokens=True)
56
- return caption
57
-
58
- def plot_boxes_to_image(image_pil, tgt):
59
- H, W = tgt["size"]
60
- boxes = tgt["boxes"]
61
- labels = tgt["labels"]
62
- assert len(boxes) == len(labels), "boxes and labels must have same length"
63
-
64
- draw = ImageDraw.Draw(image_pil)
65
- mask = Image.new("L", image_pil.size, 0)
66
- mask_draw = ImageDraw.Draw(mask)
67
-
68
- # draw boxes and masks
69
- for box, label in zip(boxes, labels):
70
- # from 0..1 to 0..W, 0..H
71
- box = box * torch.Tensor([W, H, W, H])
72
- # from xywh to xyxy
73
- box[:2] -= box[2:] / 2
74
- box[2:] += box[:2]
75
- # random color
76
- color = tuple(np.random.randint(0, 255, size=3).tolist())
77
- # draw
78
- x0, y0, x1, y1 = box
79
- x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
80
-
81
- draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
82
- # draw.text((x0, y0), str(label), fill=color)
83
-
84
- font = ImageFont.load_default()
85
- if hasattr(font, "getbbox"):
86
- bbox = draw.textbbox((x0, y0), str(label), font)
87
- else:
88
- w, h = draw.textsize(str(label), font)
89
- bbox = (x0, y0, w + x0, y0 + h)
90
- # bbox = draw.textbbox((x0, y0), str(label))
91
- draw.rectangle(bbox, fill=color)
92
- draw.text((x0, y0), str(label), fill="white")
93
-
94
- mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
95
-
96
- return image_pil, mask
97
-
98
- def load_image(image_path):
99
- # # load image
100
- # image_pil = Image.open(image_path).convert("RGB") # load image
101
- image_pil = image_path
102
-
103
- transform = T.Compose(
104
- [
105
- T.RandomResize([800], max_size=1333),
106
- T.ToTensor(),
107
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
108
- ]
109
- )
110
- image, _ = transform(image_pil, None) # 3, h, w
111
- return image_pil, image
112
-
113
-
114
- def load_model(model_config_path, model_checkpoint_path, device):
115
- args = SLConfig.fromfile(model_config_path)
116
- args.device = device
117
- model = build_model(args)
118
- checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
119
- load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
120
- print(load_res)
121
- _ = model.eval()
122
- return model
123
-
124
-
125
- def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
126
- caption = caption.lower()
127
- caption = caption.strip()
128
- if not caption.endswith("."):
129
- caption = caption + "."
130
- model = model.to(device)
131
- image = image.to(device)
132
- with torch.no_grad():
133
- outputs = model(image[None], captions=[caption])
134
- logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
135
- boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
136
- logits.shape[0]
137
-
138
- # filter output
139
- logits_filt = logits.clone()
140
- boxes_filt = boxes.clone()
141
- filt_mask = logits_filt.max(dim=1)[0] > box_threshold
142
- logits_filt = logits_filt[filt_mask] # num_filt, 256
143
- boxes_filt = boxes_filt[filt_mask] # num_filt, 4
144
- logits_filt.shape[0]
145
-
146
- # get phrase
147
- tokenlizer = model.tokenizer
148
- tokenized = tokenlizer(caption)
149
- # build pred
150
- pred_phrases = []
151
- scores = []
152
- for logit, box in zip(logits_filt, boxes_filt):
153
- pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
154
- if with_logits:
155
- pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
156
- else:
157
- pred_phrases.append(pred_phrase)
158
- scores.append(logit.max().item())
159
-
160
- return boxes_filt, torch.Tensor(scores), pred_phrases
161
-
162
- def show_mask(mask, ax, random_color=False):
163
- if random_color:
164
- color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
165
- else:
166
- color = np.array([30/255, 144/255, 255/255, 0.6])
167
- h, w = mask.shape[-2:]
168
- mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
169
- ax.imshow(mask_image)
170
-
171
-
172
- def show_box(box, ax, label):
173
- x0, y0 = box[0], box[1]
174
- w, h = box[2] - box[0], box[3] - box[1]
175
- ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
176
- ax.text(x0, y0, label)
177
-
178
-
179
- config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
180
- ckpt_repo_id = "ShilongLiu/GroundingDINO"
181
- ckpt_filenmae = "groundingdino_swint_ogc.pth"
182
- sam_checkpoint='sam_vit_h_4b8939.pth'
183
- output_dir="outputs"
184
- device="cuda"
185
-
186
- def run_grounded_sam(image_path, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode):
187
-
188
- # make dir
189
- os.makedirs(output_dir, exist_ok=True)
190
- # load image
191
- image_pil, image = load_image(image_path.convert("RGB"))
192
- # load model
193
- model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
194
- # model = load_model(config_file, ckpt_filenmae, device=device)
195
-
196
- # visualize raw image
197
- image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
198
-
199
- if task_type == 'automatic':
200
- # generate caption and tags
201
- # use Tag2Text can generate better captions
202
- # https://huggingface.co/spaces/xinyu1205/Tag2Text
203
- # but there are some bugs...
204
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
205
- blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to("cuda")
206
- text_prompt = generate_caption(processor, blip_model, image_pil)
207
- print(f"Caption: {text_prompt}")
208
-
209
- # run grounding dino model
210
- boxes_filt, scores, pred_phrases = get_grounding_output(
211
- model, image, text_prompt, box_threshold, text_threshold, device=device
212
- )
213
-
214
- size = image_pil.size
215
-
216
- if task_type == 'seg' or task_type == 'inpainting' or task_type == 'automatic':
217
- # initialize SAM
218
- predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
219
- image = np.array(image_path)
220
- predictor.set_image(image)
221
-
222
- H, W = size[1], size[0]
223
- for i in range(boxes_filt.size(0)):
224
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
225
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
226
- boxes_filt[i][2:] += boxes_filt[i][:2]
227
-
228
- boxes_filt = boxes_filt.cpu()
229
-
230
- if task_type == 'automatic':
231
- # use NMS to handle overlapped boxes
232
- print(f"Before NMS: {boxes_filt.shape[0]} boxes")
233
- nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
234
- boxes_filt = boxes_filt[nms_idx]
235
- pred_phrases = [pred_phrases[idx] for idx in nms_idx]
236
- print(f"After NMS: {boxes_filt.shape[0]} boxes")
237
- print(f"Revise caption with number: {text_prompt}")
238
-
239
- transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
240
-
241
- masks, _, _ = predictor.predict_torch(
242
- point_coords = None,
243
- point_labels = None,
244
- boxes = transformed_boxes,
245
- multimask_output = False,
246
- )
247
-
248
- # masks: [1, 1, 512, 512]
249
-
250
- if task_type == 'det':
251
- pred_dict = {
252
- "boxes": boxes_filt,
253
- "size": [size[1], size[0]], # H,W
254
- "labels": pred_phrases,
255
- }
256
- # import ipdb; ipdb.set_trace()
257
- image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
258
- image_path = os.path.join(output_dir, "grounding_dino_output.jpg")
259
- image_with_box.save(image_path)
260
- image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
261
- return image_result
262
- elif task_type == 'seg' or task_type == 'automatic':
263
- assert sam_checkpoint, 'sam_checkpoint is not found!'
264
-
265
- # draw output image
266
- plt.figure(figsize=(10, 10))
267
- plt.imshow(image)
268
- for mask in masks:
269
- show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
270
- for box, label in zip(boxes_filt, pred_phrases):
271
- show_box(box.numpy(), plt.gca(), label)
272
- if task_type == 'automatic':
273
- plt.title(text_prompt)
274
- plt.axis('off')
275
- image_path = os.path.join(output_dir, "grounding_dino_output.jpg")
276
- plt.savefig(image_path, bbox_inches="tight")
277
- image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
278
- return image_result
279
- elif task_type == 'inpainting':
280
- assert inpaint_prompt, 'inpaint_prompt is not found!'
281
- # inpainting pipeline
282
- if inpaint_mode == 'merge':
283
- masks = torch.sum(masks, dim=0).unsqueeze(0)
284
- masks = torch.where(masks > 0, True, False)
285
- else:
286
- mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
287
- mask_pil = Image.fromarray(mask)
288
-
289
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
290
- "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
291
- )
292
- pipe = pipe.to("cuda")
293
-
294
- image_pil = image_pil.resize((512, 512))
295
- mask_pil = mask_pil.resize((512, 512))
296
-
297
- image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
298
- image = image.resize(size)
299
-
300
- image_path = os.path.join(output_dir, "grounded_sam_inpainting_output.jpg")
301
- image.save(image_path)
302
- image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
303
- return image_result
304
- else:
305
- print("task_type:{} error!".format(task_type))
306
-
307
- if __name__ == "__main__":
308
-
309
- parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
310
- parser.add_argument("--debug", action="store_true", help="using debug mode")
311
- parser.add_argument("--share", action="store_true", help="share the app")
312
- parser.add_argument('--port', type=int, default=7589, help='port to run the server')
313
- args = parser.parse_args()
314
-
315
- block = gr.Blocks().queue()
316
- with block:
317
- with gr.Row():
318
- with gr.Column():
319
- input_image = gr.Image(source='upload', type="pil", value="assets/demo1.jpg")
320
- task_type = gr.Dropdown(["det", "seg", "inpainting", "automatic"], value="automatic", label="task_type")
321
- text_prompt = gr.Textbox(label="Text Prompt")
322
- inpaint_prompt = gr.Textbox(label="Inpaint Prompt")
323
- run_button = gr.Button(label="Run")
324
- with gr.Accordion("Advanced options", open=False):
325
- box_threshold = gr.Slider(
326
- label="Box Threshold", minimum=0.0, maximum=1.0, value=0.3, step=0.001
327
- )
328
- text_threshold = gr.Slider(
329
- label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
330
- )
331
- iou_threshold = gr.Slider(
332
- label="IOU Threshold", minimum=0.0, maximum=1.0, value=0.5, step=0.001
333
- )
334
- inpaint_mode = gr.Dropdown(["merge", "first"], value="merge", label="inpaint_mode")
335
-
336
- with gr.Column():
337
- gallery = gr.outputs.Image(
338
- type="pil",
339
- ).style(full_width=True, full_height=True)
340
-
341
- run_button.click(fn=run_grounded_sam, inputs=[
342
- input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode], outputs=[gallery])
343
-
344
-
345
- block.launch(server_name='0.0.0.0', server_port=args.port, debug=args.debug, share=args.share)
gradio_auto_label.py DELETED
@@ -1,392 +0,0 @@
1
- import gradio as gr
2
- import json
3
- import argparse
4
- import os
5
- import copy
6
-
7
- import numpy as np
8
- import torch
9
- import torchvision
10
- from PIL import Image, ImageDraw, ImageFont
11
- import openai
12
- # Grounding DINO
13
- import GroundingDINO.groundingdino.datasets.transforms as T
14
- from GroundingDINO.groundingdino.models import build_model
15
- from GroundingDINO.groundingdino.util import box_ops
16
- from GroundingDINO.groundingdino.util.slconfig import SLConfig
17
- from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
18
- from transformers import BlipProcessor, BlipForConditionalGeneration
19
- # segment anything
20
- from segment_anything import build_sam, SamPredictor
21
- from segment_anything.utils.amg import remove_small_regions
22
- import cv2
23
- import numpy as np
24
- import matplotlib.pyplot as plt
25
-
26
-
27
- # diffusers
28
- import PIL
29
- import requests
30
- import torch
31
- from io import BytesIO
32
- from huggingface_hub import hf_hub_download
33
- from sys import platform
34
-
35
- #macos
36
- if platform == 'darwin':
37
- import matplotlib
38
- matplotlib.use('agg')
39
-
40
- def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
41
- args = SLConfig.fromfile(model_config_path)
42
- model = build_model(args)
43
- args.device = device
44
-
45
- cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
46
- checkpoint = torch.load(cache_file, map_location='cpu')
47
- log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
48
- print("Model loaded from {} \n => {}".format(cache_file, log))
49
- _ = model.eval()
50
- return model
51
-
52
- def plot_boxes_to_image(image_pil, tgt):
53
- H, W = tgt["size"]
54
- boxes = tgt["boxes"]
55
- labels = tgt["labels"]
56
- assert len(boxes) == len(labels), "boxes and labels must have same length"
57
-
58
- draw = ImageDraw.Draw(image_pil)
59
- mask = Image.new("L", image_pil.size, 0)
60
- mask_draw = ImageDraw.Draw(mask)
61
-
62
- # draw boxes and masks
63
- for box, label in zip(boxes, labels):
64
- # from 0..1 to 0..W, 0..H
65
- box = box * torch.Tensor([W, H, W, H])
66
- # from xywh to xyxy
67
- box[:2] -= box[2:] / 2
68
- box[2:] += box[:2]
69
- # random color
70
- color = tuple(np.random.randint(0, 255, size=3).tolist())
71
- # draw
72
- x0, y0, x1, y1 = box
73
- x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
74
-
75
- draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
76
- # draw.text((x0, y0), str(label), fill=color)
77
-
78
- font = ImageFont.load_default()
79
- if hasattr(font, "getbbox"):
80
- bbox = draw.textbbox((x0, y0), str(label), font)
81
- else:
82
- w, h = draw.textsize(str(label), font)
83
- bbox = (x0, y0, w + x0, y0 + h)
84
- # bbox = draw.textbbox((x0, y0), str(label))
85
- draw.rectangle(bbox, fill=color)
86
- draw.text((x0, y0), str(label), fill="white")
87
-
88
- mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
89
-
90
- return image_pil, mask
91
-
92
- def load_image(image_path):
93
- # # load image
94
- # image_pil = Image.open(image_path).convert("RGB") # load image
95
- image_pil = image_path
96
-
97
- transform = T.Compose(
98
- [
99
- T.RandomResize([800], max_size=1333),
100
- T.ToTensor(),
101
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
102
- ]
103
- )
104
- image, _ = transform(image_pil, None) # 3, h, w
105
- return image_pil, image
106
-
107
-
108
- def load_model(model_config_path, model_checkpoint_path, device):
109
- args = SLConfig.fromfile(model_config_path)
110
- args.device = device
111
- model = build_model(args)
112
- checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
113
- load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
114
- _ = model.eval()
115
- return model
116
-
117
-
118
- def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
119
- caption = caption.lower()
120
- caption = caption.strip()
121
- if not caption.endswith("."):
122
- caption = caption + "."
123
- model = model.to(device)
124
- image = image.to(device)
125
- with torch.no_grad():
126
- outputs = model(image[None], captions=[caption])
127
- logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
128
- boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
129
- logits.shape[0]
130
-
131
- # filter output
132
- logits_filt = logits.clone()
133
- boxes_filt = boxes.clone()
134
- filt_mask = logits_filt.max(dim=1)[0] > box_threshold
135
- logits_filt = logits_filt[filt_mask] # num_filt, 256
136
- boxes_filt = boxes_filt[filt_mask] # num_filt, 4
137
- logits_filt.shape[0]
138
-
139
- # get phrase
140
- tokenlizer = model.tokenizer
141
- tokenized = tokenlizer(caption)
142
- # build pred
143
- pred_phrases = []
144
- scores = []
145
- for logit, box in zip(logits_filt, boxes_filt):
146
- pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
147
- if with_logits:
148
- pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
149
- else:
150
- pred_phrases.append(pred_phrase)
151
- scores.append(logit.max().item())
152
-
153
- return boxes_filt, torch.Tensor(scores), pred_phrases
154
-
155
- def show_mask(mask, ax, random_color=False):
156
- if random_color:
157
- color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
158
- else:
159
- color = np.array([30/255, 144/255, 255/255, 0.6])
160
- h, w = mask.shape[-2:]
161
- mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
162
- ax.imshow(mask_image)
163
-
164
- def save_mask_data(output_dir, mask_list, box_list, label_list):
165
- value = 0 # 0 for background
166
-
167
- mask_img = torch.zeros(mask_list.shape[-2:])
168
- for idx, mask in enumerate(mask_list):
169
- mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1
170
- plt.figure(figsize=(10, 10))
171
- plt.imshow(mask_img.numpy())
172
- plt.axis('off')
173
- mask_img_path = os.path.join(output_dir, 'mask.jpg')
174
- plt.savefig(mask_img_path, bbox_inches="tight", dpi=300, pad_inches=0.0)
175
-
176
- json_data = [{
177
- 'value': value,
178
- 'label': 'background'
179
- }]
180
- for label, box in zip(label_list, box_list):
181
- value += 1
182
- name, logit = label.split('(')
183
- logit = logit[:-1] # the last is ')'
184
- json_data.append({
185
- 'value': value,
186
- 'label': name,
187
- 'logit': float(logit),
188
- 'box': box.numpy().tolist(),
189
- })
190
-
191
- mask_json_path = os.path.join(output_dir, 'mask.json')
192
- with open(mask_json_path, 'w') as f:
193
- json.dump(json_data, f)
194
-
195
- return mask_img_path, mask_json_path
196
-
197
- def show_box(box, ax, label):
198
- x0, y0 = box[0], box[1]
199
- w, h = box[2] - box[0], box[3] - box[1]
200
- ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
201
- ax.text(x0, y0, label)
202
-
203
- config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
204
- ckpt_repo_id = "ShilongLiu/GroundingDINO"
205
- ckpt_filenmae = "groundingdino_swint_ogc.pth"
206
- sam_checkpoint='sam_vit_h_4b8939.pth'
207
- output_dir="outputs"
208
- device="cpu"
209
-
210
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
211
- blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
212
-
213
- def generate_caption(raw_image):
214
- # unconditional image captioning
215
- inputs = processor(raw_image, return_tensors="pt")
216
- out = blip_model.generate(**inputs)
217
- caption = processor.decode(out[0], skip_special_tokens=True)
218
- return caption
219
-
220
-
221
- def generate_tags(caption, split=',', max_tokens=100, model="gpt-3.5-turbo", openai_key=''):
222
- openai.api_key = openai_key
223
- prompt = [
224
- {
225
- 'role': 'system',
226
- 'content': 'Extract the unique nouns in the caption. Remove all the adjectives. ' + \
227
- f'List the nouns in singular form. Split them by "{split} ". ' + \
228
- f'Caption: {caption}.'
229
- }
230
- ]
231
- response = openai.ChatCompletion.create(model=model, messages=prompt, temperature=0.6, max_tokens=max_tokens)
232
- reply = response['choices'][0]['message']['content']
233
- # sometimes return with "noun: xxx, xxx, xxx"
234
- tags = reply.split(':')[-1].strip()
235
- return tags
236
-
237
- def check_caption(caption, pred_phrases, max_tokens=100, model="gpt-3.5-turbo"):
238
- object_list = [obj.split('(')[0] for obj in pred_phrases]
239
- object_num = []
240
- for obj in set(object_list):
241
- object_num.append(f'{object_list.count(obj)} {obj}')
242
- object_num = ', '.join(object_num)
243
- print(f"Correct object number: {object_num}")
244
-
245
- prompt = [
246
- {
247
- 'role': 'system',
248
- 'content': 'Revise the number in the caption if it is wrong. ' + \
249
- f'Caption: {caption}. ' + \
250
- f'True object number: {object_num}. ' + \
251
- 'Only give the revised caption: '
252
- }
253
- ]
254
- response = openai.ChatCompletion.create(model=model, messages=prompt, temperature=0.6, max_tokens=max_tokens)
255
- reply = response['choices'][0]['message']['content']
256
- # sometimes return with "Caption: xxx, xxx, xxx"
257
- caption = reply.split(':')[-1].strip()
258
- return caption
259
-
260
- def run_grounded_sam(image_path, openai_key, box_threshold, text_threshold, iou_threshold, area_threshold):
261
- assert openai_key, 'Openai key is not found!'
262
-
263
- # make dir
264
- os.makedirs(output_dir, exist_ok=True)
265
- # load image
266
- image_pil, image = load_image(image_path.convert("RGB"))
267
- # load model
268
- model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
269
-
270
- # visualize raw image
271
- image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
272
-
273
- caption = generate_caption(image_pil)
274
- # Currently ", " is better for detecting single tags
275
- # while ". " is a little worse in some case
276
- split = ','
277
- tags = generate_tags(caption, split=split, openai_key=openai_key)
278
-
279
- # run grounding dino model
280
- boxes_filt, scores, pred_phrases = get_grounding_output(
281
- model, image, tags, box_threshold, text_threshold, device=device
282
- )
283
-
284
- size = image_pil.size
285
-
286
- # initialize SAM
287
- predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
288
- image = np.array(image_path)
289
- predictor.set_image(image)
290
-
291
- H, W = size[1], size[0]
292
- for i in range(boxes_filt.size(0)):
293
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
294
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
295
- boxes_filt[i][2:] += boxes_filt[i][:2]
296
-
297
- boxes_filt = boxes_filt.cpu()
298
- # use NMS to handle overlapped boxes
299
- print(f"Before NMS: {boxes_filt.shape[0]} boxes")
300
- nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
301
- boxes_filt = boxes_filt[nms_idx]
302
- pred_phrases = [pred_phrases[idx] for idx in nms_idx]
303
- print(f"After NMS: {boxes_filt.shape[0]} boxes")
304
- caption = check_caption(caption, pred_phrases)
305
- print(f"Revise caption with number: {caption}")
306
-
307
- transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
308
-
309
- masks, _, _ = predictor.predict_torch(
310
- point_coords = None,
311
- point_labels = None,
312
- boxes = transformed_boxes,
313
- multimask_output = False,
314
- )
315
- # area threshold: remove the mask when area < area_thresh (in pixels)
316
- new_masks = []
317
- for mask in masks:
318
- # reshape to be used in remove_small_regions()
319
- mask = mask.cpu().numpy().squeeze()
320
- mask, _ = remove_small_regions(mask, area_threshold, mode="holes")
321
- mask, _ = remove_small_regions(mask, area_threshold, mode="islands")
322
- new_masks.append(torch.as_tensor(mask).unsqueeze(0))
323
-
324
- masks = torch.stack(new_masks, dim=0)
325
- # masks: [1, 1, 512, 512]
326
- assert sam_checkpoint, 'sam_checkpoint is not found!'
327
-
328
- # draw output image
329
- plt.figure(figsize=(10, 10))
330
- plt.imshow(image)
331
- for mask in masks:
332
- show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
333
- for box, label in zip(boxes_filt, pred_phrases):
334
- show_box(box.numpy(), plt.gca(), label)
335
- plt.axis('off')
336
- image_path = os.path.join(output_dir, "grounding_dino_output.jpg")
337
- plt.savefig(image_path, bbox_inches="tight")
338
- image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
339
-
340
- mask_img_path, _ = save_mask_data('./outputs', masks, boxes_filt, pred_phrases)
341
-
342
- mask_img = cv2.cvtColor(cv2.imread(mask_img_path), cv2.COLOR_BGR2RGB)
343
-
344
- return image_result, mask_img, caption, tags
345
-
346
- if __name__ == "__main__":
347
-
348
- parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
349
- parser.add_argument("--debug", action="store_true", help="using debug mode")
350
- parser.add_argument("--share", action="store_true", help="share the app")
351
- args = parser.parse_args()
352
-
353
- block = gr.Blocks().queue()
354
- with block:
355
- with gr.Row():
356
- with gr.Column():
357
- input_image = gr.Image(source='upload', type="pil")
358
- openai_key = gr.Textbox(label="OpenAI key")
359
-
360
- run_button = gr.Button(label="Run")
361
- with gr.Accordion("Advanced options", open=False):
362
- box_threshold = gr.Slider(
363
- label="Box Threshold", minimum=0.0, maximum=1.0, value=0.3, step=0.001
364
- )
365
- text_threshold = gr.Slider(
366
- label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
367
- )
368
- iou_threshold = gr.Slider(
369
- label="IoU Threshold", minimum=0.0, maximum=1.0, value=0.5, step=0.001
370
- )
371
- area_threshold = gr.Slider(
372
- label="Area Threshold", minimum=0.0, maximum=2500, value=100, step=10
373
- )
374
-
375
- with gr.Column():
376
- image_caption = gr.Textbox(label="Image Caption")
377
- identified_labels = gr.Textbox(label="Key objects extracted by ChatGPT")
378
- gallery = gr.outputs.Image(
379
- type="pil",
380
- ).style(full_width=True, full_height=True)
381
-
382
- mask_gallary = gr.outputs.Image(
383
- type="pil",
384
- ).style(full_width=True, full_height=True)
385
-
386
-
387
- run_button.click(fn=run_grounded_sam, inputs=[
388
- input_image, openai_key, box_threshold, text_threshold, iou_threshold, area_threshold],
389
- outputs=[gallery, mask_gallary, image_caption, identified_labels])
390
-
391
-
392
- block.launch(server_name='0.0.0.0', server_port=7589, debug=args.debug, share=args.share)
grounded_sam.ipynb CHANGED
@@ -224,7 +224,9 @@
224
  "outputs": [],
225
  "source": [
226
  "sam_checkpoint = 'sam_vit_h_4b8939.pth'\n",
227
- "sam_predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))"
228
  ]
229
  },
230
  {
@@ -404,7 +406,7 @@
404
  "metadata": {},
405
  "outputs": [],
406
  "source": [
407
- "transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes_xyxy, image_source.shape[:2])\n",
408
  "masks, _, _ = sam_predictor.predict_torch(\n",
409
  " point_coords = None,\n",
410
  " point_labels = None,\n",
224
  "outputs": [],
225
  "source": [
226
  "sam_checkpoint = 'sam_vit_h_4b8939.pth'\n",
227
+ "sam = build_sam(checkpoint=sam_checkpoint)\n",
228
+ "sam.to(device=device)\n",
229
+ "sam_predictor = SamPredictor(sam)"
230
  ]
231
  },
232
  {
406
  "metadata": {},
407
  "outputs": [],
408
  "source": [
409
+ "transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes_xyxy, image_source.shape[:2]).to(device)\n",
410
  "masks, _, _ = sam_predictor.predict_torch(\n",
411
  " point_coords = None,\n",
412
  " point_labels = None,\n",
grounded_sam_demo.py DELETED
@@ -1,217 +0,0 @@
1
- import argparse
2
- import os
3
- import copy
4
-
5
- import numpy as np
6
- import json
7
- import torch
8
- from PIL import Image, ImageDraw, ImageFont
9
-
10
- # Grounding DINO
11
- import GroundingDINO.groundingdino.datasets.transforms as T
12
- from GroundingDINO.groundingdino.models import build_model
13
- from GroundingDINO.groundingdino.util import box_ops
14
- from GroundingDINO.groundingdino.util.slconfig import SLConfig
15
- from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
16
-
17
- # segment anything
18
- from segment_anything import build_sam, SamPredictor
19
- import cv2
20
- import numpy as np
21
- import matplotlib.pyplot as plt
22
-
23
-
24
- def load_image(image_path):
25
- # load image
26
- image_pil = Image.open(image_path).convert("RGB") # load image
27
-
28
- transform = T.Compose(
29
- [
30
- T.RandomResize([800], max_size=1333),
31
- T.ToTensor(),
32
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
33
- ]
34
- )
35
- image, _ = transform(image_pil, None) # 3, h, w
36
- return image_pil, image
37
-
38
-
39
- def load_model(model_config_path, model_checkpoint_path, device):
40
- args = SLConfig.fromfile(model_config_path)
41
- args.device = device
42
- model = build_model(args)
43
- checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
44
- load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
45
- print(load_res)
46
- _ = model.eval()
47
- return model
48
-
49
-
50
- def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
51
- caption = caption.lower()
52
- caption = caption.strip()
53
- if not caption.endswith("."):
54
- caption = caption + "."
55
- model = model.to(device)
56
- image = image.to(device)
57
- with torch.no_grad():
58
- outputs = model(image[None], captions=[caption])
59
- logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
60
- boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
61
- logits.shape[0]
62
-
63
- # filter output
64
- logits_filt = logits.clone()
65
- boxes_filt = boxes.clone()
66
- filt_mask = logits_filt.max(dim=1)[0] > box_threshold
67
- logits_filt = logits_filt[filt_mask] # num_filt, 256
68
- boxes_filt = boxes_filt[filt_mask] # num_filt, 4
69
- logits_filt.shape[0]
70
-
71
- # get phrase
72
- tokenlizer = model.tokenizer
73
- tokenized = tokenlizer(caption)
74
- # build pred
75
- pred_phrases = []
76
- for logit, box in zip(logits_filt, boxes_filt):
77
- pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
78
- if with_logits:
79
- pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
80
- else:
81
- pred_phrases.append(pred_phrase)
82
-
83
- return boxes_filt, pred_phrases
84
-
85
- def show_mask(mask, ax, random_color=False):
86
- if random_color:
87
- color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
88
- else:
89
- color = np.array([30/255, 144/255, 255/255, 0.6])
90
- h, w = mask.shape[-2:]
91
- mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
92
- ax.imshow(mask_image)
93
-
94
-
95
- def show_box(box, ax, label):
96
- x0, y0 = box[0], box[1]
97
- w, h = box[2] - box[0], box[3] - box[1]
98
- ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
99
- ax.text(x0, y0, label)
100
-
101
-
102
- def save_mask_data(output_dir, mask_list, box_list, label_list):
103
- value = 0 # 0 for background
104
-
105
- mask_img = torch.zeros(mask_list.shape[-2:])
106
- for idx, mask in enumerate(mask_list):
107
- mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1
108
- plt.figure(figsize=(10, 10))
109
- plt.imshow(mask_img.numpy())
110
- plt.axis('off')
111
- plt.savefig(os.path.join(output_dir, 'mask.jpg'), bbox_inches="tight", dpi=300, pad_inches=0.0)
112
-
113
- json_data = [{
114
- 'value': value,
115
- 'label': 'background'
116
- }]
117
- for label, box in zip(label_list, box_list):
118
- value += 1
119
- name, logit = label.split('(')
120
- logit = logit[:-1] # the last is ')'
121
- json_data.append({
122
- 'value': value,
123
- 'label': name,
124
- 'logit': float(logit),
125
- 'box': box.numpy().tolist(),
126
- })
127
- with open(os.path.join(output_dir, 'mask.json'), 'w') as f:
128
- json.dump(json_data, f)
129
-
130
-
131
- if __name__ == "__main__":
132
-
133
- parser = argparse.ArgumentParser("Grounded-Segment-Anything Demo", add_help=True)
134
- parser.add_argument("--config", type=str, required=True, help="path to config file")
135
- parser.add_argument(
136
- "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file"
137
- )
138
- parser.add_argument(
139
- "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
140
- )
141
- parser.add_argument("--input_image", type=str, required=True, help="path to image file")
142
- parser.add_argument("--text_prompt", type=str, required=True, help="text prompt")
143
- parser.add_argument(
144
- "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
145
- )
146
-
147
- parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
148
- parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
149
-
150
- parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
151
- args = parser.parse_args()
152
-
153
- # cfg
154
- config_file = args.config # change the path of the model config file
155
- grounded_checkpoint = args.grounded_checkpoint # change the path of the model
156
- sam_checkpoint = args.sam_checkpoint
157
- image_path = args.input_image
158
- text_prompt = args.text_prompt
159
- output_dir = args.output_dir
160
- box_threshold = args.box_threshold
161
- text_threshold = args.box_threshold
162
- device = args.device
163
-
164
- # make dir
165
- os.makedirs(output_dir, exist_ok=True)
166
- # load image
167
- image_pil, image = load_image(image_path)
168
- # load model
169
- model = load_model(config_file, grounded_checkpoint, device=device)
170
-
171
- # visualize raw image
172
- image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
173
-
174
- # run grounding dino model
175
- boxes_filt, pred_phrases = get_grounding_output(
176
- model, image, text_prompt, box_threshold, text_threshold, device=device
177
- )
178
-
179
- # initialize SAM
180
- predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
181
- image = cv2.imread(image_path)
182
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
183
- predictor.set_image(image)
184
-
185
- size = image_pil.size
186
- H, W = size[1], size[0]
187
- for i in range(boxes_filt.size(0)):
188
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
189
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
190
- boxes_filt[i][2:] += boxes_filt[i][:2]
191
-
192
- boxes_filt = boxes_filt.cpu()
193
- transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
194
-
195
- masks, _, _ = predictor.predict_torch(
196
- point_coords = None,
197
- point_labels = None,
198
- boxes = transformed_boxes,
199
- multimask_output = False,
200
- )
201
-
202
- # draw output image
203
- plt.figure(figsize=(10, 10))
204
- plt.imshow(image)
205
- for mask in masks:
206
- show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
207
- for box, label in zip(boxes_filt, pred_phrases):
208
- show_box(box.numpy(), plt.gca(), label)
209
-
210
- plt.axis('off')
211
- plt.savefig(
212
- os.path.join(output_dir, "grounded_sam_output.jpg"),
213
- bbox_inches="tight", dpi=300, pad_inches=0.0
214
- )
215
-
216
- save_mask_data(output_dir, masks, boxes_filt, pred_phrases)
217
-
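For reference, the deleted script converts GroundingDINO's normalized (cx, cy, w, h) boxes into absolute-pixel (x0, y0, x1, y1) boxes before handing them to SAM (the per-box loop over `boxes_filt` above). A vectorized sketch of the same arithmetic, with a made-up box for illustration:

```python
import torch

def cxcywh_norm_to_xyxy_pixels(boxes: torch.Tensor, W: int, H: int) -> torch.Tensor:
    """boxes: (N, 4) normalized (cx, cy, w, h) -> (N, 4) pixel (x0, y0, x1, y1)."""
    boxes = boxes * torch.tensor([W, H, W, H], dtype=boxes.dtype)  # scale to pixels
    xy0 = boxes[:, :2] - boxes[:, 2:] / 2                          # top-left corner
    xy1 = xy0 + boxes[:, 2:]                                       # bottom-right corner
    return torch.cat([xy0, xy1], dim=1)

# e.g. a centered half-size box in a 640x480 image
print(cxcywh_norm_to_xyxy_pixels(torch.tensor([[0.5, 0.5, 0.5, 0.5]]), W=640, H=480))
# tensor([[160., 120., 480., 360.]])
```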
grounded_sam_inpainting_demo.py DELETED
@@ -1,215 +0,0 @@
1
- import argparse
2
- import os
3
- import copy
4
-
5
- import numpy as np
6
- import torch
7
- from PIL import Image, ImageDraw, ImageFont
8
-
9
- # Grounding DINO
10
- import GroundingDINO.groundingdino.datasets.transforms as T
11
- from GroundingDINO.groundingdino.models import build_model
12
- from GroundingDINO.groundingdino.util import box_ops
13
- from GroundingDINO.groundingdino.util.slconfig import SLConfig
14
- from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
15
-
16
- # segment anything
17
- from segment_anything import build_sam, SamPredictor
18
- import cv2
19
- import numpy as np
20
- import matplotlib.pyplot as plt
21
-
22
-
23
- # diffusers
24
- import PIL
25
- import requests
26
- import torch
27
- from io import BytesIO
28
- from diffusers import StableDiffusionInpaintPipeline
29
-
30
-
31
- def load_image(image_path):
32
- # load image
33
- image_pil = Image.open(image_path).convert("RGB") # load image
34
-
35
- transform = T.Compose(
36
- [
37
- T.RandomResize([800], max_size=1333),
38
- T.ToTensor(),
39
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
40
- ]
41
- )
42
- image, _ = transform(image_pil, None) # 3, h, w
43
- return image_pil, image
44
-
45
-
46
- def load_model(model_config_path, model_checkpoint_path, device):
47
- args = SLConfig.fromfile(model_config_path)
48
- args.device = device
49
- model = build_model(args)
50
- checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
51
- load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
52
- print(load_res)
53
- _ = model.eval()
54
- return model
55
-
56
-
57
- def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
58
- caption = caption.lower()
59
- caption = caption.strip()
60
- if not caption.endswith("."):
61
- caption = caption + "."
62
- model = model.to(device)
63
- image = image.to(device)
64
- with torch.no_grad():
65
- outputs = model(image[None], captions=[caption])
66
- logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
67
- boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
68
- logits.shape[0]
69
-
70
- # filter output
71
- logits_filt = logits.clone()
72
- boxes_filt = boxes.clone()
73
- filt_mask = logits_filt.max(dim=1)[0] > box_threshold
74
- logits_filt = logits_filt[filt_mask] # num_filt, 256
75
- boxes_filt = boxes_filt[filt_mask] # num_filt, 4
76
- logits_filt.shape[0]
77
-
78
- # get phrase
79
- tokenlizer = model.tokenizer
80
- tokenized = tokenlizer(caption)
81
- # build pred
82
- pred_phrases = []
83
- for logit, box in zip(logits_filt, boxes_filt):
84
- pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
85
- if with_logits:
86
- pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
87
- else:
88
- pred_phrases.append(pred_phrase)
89
-
90
- return boxes_filt, pred_phrases
91
-
92
- def show_mask(mask, ax, random_color=False):
93
- if random_color:
94
- color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
95
- else:
96
- color = np.array([30/255, 144/255, 255/255, 0.6])
97
- h, w = mask.shape[-2:]
98
- mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
99
- ax.imshow(mask_image)
100
-
101
-
102
- def show_box(box, ax, label):
103
- x0, y0 = box[0], box[1]
104
- w, h = box[2] - box[0], box[3] - box[1]
105
- ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
106
- ax.text(x0, y0, label)
107
-
108
-
109
- if __name__ == "__main__":
110
-
111
- parser = argparse.ArgumentParser("Grounded-Segment-Anything Demo", add_help=True)
112
- parser.add_argument("--config", type=str, required=True, help="path to config file")
113
- parser.add_argument(
114
- "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file"
115
- )
116
- parser.add_argument(
117
- "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
118
- )
119
- parser.add_argument("--input_image", type=str, required=True, help="path to image file")
120
- parser.add_argument("--det_prompt", type=str, required=True, help="text prompt")
121
- parser.add_argument("--inpaint_prompt", type=str, required=True, help="inpaint prompt")
122
- parser.add_argument(
123
- "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
124
- )
125
-
126
- parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
127
- parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
128
- parser.add_argument("--inpaint_mode", type=str, default="first", help="inpaint mode")
129
- parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
130
- args = parser.parse_args()
131
-
132
- # cfg
133
- config_file = args.config # change the path of the model config file
134
- grounded_checkpoint = args.grounded_checkpoint # change the path of the model
135
- sam_checkpoint = args.sam_checkpoint
136
- image_path = args.input_image
137
- det_prompt = args.det_prompt
138
- inpaint_prompt = args.inpaint_prompt
139
- output_dir = args.output_dir
140
- box_threshold = args.box_threshold
141
- text_threshold = args.box_threshold
142
- inpaint_mode = args.inpaint_mode
143
- device = args.device
144
-
145
- # make dir
146
- os.makedirs(output_dir, exist_ok=True)
147
- # load image
148
- image_pil, image = load_image(image_path)
149
- # load model
150
- model = load_model(config_file, grounded_checkpoint, device=device)
151
-
152
- # visualize raw image
153
- image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
154
-
155
- # run grounding dino model
156
- boxes_filt, pred_phrases = get_grounding_output(
157
- model, image, det_prompt, box_threshold, text_threshold, device=device
158
- )
159
-
160
- # initialize SAM
161
- predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
162
- image = cv2.imread(image_path)
163
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
164
- predictor.set_image(image)
165
-
166
- size = image_pil.size
167
- H, W = size[1], size[0]
168
- for i in range(boxes_filt.size(0)):
169
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
170
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
171
- boxes_filt[i][2:] += boxes_filt[i][:2]
172
-
173
- boxes_filt = boxes_filt.cpu()
174
- transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
175
-
176
- masks, _, _ = predictor.predict_torch(
177
- point_coords = None,
178
- point_labels = None,
179
- boxes = transformed_boxes,
180
- multimask_output = False,
181
- )
182
-
183
- # masks: [1, 1, 512, 512]
184
-
185
- # inpainting pipeline
186
- if inpaint_mode == 'merge':
187
- masks = torch.sum(masks, dim=0).unsqueeze(0)
188
- masks = torch.where(masks > 0, True, False)
189
- else:
190
- mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
191
- mask_pil = Image.fromarray(mask)
192
- image_pil = Image.fromarray(image)
193
-
194
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
195
- "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
196
- )
197
- pipe = pipe.to("cuda")
198
-
199
- image_pil = image_pil.resize((512, 512))
200
- mask_pil = mask_pil.resize((512, 512))
201
- # prompt = "A sofa, high quality, detailed"
202
- image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
203
- image = image.resize(size)
204
- image.save(os.path.join(output_dir, "grounded_sam_inpainting_output.jpg"))
205
-
206
- # draw output image
207
- # plt.figure(figsize=(10, 10))
208
- # plt.imshow(image)
209
- # for mask in masks:
210
- # show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
211
- # for box, label in zip(boxes_filt, pred_phrases):
212
- # show_box(box.numpy(), plt.gca(), label)
213
- # plt.axis('off')
214
- # plt.savefig(os.path.join(output_dir, "grounded_sam_output.jpg"), bbox_inches="tight")
215
-
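The deleted inpainting demo hands a single SAM mask to `StableDiffusionInpaintPipeline` at 512x512 and resizes the result back afterwards. A minimal, hedged sketch of that hand-off with placeholder image and mask data (a CUDA GPU with float16 support is assumed):

```python
import numpy as np
import torch
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline

image_pil = Image.fromarray(np.zeros((512, 512, 3), dtype=np.uint8))  # placeholder scene
mask = np.zeros((512, 512), dtype=bool)
mask[128:384, 128:384] = True                                         # placeholder SAM mask
mask_pil = Image.fromarray(mask)

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

# the pipeline is fed 512x512 inputs here; resize back to the original size if needed
result = pipe(
    prompt="A sofa, high quality, detailed",
    image=image_pil.resize((512, 512)),
    mask_image=mask_pil.resize((512, 512)),
).images[0]
result.save("grounded_sam_inpainting_output.jpg")
```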
grounded_sam_whisper_demo.py DELETED
@@ -1,258 +0,0 @@
1
- import argparse
2
- import os
3
- import copy
4
-
5
- import numpy as np
6
- import json
7
- import torch
8
- import torchvision
9
- from PIL import Image, ImageDraw, ImageFont
10
-
11
- # Grounding DINO
12
- import GroundingDINO.groundingdino.datasets.transforms as T
13
- from GroundingDINO.groundingdino.models import build_model
14
- from GroundingDINO.groundingdino.util import box_ops
15
- from GroundingDINO.groundingdino.util.slconfig import SLConfig
16
- from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
17
-
18
- # segment anything
19
- from segment_anything import build_sam, SamPredictor
20
- import cv2
21
- import numpy as np
22
- import matplotlib.pyplot as plt
23
-
24
- # whisper
25
- import whisper
26
-
27
-
28
- def load_image(image_path):
29
- # load image
30
- image_pil = Image.open(image_path).convert("RGB") # load image
31
-
32
- transform = T.Compose(
33
- [
34
- T.RandomResize([800], max_size=1333),
35
- T.ToTensor(),
36
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
37
- ]
38
- )
39
- image, _ = transform(image_pil, None) # 3, h, w
40
- return image_pil, image
41
-
42
-
43
- def load_model(model_config_path, model_checkpoint_path, device):
44
- args = SLConfig.fromfile(model_config_path)
45
- args.device = device
46
- model = build_model(args)
47
- checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
48
- load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
49
- print(load_res)
50
- _ = model.eval()
51
- return model
52
-
53
-
54
- def get_grounding_output(model, image, caption, box_threshold, text_threshold,device="cpu"):
55
- caption = caption.lower()
56
- caption = caption.strip()
57
- if not caption.endswith("."):
58
- caption = caption + "."
59
- model = model.to(device)
60
- image = image.to(device)
61
- with torch.no_grad():
62
- outputs = model(image[None], captions=[caption])
63
- logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
64
- boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
65
- logits.shape[0]
66
-
67
- # filter output
68
- logits_filt = logits.clone()
69
- boxes_filt = boxes.clone()
70
- filt_mask = logits_filt.max(dim=1)[0] > box_threshold
71
- logits_filt = logits_filt[filt_mask] # num_filt, 256
72
- boxes_filt = boxes_filt[filt_mask] # num_filt, 4
73
- logits_filt.shape[0]
74
-
75
- # get phrase
76
- tokenlizer = model.tokenizer
77
- tokenized = tokenlizer(caption)
78
- # build pred
79
- pred_phrases = []
80
- scores = []
81
- for logit, box in zip(logits_filt, boxes_filt):
82
- pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
83
- pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
84
- scores.append(logit.max().item())
85
-
86
- return boxes_filt, torch.Tensor(scores), pred_phrases
87
-
88
- def show_mask(mask, ax, random_color=False):
89
- if random_color:
90
- color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
91
- else:
92
- color = np.array([30/255, 144/255, 255/255, 0.6])
93
- h, w = mask.shape[-2:]
94
- mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
95
- ax.imshow(mask_image)
96
-
97
-
98
- def show_box(box, ax, label):
99
- x0, y0 = box[0], box[1]
100
- w, h = box[2] - box[0], box[3] - box[1]
101
- ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
102
- ax.text(x0, y0, label)
103
-
104
-
105
- def save_mask_data(output_dir, mask_list, box_list, label_list):
106
- value = 0 # 0 for background
107
-
108
- mask_img = torch.zeros(mask_list.shape[-2:])
109
- for idx, mask in enumerate(mask_list):
110
- mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1
111
- plt.figure(figsize=(10, 10))
112
- plt.imshow(mask_img.numpy())
113
- plt.axis('off')
114
- plt.savefig(os.path.join(output_dir, 'mask.jpg'), bbox_inches="tight", dpi=300, pad_inches=0.0)
115
-
116
- json_data = [{
117
- 'value': value,
118
- 'label': 'background'
119
- }]
120
- for label, box in zip(label_list, box_list):
121
- value += 1
122
- name, logit = label.split('(')
123
- logit = logit[:-1] # the last is ')'
124
- json_data.append({
125
- 'value': value,
126
- 'label': name,
127
- 'logit': float(logit),
128
- 'box': box.numpy().tolist(),
129
- })
130
- with open(os.path.join(output_dir, 'mask.json'), 'w') as f:
131
- json.dump(json_data, f)
132
-
133
-
134
- def speech_recognition(speech_file, model):
135
- # whisper
136
- # load audio and pad/trim it to fit 30 seconds
137
- audio = whisper.load_audio(speech_file)
138
- audio = whisper.pad_or_trim(audio)
139
-
140
- # make log-Mel spectrogram and move to the same device as the model
141
- mel = whisper.log_mel_spectrogram(audio).to(model.device)
142
-
143
- # detect the spoken language
144
- _, probs = model.detect_language(mel)
145
- speech_language = max(probs, key=probs.get)
146
-
147
- # decode the audio
148
- options = whisper.DecodingOptions()
149
- result = whisper.decode(model, mel, options)
150
-
151
- # print the recognized text
152
- speech_text = result.text
153
- return speech_text, speech_language
154
-
155
- if __name__ == "__main__":
156
-
157
- parser = argparse.ArgumentParser("Grounded-Segment-Anything Demo", add_help=True)
158
- parser.add_argument("--config", type=str, required=True, help="path to config file")
159
- parser.add_argument(
160
- "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file"
161
- )
162
- parser.add_argument(
163
- "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
164
- )
165
- parser.add_argument("--input_image", type=str, required=True, help="path to image file")
166
- parser.add_argument("--speech_file", type=str, required=True, help="speech file")
167
- parser.add_argument(
168
- "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
169
- )
170
-
171
- parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
172
- parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
173
- parser.add_argument("--iou_threshold", type=float, default=0.5, help="iou threshold")
174
-
175
- parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
176
- args = parser.parse_args()
177
-
178
- # cfg
179
- config_file = args.config # change the path of the model config file
180
- grounded_checkpoint = args.grounded_checkpoint # change the path of the model
181
- sam_checkpoint = args.sam_checkpoint
182
- image_path = args.input_image
183
- output_dir = args.output_dir
184
- box_threshold = args.box_threshold
185
- text_threshold = args.text_threshold
186
- iou_threshold = args.iou_threshold
187
- device = args.device
188
-
189
- # load speech
190
- whisper_model = whisper.load_model("base")
191
- speech_text, speech_language = speech_recognition(args.speech_file, whisper_model)
192
- print(f"speech_text: {speech_text}")
193
- print(f"speech_language: {speech_language}")
194
-
195
- # make dir
196
- os.makedirs(output_dir, exist_ok=True)
197
- # load image
198
- image_pil, image = load_image(image_path)
199
- # load model
200
- model = load_model(config_file, grounded_checkpoint, device=device)
201
-
202
- # visualize raw image
203
- image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
204
-
205
- # run grounding dino model
206
- text_prompt = speech_text
207
- boxes_filt, scores, pred_phrases = get_grounding_output(
208
- model, image, text_prompt, box_threshold, text_threshold, device=device
209
- )
210
-
211
- # initialize SAM
212
- predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint).to(args.device))
213
- image = cv2.imread(image_path)
214
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
215
- predictor.set_image(image)
216
-
217
- size = image_pil.size
218
- H, W = size[1], size[0]
219
- for i in range(boxes_filt.size(0)):
220
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
221
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
222
- boxes_filt[i][2:] += boxes_filt[i][:2]
223
-
224
- boxes_filt = boxes_filt.cpu()
225
- # use NMS to handle overlapped boxes
226
- print(f"Before NMS: {boxes_filt.shape[0]} boxes")
227
- nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
228
- boxes_filt = boxes_filt[nms_idx]
229
- pred_phrases = [pred_phrases[idx] for idx in nms_idx]
230
- print(f"After NMS: {boxes_filt.shape[0]} boxes")
231
-
232
- transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
233
-
234
- masks, _, _ = predictor.predict_torch(
235
- point_coords = None,
236
- point_labels = None,
237
- boxes = transformed_boxes.to(args.device),
238
- multimask_output = False,
239
- )
240
-
241
- # draw output image
242
- plt.figure(figsize=(10, 10))
243
- plt.imshow(image)
244
- for mask in masks:
245
- show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
246
- for box, label in zip(boxes_filt, pred_phrases):
247
- show_box(box.numpy(), plt.gca(), label)
248
-
249
- plt.title(speech_text)
250
- plt.axis('off')
251
- plt.savefig(
252
- os.path.join(output_dir, "grounded_sam_whisper_output.jpg"),
253
- bbox_inches="tight", dpi=300, pad_inches=0.0
254
- )
255
-
256
-
257
- save_mask_data(output_dir, masks, boxes_filt, pred_phrases)
258
-
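The whisper demo above filters overlapping GroundingDINO boxes with `torchvision.ops.nms` before segmentation. A tiny self-contained example of that filtering step, with made-up boxes, scores, and phrases:

```python
import torch
import torchvision

boxes_filt = torch.tensor([[10., 10., 100., 100.],
                           [12., 12., 98., 102.],
                           [200., 200., 300., 300.]])
scores = torch.tensor([0.9, 0.6, 0.8])
pred_phrases = ["dog(0.90)", "dog(0.60)", "cat(0.80)"]
iou_threshold = 0.5

# keep the highest-scoring box among any group whose IoU exceeds the threshold
nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
boxes_filt = boxes_filt[nms_idx]
pred_phrases = [pred_phrases[idx] for idx in nms_idx]
print(f"kept {boxes_filt.shape[0]} boxes: {pred_phrases}")  # the two near-duplicate boxes collapse to one
```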
grounded_sam_whisper_inpainting_demo.py DELETED
@@ -1,281 +0,0 @@
1
- import argparse
2
- import os
3
- from warnings import warn
4
-
5
- import numpy as np
6
- import torch
7
- from PIL import Image, ImageDraw, ImageFont
8
-
9
- # Grounding DINO
10
- import GroundingDINO.groundingdino.datasets.transforms as T
11
- from GroundingDINO.groundingdino.models import build_model
12
- from GroundingDINO.groundingdino.util import box_ops
13
- from GroundingDINO.groundingdino.util.slconfig import SLConfig
14
- from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
15
-
16
- # segment anything
17
- from segment_anything import build_sam, SamPredictor
18
- import cv2
19
- import numpy as np
20
- import matplotlib.pyplot as plt
21
-
22
-
23
- # diffusers
24
- import PIL
25
- import requests
26
- import torch
27
- from io import BytesIO
28
- from diffusers import StableDiffusionInpaintPipeline
29
-
30
- # whisper
31
- import whisper
32
-
33
- # ChatGPT
34
- import openai
35
-
36
-
37
- def load_image(image_path):
38
- # load image
39
- image_pil = Image.open(image_path).convert("RGB") # load image
40
-
41
- transform = T.Compose(
42
- [
43
- T.RandomResize([800], max_size=1333),
44
- T.ToTensor(),
45
- T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
46
- ]
47
- )
48
- image, _ = transform(image_pil, None) # 3, h, w
49
- return image_pil, image
50
-
51
-
52
- def load_model(model_config_path, model_checkpoint_path, device):
53
- args = SLConfig.fromfile(model_config_path)
54
- args.device = device
55
- model = build_model(args)
56
- checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
57
- load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
58
- print(load_res)
59
- _ = model.eval()
60
- return model
61
-
62
-
63
- def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True, device="cpu"):
64
- caption = caption.lower()
65
- caption = caption.strip()
66
- if not caption.endswith("."):
67
- caption = caption + "."
68
- model = model.to(device)
69
- image = image.to(device)
70
- with torch.no_grad():
71
- outputs = model(image[None], captions=[caption])
72
- logits = outputs["pred_logits"].cpu().sigmoid()[0] # (nq, 256)
73
- boxes = outputs["pred_boxes"].cpu()[0] # (nq, 4)
74
- logits.shape[0]
75
-
76
- # filter output
77
- logits_filt = logits.clone()
78
- boxes_filt = boxes.clone()
79
- filt_mask = logits_filt.max(dim=1)[0] > box_threshold
80
- logits_filt = logits_filt[filt_mask] # num_filt, 256
81
- boxes_filt = boxes_filt[filt_mask] # num_filt, 4
82
- logits_filt.shape[0]
83
-
84
- # get phrase
85
- tokenlizer = model.tokenizer
86
- tokenized = tokenlizer(caption)
87
- # build pred
88
- pred_phrases = []
89
- for logit, box in zip(logits_filt, boxes_filt):
90
- pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
91
- if with_logits:
92
- pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
93
- else:
94
- pred_phrases.append(pred_phrase)
95
-
96
- return boxes_filt, pred_phrases
97
-
98
- def show_mask(mask, ax, random_color=False):
99
- if random_color:
100
- color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
101
- else:
102
- color = np.array([30/255, 144/255, 255/255, 0.6])
103
- h, w = mask.shape[-2:]
104
- mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
105
- ax.imshow(mask_image)
106
-
107
-
108
- def show_box(box, ax, label):
109
- x0, y0 = box[0], box[1]
110
- w, h = box[2] - box[0], box[3] - box[1]
111
- ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
112
- ax.text(x0, y0, label)
113
-
114
-
115
- def speech_recognition(speech_file, model):
116
- # whisper
117
- # load audio and pad/trim it to fit 30 seconds
118
- audio = whisper.load_audio(speech_file)
119
- audio = whisper.pad_or_trim(audio)
120
-
121
- # make log-Mel spectrogram and move to the same device as the model
122
- mel = whisper.log_mel_spectrogram(audio).to(model.device)
123
-
124
- # detect the spoken language
125
- _, probs = model.detect_language(mel)
126
- speech_language = max(probs, key=probs.get)
127
-
128
- # decode the audio
129
- options = whisper.DecodingOptions()
130
- result = whisper.decode(model, mel, options)
131
-
132
- # print the recognized text
133
- speech_text = result.text
134
- return speech_text, speech_language
135
-
136
-
137
- def filter_prompts_with_chatgpt(caption, max_tokens=100, model="gpt-3.5-turbo"):
138
- prompt = [
139
- {
140
- 'role': 'system',
141
- 'content': f"Extract the main object to be replaced and marked it as 'main_object', " + \
142
- f"Extract the remaining part as 'other prompt' " + \
143
- f"Return (main_object, other prompt)" + \
144
- f'Given caption: {caption}.'
145
- }
146
- ]
147
- response = openai.ChatCompletion.create(model=model, messages=prompt, temperature=0.6, max_tokens=max_tokens)
148
- reply = response['choices'][0]['message']['content']
149
- try:
150
- det_prompt, inpaint_prompt = reply.split('\n')[0].split(':')[-1].strip(), reply.split('\n')[1].split(':')[-1].strip()
151
- except:
152
- warn(f"Failed to extract tags from caption") # use caption as det_prompt, inpaint_prompt
153
- det_prompt, inpaint_prompt = caption, caption
154
- return det_prompt, inpaint_prompt
155
-
156
-
157
- if __name__ == "__main__":
158
-
159
- parser = argparse.ArgumentParser("Grounded-Segment-Anything Demo", add_help=True)
160
- parser.add_argument("--config", type=str, required=True, help="path to config file")
161
- parser.add_argument(
162
- "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file"
163
- )
164
- parser.add_argument(
165
- "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
166
- )
167
- parser.add_argument("--input_image", type=str, required=True, help="path to image file")
168
- parser.add_argument(
169
- "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
170
- )
171
- parser.add_argument("--det_speech_file", type=str, help="grounding speech file")
172
- parser.add_argument("--inpaint_speech_file", type=str, help="inpaint speech file")
173
- parser.add_argument("--prompt_speech_file", type=str, help="prompt speech file, no need to provide det_speech_file")
174
- parser.add_argument("--enable_chatgpt", action="store_true", help="enable chatgpt")
175
- parser.add_argument("--openai_key", type=str, help="key for chatgpt")
176
- parser.add_argument("--openai_proxy", default=None, type=str, help="proxy for chatgpt")
177
- parser.add_argument("--whisper_model", type=str, default="small", help="whisper model version: tiny, base, small, medium, large")
178
- parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
179
- parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
180
- parser.add_argument("--inpaint_mode", type=str, default="first", help="inpaint mode")
181
- parser.add_argument("--device", type=str, default="cpu", help="running on cpu only!, default=False")
182
- parser.add_argument("--prompt_extra", type=str, default=" high resolution, real scene", help="extra prompt for inpaint")
183
- args = parser.parse_args()
184
-
185
- # cfg
186
- config_file = args.config # change the path of the model config file
187
- grounded_checkpoint = args.grounded_checkpoint # change the path of the model
188
- sam_checkpoint = args.sam_checkpoint
189
- image_path = args.input_image
190
-
191
- output_dir = args.output_dir
192
- box_threshold = args.box_threshold
193
- text_threshold = args.box_threshold
194
- inpaint_mode = args.inpaint_mode
195
- device = args.device
196
-
197
- # make dir
198
- os.makedirs(output_dir, exist_ok=True)
199
- # load image
200
- image_pil, image = load_image(image_path)
201
- # load model
202
- model = load_model(config_file, grounded_checkpoint, device=device)
203
-
204
- # visualize raw image
205
- image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
206
-
207
- # recognize speech
208
- whisper_model = whisper.load_model(args.whisper_model)
209
-
210
- if args.enable_chatgpt:
211
- openai.api_key = args.openai_key
212
- if args.openai_proxy:
213
- openai.proxy = {"http": args.openai_proxy, "https": args.openai_proxy}
214
- speech_text, _ = speech_recognition(args.prompt_speech_file, whisper_model)
215
- det_prompt, inpaint_prompt = filter_prompts_with_chatgpt(speech_text)
216
- inpaint_prompt += args.prompt_extra
217
- print(f"det_prompt: {det_prompt}, inpaint_prompt: {inpaint_prompt}")
218
- else:
219
- det_prompt, det_speech_language = speech_recognition(args.det_speech_file, whisper_model)
220
- inpaint_prompt, inpaint_speech_language = speech_recognition(args.inpaint_speech_file, whisper_model)
221
- print(f"det_prompt: {det_prompt}, using language: {det_speech_language}")
222
- print(f"inpaint_prompt: {inpaint_prompt}, using language: {inpaint_speech_language}")
223
-
224
- # run grounding dino model
225
- boxes_filt, pred_phrases = get_grounding_output(
226
- model, image, det_prompt, box_threshold, text_threshold, device=device
227
- )
228
-
229
- # initialize SAM
230
- predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
231
- image = cv2.imread(image_path)
232
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
233
- predictor.set_image(image)
234
-
235
- size = image_pil.size
236
- H, W = size[1], size[0]
237
- for i in range(boxes_filt.size(0)):
238
- boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H])
239
- boxes_filt[i][:2] -= boxes_filt[i][2:] / 2
240
- boxes_filt[i][2:] += boxes_filt[i][:2]
241
-
242
- boxes_filt = boxes_filt.cpu()
243
- transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2])
244
-
245
- masks, _, _ = predictor.predict_torch(
246
- point_coords = None,
247
- point_labels = None,
248
- boxes = transformed_boxes,
249
- multimask_output = False,
250
- )
251
-
252
- # masks: [1, 1, 512, 512]
253
-
254
- # inpainting pipeline
255
- if inpaint_mode == 'merge':
256
- masks = torch.sum(masks, dim=0).unsqueeze(0)
257
- masks = torch.where(masks > 0, True, False)
258
- else:
259
- mask = masks[0][0].cpu().numpy() # simply choose the first mask, which will be refine in the future release
260
- mask_pil = Image.fromarray(mask)
261
- image_pil = Image.fromarray(image)
262
-
263
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
264
- "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
265
- )
266
- pipe = pipe.to("cuda")
267
-
268
- # prompt = "A sofa, high quality, detailed"
269
- image = pipe(prompt=inpaint_prompt, image=image_pil, mask_image=mask_pil).images[0]
270
- image.save(os.path.join(output_dir, "grounded_sam_inpainting_output.jpg"))
271
-
272
- # draw output image
273
- # plt.figure(figsize=(10, 10))
274
- # plt.imshow(image)
275
- # for mask in masks:
276
- # show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
277
- # for box, label in zip(boxes_filt, pred_phrases):
278
- # show_box(box.numpy(), plt.gca(), label)
279
- # plt.axis('off')
280
- # plt.savefig(os.path.join(output_dir, "grounded_sam_output.jpg"), bbox_inches="tight")
281
-
requirements.txt CHANGED
@@ -21,3 +21,12 @@ transformers
21
  yapf
22
  numba
23
  segment_anything
21
  yapf
22
  numba
23
  segment_anything
24
+
25
+ # ftfy
26
+ # uuid
27
+ # psutil
28
+ # facexlib
29
+ lama-cleaner==0.25.0
30
+ # tensorflow
31
+ # easydict
32
+