Arulkumar03 committed
Commit 3acc94f
1 Parent(s): e5e70ea

Update app.py

Files changed (1):
  app.py +106 -71
app.py CHANGED
@@ -1,98 +1,133 @@
  import argparse
- import copy
-
- from IPython.display import display
- from PIL import Image, ImageDraw, ImageFont
- from torchvision.ops import box_convert
-
- # Grounding DINO
- import groundingdino.datasets.transforms as T
  from groundingdino.models import build_model
- from groundingdino.util import box_ops
  from groundingdino.util.slconfig import SLConfig
- from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
  from groundingdino.util.inference import annotate, load_image, predict
-
- import supervision as sv
-
- # segment anything
- from segment_anything import build_sam, SamPredictor
- import cv2
- import numpy as np
- import matplotlib.pyplot as plt
-
- # diffusers
- import PIL
- import requests
- import torch
- from io import BytesIO
- from diffusers import StableDiffusionInpaintPipeline
- from huggingface_hub import hf_hub_download
-
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
- def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
-     cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
-
-     args = SLConfig.fromfile(cache_config_file)
-     args.device = device
      model = build_model(args)

      cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
-     checkpoint = torch.load(cache_file, map_location=device)
      log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
      print("Model loaded from {} \n => {}".format(cache_file, log))
      _ = model.eval()
-     return model
-
- ckpt_repo_id = "ShilongLiu/GroundingDINO"
- ckpt_filenmae = "groundingdino_swinb_cogcoor.pth"
- ckpt_config_filename = "GroundingDINO_SwinB.cfg.py"
-
- groundingdino_model = load_model_hf(ckpt_repo_id, ckpt_filenmae, ckpt_config_filename, device)
-
- checkpoint = 'sam_vit_h_4b8939.pth'
-
- predictor = SamPredictor(build_sam(checkpoint=checkpoint).to(device))
-
- # detect object using grounding DINO
- def detect(image, text_prompt, model, box_threshold = 0.3, text_threshold = 0.25):
-     boxes, logits, phrases = predict(
-         model=model,
-         image=image,
-         caption=text_prompt,
-         box_threshold=box_threshold,
-         text_threshold=text_threshold
-     )
-
-     annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
-     annotated_frame = annotated_frame[...,::-1] # BGR to RGB
-     return annotated_frame, boxes
-
- import gradio as gr
-
- # Define the Gradio interface
- def detect_objects(image, text_prompt):
-     # Convert Gradio input format to the format expected by the code
-     image_array = np.array(image)
-     image_source, _ = load_image(image_array)
-
-     # Detect objects using grounding DINO
-     annotated_frame, detected_boxes = detect(image_array, text_prompt, groundingdino_model)
-
-     # Convert the annotated frame to Gradio output format
-     annotated_image = Image.fromarray(annotated_frame)
-
-     return annotated_image
-
- # Create the Gradio interface
- iface = gr.Interface(
-     fn=detect_objects,
-     inputs=[gr.Image(), "text"],
-     outputs=gr.Image(),
-     live=True,
-     interpretation="default"
- )
-
- # Launch the Gradio interface
- iface.launch()
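The removed loader above pulled both the SwinB config and the checkpoint from the Hub. As a standalone sketch of that download pattern (repo id and filenames exactly as in the deleted lines, everything else illustrative):

    from huggingface_hub import hf_hub_download

    # Resolve the config and weights the old SwinB-based loader relied on
    config_path = hf_hub_download(repo_id="ShilongLiu/GroundingDINO", filename="GroundingDINO_SwinB.cfg.py")
    weights_path = hf_hub_download(repo_id="ShilongLiu/GroundingDINO", filename="groundingdino_swinb_cogcoor.pth")

The replacement below drops the SAM and diffusers dependencies and switches to the SwinT checkpoint with a local config file.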
 
  import argparse
+ from functools import partial
+ import cv2
+ import requests
+ import os
+ from io import BytesIO
+ from PIL import Image
+ import numpy as np
+ from pathlib import Path
+ import gradio as gr
+
+ import warnings
+
+ import torch
+
+ os.system("python setup.py build develop --user")
+ os.system("pip install packaging==21.3")
+ warnings.filterwarnings("ignore")
+
  from groundingdino.models import build_model
  from groundingdino.util.slconfig import SLConfig
+ from groundingdino.util.utils import clean_state_dict
  from groundingdino.util.inference import annotate, load_image, predict
+ import groundingdino.datasets.transforms as T

+ from huggingface_hub import hf_hub_download

+ # Use this command for evaluate the GLIP-T model
+ config_file = "groundingdino/config/GroundingDINO_SwinT_OGC.py"
+ ckpt_repo_id = "ShilongLiu/GroundingDINO"
+ ckpt_filenmae = "groundingdino_swint_ogc.pth"

+
+ def load_model_hf(model_config_path, repo_id, filename, device='cpu'):
+     args = SLConfig.fromfile(model_config_path)
      model = build_model(args)
+     args.device = device

      cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
+     checkpoint = torch.load(cache_file, map_location='cpu')
      log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
      print("Model loaded from {} \n => {}".format(cache_file, log))
      _ = model.eval()
+     return model

+ def image_transform_grounding(init_image):
+     transform = T.Compose([
+         T.RandomResize([800], max_size=1333),
+         T.ToTensor(),
+         T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+     ])
+     image, _ = transform(init_image, None) # 3, h, w
+     return init_image, image

+ def image_transform_grounding_for_vis(init_image):
+     transform = T.Compose([
+         T.RandomResize([800], max_size=1333),
+     ])
+     image, _ = transform(init_image, None) # 3, h, w
+     return image

+ model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)

+ def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
+     init_image = input_image.convert("RGB")
+     original_size = init_image.size

+     _, image_tensor = image_transform_grounding(init_image)
+     image_pil: Image = image_transform_grounding_for_vis(init_image)

+     # run grounidng
+     boxes, logits, phrases = predict(model, image_tensor, grounding_caption, box_threshold, text_threshold, device='cpu')
+     annotated_frame = annotate(image_source=np.asarray(image_pil), boxes=boxes, logits=logits, phrases=phrases)
+     image_with_box = Image.fromarray(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))

+     return image_with_box

+ if __name__ == "__main__":
+
+     parser = argparse.ArgumentParser("Grounding DINO demo", add_help=True)
+     parser.add_argument("--debug", action="store_true", help="using debug mode")
+     parser.add_argument("--share", action="store_true", help="share the app")
+     args = parser.parse_args()
+     css = """
+     #mkd {
+         height: 500px;
+         overflow: auto;
+         border: 1px solid #ccc;
+     }
+     """
+     block = gr.Blocks(css=css).queue()
+     with block:
+         gr.Markdown("<h1><center>Grounding DINO<h1><center>")
+         gr.Markdown("<h3><center>Open-World Detection with <a href='https://github.com/IDEA-Research/GroundingDINO'>Grounding DINO</a><h3><center>")
+         gr.Markdown("<h3><center>Note the model runs on CPU, so it may take a while to run the model.<h3><center>")
+
+         with gr.Row():
+             with gr.Column():
+                 input_image = gr.Image(source='upload', type="pil")
+                 grounding_caption = gr.Textbox(label="Detection Prompt")
+                 run_button = gr.Button(label="Run")
+                 with gr.Accordion("Advanced options", open=False):
+                     box_threshold = gr.Slider(
+                         label="Box Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
+                     )
+                     text_threshold = gr.Slider(
+                         label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
+                     )
+
+             with gr.Column():
+                 gallery = gr.outputs.Image(
+                     type="pil",
+                     # label="grounding results"
+                 ).style(full_width=True, full_height=True)
+                 # gallery = gr.Gallery(label="Generated images", show_label=False).style(
+                 #     grid=[1], height="auto", container=True, full_width=True, full_height=True)
+
+         run_button.click(fn=run_grounding, inputs=[
+                         input_image, grounding_caption, box_threshold, text_threshold], outputs=[gallery])
+         gr.Examples(
+             [["this_is_fine.png", "coffee cup", 0.25, 0.25]],
+             inputs = [input_image, grounding_caption, box_threshold, text_threshold],
+             outputs = [gallery],
+             fn=run_grounding,
+             cache_examples=True,
+             label='Try this example input!'
+         )
+         block.launch(share=False, show_api=False, show_error=True)
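For a quick sanity check of the new entry point outside the Gradio UI, a sketch along these lines should work, assuming app.py is importable from the Space's root and the example image shipped for gr.Examples is present (importing app.py still runs the module-level os.system calls and downloads the SwinT checkpoint):

    from PIL import Image
    import app  # building the GroundingDINO model happens at import time

    image = Image.open("this_is_fine.png")
    # same defaults as the Box/Text Threshold sliders
    annotated = app.run_grounding(image, "coffee cup", box_threshold=0.25, text_threshold=0.25)
    annotated.save("annotated.png")

run_grounding returns a PIL image with the predicted boxes drawn, which is what the gallery output displays.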