Spaces:

Mountchicken
/

Grounding-DINO-1.5

Runtime error

App Files Files Community

Mountchicken committed on May 17, 2024

Commit

bf9dee2

verified ·

1 Parent(s): cfa95f2

Upload 28 files

Browse files

Files changed (29) hide show

.gitattributes +11 -0
app.py +77 -0
asset/demo.jpg +0 -0
asset/demo2.jpeg +0 -0
asset/demo3.jpeg +0 -0
asset/demo4.jpeg +0 -0
asset/demo5.jpeg +0 -0
asset/demo_output.jpg +0 -0
asset/gd1.5_overall_framework.png +0 -0
asset/qualitative_visualization/common_object_vis.png +3 -0
asset/qualitative_visualization/common_object_vis2.png +3 -0
asset/qualitative_visualization/dense_object_vis.png +3 -0
asset/qualitative_visualization/dense_object_vis2.png +3 -0
asset/qualitative_visualization/edge_vis.png +3 -0
asset/qualitative_visualization/long_caption_vis.png +3 -0
asset/qualitative_visualization/long_caption_vis2.png +3 -0
asset/qualitative_visualization/long_caption_vis3.png +3 -0
asset/qualitative_visualization/longtail_object_vis.png +3 -0
asset/qualitative_visualization/short_caption_vis.png +3 -0
asset/qualitative_visualization/video_object_vis.png +3 -0
asset/video_cover.jpg +0 -0
asset/zeroshot.png +0 -0
gdino/__init__.py +4 -0
gdino/__pycache__/visualize.cpython-38.pyc +0 -0
gdino/model_wrapper.py +132 -0
gdino/version.py +1 -0
gdino/visualize.py +108 -0
requirements.txt +2 -0
setup.py +126 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/common_object_vis.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/common_object_vis2.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/dense_object_vis.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/dense_object_vis2.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/edge_vis.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/long_caption_vis.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/long_caption_vis2.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/long_caption_vis3.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/longtail_object_vis.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/short_caption_vis.png filter=lfs diff=lfs merge=lfs -text
+asset/qualitative_visualization/video_object_vis.png filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import os
+import argparse
+from typing import Dict, List
+from gdino import GroundingDINOAPIWrapper, visualize
+import gradio as gr
+import numpy as np
+import cv2
def arg_parse():
    """Parse the command-line options of the Grounding DINO 1.5 Gradio demo.

    Returns:
        argparse.Namespace: Parsed options. ``token`` holds the value passed
            via ``--token`` and is ``None`` when the flag is omitted.
    """
    # The description previously read "T-Rex2" (presumably a copy-paste
    # leftover); this script serves the Grounding DINO 1.5 demo.
    parser = argparse.ArgumentParser(description="Gradio Demo for Grounding DINO 1.5")
    parser.add_argument(
        "--token",
        type=str,
        help="This token is only for gradio space. Please do not take it away for your own purpose!",
    )
    return parser.parse_args()
def resize_image_with_aspect_ratio(image: np.ndarray, min_size: int = 800, max_size: int = 1333) -> np.ndarray:
    """Rescale ``image`` so its shorter side equals ``min_size``, keeping the aspect ratio.

    If that would make the longer side exceed ``max_size``, the longer side is
    pinned to ``max_size`` instead and the shorter side shrinks accordingly.
    Note that an image smaller than ``min_size`` is enlarged, not just shrunk.

    Args:
        image (np.ndarray): Image array whose first two axes are (height, width).
        min_size (int): Target length of the shorter side.
        max_size (int): Upper bound for the longer side.

    Returns:
        np.ndarray: The resized image produced by ``cv2.resize``.
    """
    h, w = image.shape[:2]
    aspect_ratio = w / h
    if h < w:
        # Landscape: height is the shorter side.
        new_height = min_size
        new_width = int(new_height * aspect_ratio)
        if new_width > max_size:
            new_width = max_size
            new_height = int(new_width / aspect_ratio)
    else:
        # Portrait or square: width is the shorter side.
        new_width = min_size
        new_height = int(new_width / aspect_ratio)
        if new_height > max_size:
            new_height = max_size
            new_width = int(new_height * aspect_ratio)
    # int() truncation can yield 0 for extreme aspect ratios (e.g. a 1x2000
    # strip); cv2.resize rejects a zero-sized target, so keep at least 1 pixel.
    new_width = max(new_width, 1)
    new_height = max(new_height, 1)
    return cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
def inference(image, prompt: str, return_mask: bool = False, return_score: bool = False) -> gr.Image:
    """Run detection on ``image`` for the given text ``prompt`` and draw the result.

    Args:
        image: Input image as delivered by the Gradio image component.
        prompt (str): Text prompt forwarded to the detection API.
        return_mask (bool): Also request masks and draw them. When set, the
            image is resized first to save computation.
        return_score (bool): Draw the detection score next to each label.

    Returns:
        The annotated image returned by ``visualize``.
    """
    if return_mask:
        # Resize before requesting masks to save computation.
        image = resize_image_with_aspect_ratio(image, min_size=600, max_size=1000)
    detections = gdino.inference({"image": image, "prompt": prompt}, return_mask=return_mask)
    return visualize(image, detections, return_mask=return_mask, draw_score=return_score)
# The API client is created at module level because ``inference`` looks up
# the global ``gdino`` when Gradio invokes it.
args = arg_parse()
gdino = GroundingDINOAPIWrapper(args.token)

if __name__ == "__main__":
    # (image path, prompt) pairs offered as clickable examples in the UI.
    demo_examples = [
        ['asset/demo.jpg', 'person . pigeon . tree'],
        ['asset/demo2.jpeg', 'wireless walkie-talkie . life jacket . atlantic cod . man . vehicle . accessory . cell phone .'],
        ['asset/demo3.jpeg', 'wine rack . bottle . basket'],
        ['asset/demo4.jpeg', 'Mosque. golden dome. smaller domes. minarets. arched windows. white facade. cars. electrical lines. streetlights. trees. pedestrians. blue sky. shadows'],
        ['asset/demo5.jpeg', 'stately building. columns. sculptures. Spanish flag. clouds. blue sky. street. taxis. van. city bus. traffic lights. street lamps. road markings. pedestrians. sidewalk. traffic sign. palm trees'],
    ]

    with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
        # Top row: input image on the left, annotated output on the right.
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input Image")
            with gr.Column():
                output_image = gr.Image(label="Output Image")

        # Control row: options, text prompt and the trigger button.
        with gr.Row():
            return_mask = gr.Checkbox(label="Return Mask")
            return_score = gr.Checkbox(label="Return Score")
            prompt = gr.Textbox(label="Prompt", placeholder="e.g., person.pigeon.tree")
            run = gr.Button(value="Run")

        with gr.Row():
            gr.Examples(
                examples=demo_examples,
                inputs=[input_image, prompt],
            )

        run.click(
            inference,
            inputs=[input_image, prompt, return_mask, return_score],
            outputs=output_image,
        )

    demo.launch(debug=True)

asset/demo.jpg ADDED Viewed

asset/demo2.jpeg ADDED Viewed

asset/demo3.jpeg ADDED Viewed

asset/demo4.jpeg ADDED Viewed

asset/demo5.jpeg ADDED Viewed

asset/demo_output.jpg ADDED Viewed

asset/gd1.5_overall_framework.png ADDED Viewed

asset/qualitative_visualization/common_object_vis.png ADDED Viewed

Git LFS Details

SHA256: 5df2d7784fbeb1f6fa09605e8fb833976f5346dda5061ea7c37760f2a9758afb
Pointer size: 132 Bytes
Size of remote file: 3.6 MB

asset/qualitative_visualization/common_object_vis2.png ADDED Viewed

Git LFS Details

SHA256: 9484f1a14b862aaa9b8e47e30d95ce90dc580058b44b6fc5489aaf9fa77f3893
Pointer size: 132 Bytes
Size of remote file: 3.36 MB

asset/qualitative_visualization/dense_object_vis.png ADDED Viewed

Git LFS Details

SHA256: 0c98062f92d532ce78bfbd1d2d8704aea236c9eba58732789e18f0a335c688ca
Pointer size: 132 Bytes
Size of remote file: 3.86 MB

asset/qualitative_visualization/dense_object_vis2.png ADDED Viewed

Git LFS Details

SHA256: 517c4db44f7c77ad9abc67ecde55ae7cb64c0868f68e9e378d40e5ae889def9d
Pointer size: 132 Bytes
Size of remote file: 3.5 MB

asset/qualitative_visualization/edge_vis.png ADDED Viewed

Git LFS Details

SHA256: d03f4bfe1891dc00cf4a0045b05a5e59884b4552502a25952cc9899e739463c3
Pointer size: 132 Bytes
Size of remote file: 3.51 MB

asset/qualitative_visualization/long_caption_vis.png ADDED Viewed

Git LFS Details

SHA256: 8a33698073271516337099fdafcfdeb187c52e94365d9728e72f11051e4c299a
Pointer size: 132 Bytes
Size of remote file: 2.32 MB

asset/qualitative_visualization/long_caption_vis2.png ADDED Viewed

Git LFS Details

SHA256: 32fb2d51e034e58197cb96c77ceb8dd37e88a95c12edd9441c90e02fbc7392fa
Pointer size: 132 Bytes
Size of remote file: 1.78 MB

asset/qualitative_visualization/long_caption_vis3.png ADDED Viewed

Git LFS Details

SHA256: 442896c95cad3c7b2825e63d9c5e5f655fbb2b93a62caf097faa598bfc135220
Pointer size: 132 Bytes
Size of remote file: 2.52 MB

asset/qualitative_visualization/longtail_object_vis.png ADDED Viewed

Git LFS Details

SHA256: ec6440fd9500a8c4aa65d3be0f3412ebfc22127ced696f455388202bf168a8bb
Pointer size: 132 Bytes
Size of remote file: 3.7 MB

asset/qualitative_visualization/short_caption_vis.png ADDED Viewed

Git LFS Details

SHA256: 55622025ae7f29a087e935c08798880feaa2061d5cf5f06db18466973b5452ee
Pointer size: 132 Bytes
Size of remote file: 2.36 MB

asset/qualitative_visualization/video_object_vis.png ADDED Viewed

Git LFS Details

SHA256: 198e045fd0516803263b6634164cda43ca2b752cd8419b2698763565745894d5
Pointer size: 132 Bytes
Size of remote file: 2.76 MB

asset/video_cover.jpg ADDED Viewed

asset/zeroshot.png ADDED Viewed

gdino/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .model_wrapper import GroundingDINOAPIWrapper
+from .visualize import visualize
+__all__ = ["GroundingDINOAPIWrapper", "visualize"]

gdino/__pycache__/visualize.cpython-38.pyc ADDED Viewed

Binary file (3.17 kB). View file

gdino/model_wrapper.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import tempfile
+from typing import Dict, List, Union
+import numpy as np
+from dds_cloudapi_sdk import (
+    DetectionTask,
+    Client,
+    Config,
+    TextPrompt,
+    DetectionModel,
+    DetectionTarget,
+)
+from PIL import Image
+import concurrent.futures
class GroundingDINOAPIWrapper:
    """API wrapper for Grounding DINO 1.5

    Args:
        token (str): The token for Grounding DINO 1.5 API. We are now opening free API access to Grounding DINO 1.5. For
            educators, students, and researchers, we offer an API with extensive usage times to
            support your educational and research endeavors. You can get free API token at here:
            https://deepdataspace.com/request_api
    """

    def __init__(self, token: str):
        self.client = Client(Config(token=token))

    def inference(self, prompt: Dict, return_mask: bool = False):
        """Main inference function of Grounding DINO 1.5.

        A single image is processed per call; batch inference is not supported.

        Args:
            prompt (dict): Input with the following keys:
                - "image" (Union[str, np.ndarray]): Path to an image, e.g. "test1.jpg",
                    or the image itself as a numpy array.
                - "prompt" (str): Text prompt separated by '.', e.g. 'cate1 . cate2 . cate3'
            return_mask (bool): Whether to return mask. Defaults to False.

        Returns:
            (Dict): Detection results as produced by :meth:`postprocess`, with keys:
                - "boxes": (List[List[float]]): Boxes in [xmin, ymin, xmax, ymax] format
                - "categorys": (List[str]): Category of each object
                - "scores": (List[float]): Score of each object
                - "masks": (List): Decoded mask of each object (empty unless ``return_mask``)
        """
        # Upload the image first; the detection task refers to it by URL.
        image_url = self.get_image_url(prompt["image"])
        task = DetectionTask(
            image_url=image_url,
            prompts=[TextPrompt(text=prompt["prompt"])],
            targets=[DetectionTarget.Mask, DetectionTarget.BBox] if return_mask else [DetectionTarget.BBox],
            model=DetectionModel.GDino1_5_Pro,
        )
        self.client.run_task(task)
        return self.postprocess(task.result, task, return_mask)

    def postprocess(self, result, task, return_mask):
        """Postprocess the result from the API call

        Args:
            result (TaskResult): Task result with the following keys:
                - objects (List[DetectionObject]): Each DetectionObject has the following keys:
                    - bbox (List[float]): Box in xyxy format
                    - category (str): Detection category
                    - score (float): Detection score
                    - mask (DetectionObjectMask): RLE mask, decoded via ``task.rle2rgba``
            task (DetectionTask): The task object
            return_mask (bool): Whether to return mask

        Returns:
            (Dict): Return dict in format:
                {
                    "scores": (List[float]): A list of scores for each object
                    "categorys": (List[str]): A list of categorys for each object
                    "boxes": (List[List[int]]): A list of boxes for each object
                    "masks": (List[PIL.Image]): A list of masks in the format of PIL.Image
                }
            Entries follow the order of ``result.objects``.
        """

        def process_object(obj):
            # Mask decoding is the only non-trivial work done per object.
            mask = task.rle2rgba(obj.mask) if return_mask else None
            return obj.bbox, obj.score, obj.category, mask

        boxes, scores, categorys, masks = [], [], [], []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # ``executor.map`` yields results in input order; collecting with
            # ``as_completed`` made the output order depend on thread timing.
            for box, score, category, mask in executor.map(process_object, result.objects):
                boxes.append(box)
                scores.append(score)
                categorys.append(category)
                if mask is not None:
                    masks.append(mask)
        return dict(boxes=boxes, categorys=categorys, scores=scores, masks=masks)

    def get_image_url(self, image: Union[str, np.ndarray]):
        """Upload Image to server and return the url

        Args:
            image (Union[str, np.ndarray]): The image to upload. Can be a file path or np.ndarray.
                If it is a np.ndarray, it will be saved to a temporary PNG file.

        Returns:
            str: The url of the image, as returned by ``client.upload_file``
        """
        if isinstance(image, str):
            return self.client.upload_file(image)
        # Save into a temporary directory rather than an open NamedTemporaryFile:
        # ``Image.save(path)`` closes the file, so the data is fully on disk
        # before the upload re-opens it by path (an open handle may still hold
        # unflushed bytes and cannot be re-opened on Windows).
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file_path = f"{tmp_dir}/image.png"
            Image.fromarray(image).save(tmp_file_path, format="PNG")
            return self.client.upload_file(tmp_file_path)

gdino/version.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ __version__ = 'v1.5'

gdino/visualize.py ADDED Viewed

	@@ -0,0 +1,108 @@

+from typing import Dict
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont, ImageOps
+import random
def draw_mask(mask, draw, random_color=True):
    """Paint every non-zero pixel of ``mask`` through ``draw`` in one translucent color.

    Args:
        mask (np.array): 2-D mask; non-zero entries mark the pixels to paint.
        draw (ImageDraw.Draw): Drawing object that receives one ``point`` call per pixel.
        random_color (bool): Pick a random RGB color when True, otherwise use
            a fixed blue. The alpha value is 153 in both cases.
    """
    if random_color:
        fill_color = tuple(random.randint(0, 255) for _ in range(3)) + (153,)
    else:
        fill_color = (30, 144, 255, 153)
    # argwhere yields (row, col) pairs; reverse each to the (x, y) order
    # that ``draw.point`` is called with.
    for pixel in np.argwhere(mask):
        draw.point(pixel[::-1], fill=fill_color)
def visualize(image_pil: Image,
              result: Dict,
              draw_width: float = 6.0,
              return_mask=True,
              draw_score=True) -> Image:
    """Plot bounding boxes, labels and (optionally) masks on an image.

    Args:
        image_pil (PIL.Image or np.ndarray): The input image. A numpy array is
            converted to a PIL image first; a PIL image is drawn on in place.
        result (Dict): Detection results. The keys are:
                - boxes (List[List[float]]): Bounding boxes in [x1, y1, x2, y2] format.
                - scores (List[float]): A list of scores for each bounding box.
                - categorys (List[str]): A list of categorys for each object
                - masks (List[PIL.Image], optional): A list of masks in the format of PIL.Image
        draw_width (float): Outline width of the boxes, truncated to an int.
        return_mask (bool): Overlay the masks when any are provided. Defaults to True.
        draw_score (bool): Draw score on the image. Defaults to True.

    Returns:
        PIL.Image: The input image with plotted bounding boxes, labels, and masks.
    """
    boxes = result["boxes"]
    scores = result["scores"]
    categorys = result["categorys"]
    masks = result.get("masks", [])

    # One random color per category so that all boxes of a class match.
    cate2color = {
        cate: tuple(np.random.randint(0, 255, size=3).tolist())
        for cate in set(categorys)
    }

    if isinstance(image_pil, np.ndarray):
        image_pil = Image.fromarray(image_pil)
    draw = ImageDraw.Draw(image_pil)

    # The font does not depend on the box, so load it once.
    font = ImageFont.load_default()

    for box, score, category in zip(boxes, scores, categorys):
        x0, y0, x1, y1 = (int(v) for v in box)
        color = cate2color[category]
        draw.rectangle([x0, y0, x1, y1], outline=color, width=int(draw_width))

        text = f"{category} {score:.2f}" if draw_score else f"{category}"
        # Fonts without ``getbbox`` fall back to the older ``textsize`` API.
        if hasattr(font, "getbbox"):
            bbox = draw.textbbox((x0, y0), text, font)
        else:
            w, h = draw.textsize(text, font)
            bbox = (x0, y0, w + x0, y0 + h)
        # Filled background in the category color keeps the white label readable.
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), text, fill="white", font=font)

    if len(masks) > 0 and return_mask:
        # Paint all masks on a transparent layer, then blend it over the image.
        mask_image = Image.new("RGBA", image_pil.size, color=(0, 0, 0, 0))
        mask_draw = ImageDraw.Draw(mask_image)
        for mask in masks:
            # The last channel of each mask decides which pixels get painted.
            draw_mask(np.array(mask)[:, :, -1], mask_draw)
        image_pil = Image.alpha_composite(image_pil.convert("RGBA"), mask_image).convert("RGB")
    return image_pil

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ dds-cloudapi-sdk==0.2.1
2	+ gradio==4.22.0

setup.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import glob
+import os
+import subprocess
+import torch
+from setuptools import find_packages, setup
+from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
# Package metadata shared by the helpers below.
version = "v1.5"
package_name = "gdino"
cwd = os.path.dirname(os.path.abspath(__file__))

# Best-effort lookup of the current git commit; any failure (no git binary,
# not a checkout, ...) leaves the placeholder value.
try:
    sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd).decode("ascii").strip()
except Exception:
    sha = "Unknown"
def write_version_file():
    """Write ``__version__`` into ``gdino/version.py`` next to this script.

    Only the version string is written; the git sha is not recorded.
    """
    target = os.path.join(cwd, "gdino/", "version.py")
    with open(target, "w") as handle:
        handle.write(f"__version__ = '{version}'\n")
def parse_requirements(fname="requirements.txt", with_version=True):
    """Parse the package dependencies listed in a requirements file.

    Blank lines and lines starting with ``#`` are skipped. ``-r other.txt``
    lines are followed, ``-e ...#egg=name`` lines contribute ``name`` and
    lines containing ``@git+`` are passed through unchanged.

    Args:
        fname (str): path to requirements file. A missing file yields an
            empty list.
        with_version (bool, default=True): if True include version specs,
            otherwise return bare package names.

    Returns:
        List[str]: list of requirements items, e.g. ``["gradio==4.22.0"]``.
            An environment marker, when present, is appended after ``;``.

    CommandLine:
        python -c "import setup; print(setup.parse_requirements())"
    """
    import re
    from os.path import exists

    require_fpath = fname
    # Longer operators are listed first so the alternation never stops at a
    # prefix (``>`` inside ``>=``, ``==`` inside ``===``).
    version_pat = re.compile(
        "(" + "|".join(["===", "==", ">=", "<=", "~=", "!=", ">", "<"]) + ")")

    def parse_line(line):
        """Parse information from a line in a requirements text file."""
        if line.startswith("-r "):
            # Allow specifying requirements in other files
            target = line.split(" ")[1]
            yield from parse_require_file(target)
            return
        info = {"line": line}
        if line.startswith("-e "):
            info["package"] = line.split("#egg=")[1]
        elif "@git+" in line:
            info["package"] = line
        else:
            # Split off the environment marker first so that comparison
            # operators inside it (``python_version>'3.7'``) are never
            # mistaken for the package's version specifier.
            spec, _, marker = line.partition(";")
            parts = [p.strip() for p in version_pat.split(spec, maxsplit=1)]
            info["package"] = parts[0]
            if len(parts) > 1:
                op, version = parts[1:]
                info["version"] = (op, version)
            marker = marker.strip()
            if marker:
                # Handle platform specific dependencies
                # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies
                info["platform_deps"] = marker
        yield info

    def parse_require_file(fpath):
        with open(fpath, "r") as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith("#"):
                    yield from parse_line(line)

    def gen_packages_items():
        if not exists(require_fpath):
            return
        for info in parse_require_file(require_fpath):
            parts = [info["package"]]
            if with_version and "version" in info:
                parts.extend(info["version"])
            platform_deps = info.get("platform_deps")
            if platform_deps is not None:
                parts.append(";" + platform_deps)
            yield "".join(parts)

    return list(gen_packages_items())
if __name__ == "__main__":
    print(f"Building wheel {package_name}-{version}")

    # The complete LICENSE text is forwarded to setup() as the license field.
    with open("LICENSE", "r", encoding="utf-8") as license_file:
        license_text = license_file.read()

    # Regenerate gdino/version.py before packaging.
    write_version_file()

    setup_kwargs = dict(
        name="gdino",
        version="v1.5",
        author="International Digital Economy Academy, CVR",
        url="https://github.com/IDEA-Research/Grounding-DINO-1.5-API",
        description="Grounding DINO 1.5 API wrapper.",
        license=license_text,
        install_requires=parse_requirements("requirements.txt"),
        packages=find_packages(exclude=("tests", )),
        ext_modules=None,
        cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
    )
    setup(**setup_kwargs)