Tobias Cornille committed on
Commit d24572a · 1 parent: d688b67

Update GroundingDINO

Files changed (37)
  1. GroundingDINO/LICENSE +1 -1
  2. GroundingDINO/README.md +245 -41
  3. GroundingDINO/groundingdino/config/{GroundingDINO_SwinB.cfg.py → GroundingDINO_SwinB_cfg.py} +0 -0
  4. GroundingDINO/groundingdino/config/__init__.py +0 -0
  5. GroundingDINO/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  6. GroundingDINO/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc +0 -0
  7. GroundingDINO/groundingdino/datasets/cocogrounding_eval.py +269 -0
  8. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc +0 -0
  9. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc +0 -0
  10. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc +0 -0
  11. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc +0 -0
  12. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc +0 -0
  13. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc +0 -0
  14. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc +0 -0
  15. GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc +0 -0
  16. GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc +0 -0
  17. GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc +0 -0
  18. GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-310.pyc +0 -0
  19. GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-310.pyc +0 -0
  20. GroundingDINO/groundingdino/models/GroundingDINO/groundingdino.py +25 -8
  21. GroundingDINO/groundingdino/models/__pycache__/__init__.cpython-310.pyc +0 -0
  22. GroundingDINO/groundingdino/models/__pycache__/registry.cpython-310.pyc +0 -0
  23. GroundingDINO/groundingdino/util/__pycache__/__init__.cpython-310.pyc +0 -0
  24. GroundingDINO/groundingdino/util/__pycache__/box_ops.cpython-310.pyc +0 -0
  25. GroundingDINO/groundingdino/util/__pycache__/get_tokenlizer.cpython-310.pyc +0 -0
  26. GroundingDINO/groundingdino/util/__pycache__/inference.cpython-310.pyc +0 -0
  27. GroundingDINO/groundingdino/util/__pycache__/misc.cpython-310.pyc +0 -0
  28. GroundingDINO/groundingdino/util/__pycache__/slconfig.cpython-310.pyc +0 -0
  29. GroundingDINO/groundingdino/util/__pycache__/utils.cpython-310.pyc +0 -0
  30. GroundingDINO/groundingdino/util/__pycache__/visualizer.cpython-310.pyc +0 -0
  31. GroundingDINO/groundingdino/util/__pycache__/vl_utils.cpython-310.pyc +0 -0
  32. GroundingDINO/groundingdino/util/get_tokenlizer.py +5 -2
  33. GroundingDINO/groundingdino/util/inference.py +180 -7
  34. GroundingDINO/groundingdino/util/slconfig.py +2 -2
  35. GroundingDINO/groundingdino/util/utils.py +3 -1
  36. GroundingDINO/requirements.txt +2 -2
  37. GroundingDINO/setup.py +13 -1
GroundingDINO/LICENSE CHANGED
@@ -186,7 +186,7 @@
186
  same "printed page" as the copyright notice for easier
187
  identification within third-party archives.
188
 
189
- Copyright 2020 - present, Facebook, Inc
190
 
191
  Licensed under the Apache License, Version 2.0 (the "License");
192
  you may not use this file except in compliance with the License.
 
186
  same "printed page" as the copyright notice for easier
187
  identification within third-party archives.
188
 
189
+ Copyright 2023 - present, IDEA Research.
190
 
191
  Licensed under the Apache License, Version 2.0 (the "License");
192
  you may not use this file except in compliance with the License.
GroundingDINO/README.md CHANGED
@@ -1,78 +1,269 @@
1
- # Grounding DINO
 
 
2
 
3
- ---
4
5
  [![arXiv](https://img.shields.io/badge/arXiv-2303.05499-b31b1b.svg)](https://arxiv.org/abs/2303.05499)
6
- [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/wxWDt5UiwY8)
7
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb)
8
- [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/cMa77r3YrDk)
 
 
9
  [![HuggingFace space](https://img.shields.io/badge/🤗-HuggingFace%20Space-cyan.svg)](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-mscoco)](https://paperswithcode.com/sota/zero-shot-object-detection-on-mscoco?p=grounding-dino-marrying-dino-with-grounded) \
12
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-odinw)](https://paperswithcode.com/sota/zero-shot-object-detection-on-odinw?p=grounding-dino-marrying-dino-with-grounded) \
13
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco-minival)](https://paperswithcode.com/sota/object-detection-on-coco-minival?p=grounding-dino-marrying-dino-with-grounded) \
14
- [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco)](https://paperswithcode.com/sota/object-detection-on-coco?p=grounding-dino-marrying-dino-with-grounded)
15
 
16
 
17
 
18
- Official PyTorch implementation of [Grounding DINO](https://arxiv.org/abs/2303.05499), a stronger open-set object detector. Code is available now!
19
 
20
 
21
- ## Highlight
22
 
23
  - **Open-Set Detection.** Detect **everything** with language!
24
- - **High Performancce.** COCO zero-shot **52.5 AP** (training without COCO data!). COCO fine-tune **63.0 AP**.
25
  - **Flexible.** Collaboration with Stable Diffusion for Image Editing.
26
 
27
- ## News
28
- [2023/03/28] A YouTube [video](https://youtu.be/cMa77r3YrDk) about Grounding DINO and basic object detection prompt engineering. [[SkalskiP](https://github.com/SkalskiP)] \
29
- [2023/03/28] Add a [demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo) on Hugging Face Space! \
30
- [2023/03/27] Support CPU-only mode. Now the model can run on machines without GPUs.\
31
- [2023/03/25] A [demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) for Grounding DINO is available at Colab. [[SkalskiP](https://github.com/SkalskiP)] \
32
- [2023/03/22] Code is available Now!
 
 
 
 
 
 
 
 
 
33
 
34
  <details open>
35
  <summary><font size="4">
36
  Description
37
  </font></summary>
 
38
  <img src=".asset/hero_figure.png" alt="ODinW" width="100%">
 
 
39
  </details>
40
 
 
 
 
 
 
 
 
 
 
 
41
 
42
-
43
- ## TODO
44
 
45
  - [x] Release inference code and demo.
46
  - [x] Release checkpoints.
47
- - [ ] Grounding DINO with Stable Diffusion and GLIGEN demos.
48
  - [ ] Release training codes.
49
 
50
- ## Install
51
 
52
- If you have a CUDA environment, please make sure the environment variable `CUDA_HOME` is set. It will be compiled under CPU-only mode if no CUDA available.
53
 
54
  ```bash
55
  pip install -e .
56
  ```
57
 
58
- ## Demo
59
 
 
60
  ```bash
61
- CUDA_VISIBLE_DEVICES=6 python demo/inference_on_a_image.py \
62
- -c /path/to/config \
63
- -p /path/to/checkpoint \
64
- -i .asset/cats.png \
65
- -o "outputs/0" \
66
- -t "cat ear." \
67
- [--cpu-only] # open it for cpu mode
 
68
  ```
 
 
69
  See the `demo/inference_on_a_image.py` for more details.
70
71
  **Web UI**
72
 
73
  We also provide a demo code to integrate Grounding DINO with Gradio Web UI. See the file `demo/gradio_app.py` for more details.
74
 
75
- ## Checkpoints
76
 
77
  <!-- insert a table -->
78
  <table>
@@ -94,13 +285,22 @@ We also provide a demo code to integrate Grounding DINO with Gradio Web UI. See
94
  <td>Swin-T</td>
95
  <td>O365,GoldG,Cap4M</td>
96
  <td>48.4 (zero-shot) / 57.2 (fine-tune)</td>
97
- <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth">Github link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth">HF link</a></td>
98
  <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinT_OGC.py">link</a></td>
99
  </tr>
 
 
 
 
 
 
 
 
 
100
  </tbody>
101
  </table>
102
 
103
- ## Results
104
 
105
  <details open>
106
  <summary><font size="4">
@@ -120,24 +320,27 @@ ODinW Object Detection Results
120
  <summary><font size="4">
121
  Marrying Grounding DINO with <a href="https://github.com/Stability-AI/StableDiffusion">Stable Diffusion</a> for Image Editing
122
  </font></summary>
 
123
  <img src=".asset/GD_SD.png" alt="GD_SD" width="100%">
124
  </details>
125
 
 
126
  <details open>
127
  <summary><font size="4">
128
- Marrying Grounding DINO with <a href="https://github.com/gligen/GLIGEN">GLIGEN</a> for more Detailed Image Editing
129
  </font></summary>
 
130
  <img src=".asset/GD_GLIGEN.png" alt="GD_GLIGEN" width="100%">
131
  </details>
132
 
133
- ## Model
134
 
135
  Includes: a text backbone, an image backbone, a feature enhancer, a language-guided query selection, and a cross-modality decoder.
136
 
137
  ![arch](.asset/arch.png)
138
 
139
 
140
- ## Acknowledgement
141
 
142
  Our model is related to [DINO](https://github.com/IDEA-Research/DINO) and [GLIP](https://github.com/microsoft/GLIP). Thanks for their great work!
143
 
@@ -146,14 +349,15 @@ We also thank great previous work including DETR, Deformable DETR, SMCA, Conditi
146
  Thanks [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) and [GLIGEN](https://github.com/gligen/GLIGEN) for their awesome models.
147
 
148
 
149
- ## Citation
150
 
151
  If you find our work helpful for your research, please consider citing the following BibTeX entry.
152
 
153
  ```bibtex
154
- @inproceedings{ShilongLiu2023GroundingDM,
155
- title={Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection},
156
- author={Shilong Liu and Zhaoyang Zeng and Tianhe Ren and Feng Li and Hao Zhang and Jie Yang and Chunyuan Li and Jianwei Yang and Hang Su and Jun Zhu and Lei Zhang},
 
157
  year={2023}
158
  }
159
  ```
 
1
+ <div align="center">
2
+ <img src="./.asset/grounding_dino_logo.png" width="30%">
3
+ </div>
4
 
5
+ # :sauropod: Grounding DINO
6
 
7
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-mscoco)](https://paperswithcode.com/sota/zero-shot-object-detection-on-mscoco?p=grounding-dino-marrying-dino-with-grounded) [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-odinw)](https://paperswithcode.com/sota/zero-shot-object-detection-on-odinw?p=grounding-dino-marrying-dino-with-grounded) \
8
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco-minival)](https://paperswithcode.com/sota/object-detection-on-coco-minival?p=grounding-dino-marrying-dino-with-grounded) [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco)](https://paperswithcode.com/sota/object-detection-on-coco?p=grounding-dino-marrying-dino-with-grounded)
9
+
10
+
11
+ **[IDEA-CVR, IDEA-Research](https://github.com/IDEA-Research)**
12
+
13
+ [Shilong Liu](http://www.lsl.zone/), [Zhaoyang Zeng](https://scholar.google.com/citations?user=U_cvvUwAAAAJ&hl=zh-CN&oi=ao), [Tianhe Ren](https://rentainhe.github.io/), [Feng Li](https://scholar.google.com/citations?user=ybRe9GcAAAAJ&hl=zh-CN), [Hao Zhang](https://scholar.google.com/citations?user=B8hPxMQAAAAJ&hl=zh-CN), [Jie Yang](https://github.com/yangjie-cv), [Chunyuan Li](https://scholar.google.com/citations?user=Zd7WmXUAAAAJ&hl=zh-CN&oi=ao), [Jianwei Yang](https://jwyang.github.io/), [Hang Su](https://scholar.google.com/citations?hl=en&user=dxN1_X0AAAAJ&view_op=list_works&sortby=pubdate), [Jun Zhu](https://scholar.google.com/citations?hl=en&user=axsP38wAAAAJ), [Lei Zhang](https://www.leizhang.org/)<sup>:email:</sup>.
14
+
15
+
16
+ [[`Paper`](https://arxiv.org/abs/2303.05499)] [[`Demo`](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)] [[`BibTex`](#black_nib-citation)]
17
+
18
+
19
+ PyTorch implementation and pretrained models for Grounding DINO. For details, see the paper **[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)**.
20
+
21
+ ## :sun_with_face: Helpful Tutorial
22
+
23
+ - :grapes: [[Read our arXiv Paper](https://arxiv.org/abs/2303.05499)]
24
+ - :apple: [[Watch our simple introduction video on YouTube](https://youtu.be/wxWDt5UiwY8)]
25
+ - :blossom: &nbsp;[[Try the Colab Demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb)]
26
+ - :sunflower: [[Try our Official Huggingface Demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)]
27
+ - :maple_leaf: [[Watch the Step by Step Tutorial about GroundingDINO by Roboflow AI](https://youtu.be/cMa77r3YrDk)]
28
+ - :mushroom: [[GroundingDINO: Automated Dataset Annotation and Evaluation by Roboflow AI](https://youtu.be/C4NqaRBz_Kw)]
29
+ - :hibiscus: [[Accelerate Image Annotation with SAM and GroundingDINO by Roboflow AI](https://youtu.be/oEQYStnF2l8)]
30
+ - :white_flower: [[Autodistill: Train YOLOv8 with ZERO Annotations based on Grounding-DINO and Grounded-SAM by Roboflow AI](https://github.com/autodistill/autodistill)]
31
+
32
+ <!-- Grounding DINO Methods |
33
  [![arXiv](https://img.shields.io/badge/arXiv-2303.05499-b31b1b.svg)](https://arxiv.org/abs/2303.05499)
34
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/wxWDt5UiwY8) -->
35
+
36
+ <!-- Grounding DINO Demos |
37
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) -->
38
+ <!-- [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/cMa77r3YrDk)
39
  [![HuggingFace space](https://img.shields.io/badge/🤗-HuggingFace%20Space-cyan.svg)](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)
40
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/oEQYStnF2l8)
41
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/C4NqaRBz_Kw) -->
42
+
43
+ ## :sparkles: Highlight Projects
44
+
45
+ - [Semantic-SAM: a universal image segmentation model to enable segment and recognize anything at any desired granularity.](https://github.com/UX-Decoder/Semantic-SAM),
46
+ - [DetGPT: Detect What You Need via Reasoning](https://github.com/OptimalScale/DetGPT)
47
+ - [Grounded-SAM: Marrying Grounding DINO with Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything)
48
+ - [Grounding DINO with Stable Diffusion](demo/image_editing_with_groundingdino_stablediffusion.ipynb)
49
+ - [Grounding DINO with GLIGEN for Controllable Image Editing](demo/image_editing_with_groundingdino_gligen.ipynb)
50
+ - [OpenSeeD: A Simple and Strong Openset Segmentation Model](https://github.com/IDEA-Research/OpenSeeD)
51
+ - [SEEM: Segment Everything Everywhere All at Once](https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once)
52
+ - [X-GPT: Conversational Visual Agent supported by X-Decoder](https://github.com/microsoft/X-Decoder/tree/xgpt)
53
+ - [GLIGEN: Open-Set Grounded Text-to-Image Generation](https://github.com/gligen/GLIGEN)
54
+ - [LLaVA: Large Language and Vision Assistant](https://github.com/haotian-liu/LLaVA)
55
 
56
+ <!-- Extensions | [Grounding DINO with Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything); [Grounding DINO with Stable Diffusion](demo/image_editing_with_groundingdino_stablediffusion.ipynb); [Grounding DINO with GLIGEN](demo/image_editing_with_groundingdino_gligen.ipynb) -->
 
 
 
57
 
58
 
59
 
60
+ <!-- Official PyTorch implementation of [Grounding DINO](https://arxiv.org/abs/2303.05499), a stronger open-set object detector. Code is available now! -->
61
 
62
 
63
+ ## :bulb: Highlight
64
 
65
  - **Open-Set Detection.** Detect **everything** with language!
66
+ - **High Performance.** COCO zero-shot **52.5 AP** (training without COCO data!). COCO fine-tune **63.0 AP**.
67
  - **Flexible.** Collaboration with Stable Diffusion for Image Editing.
68
 
69
+
70
+
71
+
72
+ ## :fire: News
73
+ - **`2023/07/18`**: We release [Semantic-SAM](https://github.com/UX-Decoder/Semantic-SAM), a universal image segmentation model to enable segment and recognize anything at any desired granularity. **Code** and **checkpoint** are available!
74
+ - **`2023/06/17`**: We provide an example to evaluate Grounding DINO on COCO zero-shot performance.
75
+ - **`2023/04/15`**: Refer to [CV in the Wild Readings](https://github.com/Computer-Vision-in-the-Wild/CVinW_Readings) for those who are interested in open-set recognition!
76
+ - **`2023/04/08`**: We release [demos](demo/image_editing_with_groundingdino_gligen.ipynb) to combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [GLIGEN](https://github.com/gligen/GLIGEN) for more controllable image editing.
77
+ - **`2023/04/08`**: We release [demos](demo/image_editing_with_groundingdino_stablediffusion.ipynb) to combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) for image editing.
78
+ - **`2023/04/06`**: We build a new demo by marrying GroundingDINO with [Segment-Anything](https://github.com/facebookresearch/segment-anything), named **[Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything)**, which aims to support segmentation in GroundingDINO.
79
+ - **`2023/03/28`**: A YouTube [video](https://youtu.be/cMa77r3YrDk) about Grounding DINO and basic object detection prompt engineering. [[SkalskiP](https://github.com/SkalskiP)]
80
+ - **`2023/03/28`**: Add a [demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo) on Hugging Face Space!
81
+ - **`2023/03/27`**: Support CPU-only mode. Now the model can run on machines without GPUs.
82
+ - **`2023/03/25`**: A [demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) for Grounding DINO is available at Colab. [[SkalskiP](https://github.com/SkalskiP)]
83
+ - **`2023/03/22`**: Code is available Now!
84
 
85
  <details open>
86
  <summary><font size="4">
87
  Description
88
  </font></summary>
89
+ <a href="https://arxiv.org/abs/2303.05499">Paper</a> introduction.
90
  <img src=".asset/hero_figure.png" alt="ODinW" width="100%">
91
+ Marrying <a href="https://github.com/IDEA-Research/GroundingDINO">Grounding DINO</a> and <a href="https://github.com/gligen/GLIGEN">GLIGEN</a>
92
+ <img src="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/GD_GLIGEN.png" alt="gd_gligen" width="100%">
93
  </details>
94
 
95
+ ## :star: Explanations/Tips for Grounding DINO Inputs and Outputs
96
+ - Grounding DINO accepts an `(image, text)` pair as inputs.
97
+ - It outputs `900` (by default) object boxes. Each box has similarity scores across all input words (as shown in the figures below).
98
+ - By default, we keep the boxes whose highest similarity score exceeds the `box_threshold` (see the sketch after the figures below).
99
+ - We extract the words whose similarities are higher than the `text_threshold` as predicted labels.
100
+ - If you want to obtain objects for specific phrases, such as the `dogs` in the sentence `two dogs with a stick.`, you can select the boxes with the highest text similarity to `dogs` as the final outputs.
101
+ - Note that each word can be split into **more than one** token, depending on the tokenizer. The number of words in a sentence therefore may not equal the number of text tokens.
102
+ - We suggest separating different category names with `.` for Grounding DINO.
103
+ ![model_explain1](.asset/model_explan1.PNG)
104
+ ![model_explain2](.asset/model_explan2.PNG)
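Below is a minimal, hedged sketch (plain PyTorch, not the repository API; tensor shapes and threshold values are only illustrative) of how the two thresholds described above act on the per-box, per-token similarity scores.

```python
import torch

def filter_outputs(boxes, logits, box_threshold=0.35, text_threshold=0.25):
    # logits: (num_boxes, num_text_tokens) similarity of every box to every text token
    keep = logits.max(dim=1).values > box_threshold   # box-level filtering
    boxes, logits = boxes[keep], logits[keep]
    token_masks = logits > text_threshold             # tokens above text_threshold form each label
    return boxes, logits.max(dim=1).values, token_masks

boxes = torch.rand(900, 4)     # 900 predicted boxes in normalized cxcywh format
logits = torch.rand(900, 12)   # toy similarity scores for a 12-token prompt
boxes, scores, token_masks = filter_outputs(boxes, logits)
```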
105
 
106
+ ## :label: TODO
 
107
 
108
  - [x] Release inference code and demo.
109
  - [x] Release checkpoints.
110
+ - [x] Grounding DINO with Stable Diffusion and GLIGEN demos.
111
  - [ ] Release training codes.
112
 
113
+ ## :hammer_and_wrench: Install
114
+
115
+ **Note:**
116
+
117
+ 0. If you have a CUDA environment, please make sure the environment variable `CUDA_HOME` is set. The package will be compiled in CPU-only mode if CUDA is not available.
118
+
119
+ Please make sure you follow the installation steps strictly; otherwise the program may raise:
120
+ ```bash
121
+ NameError: name '_C' is not defined
122
+ ```
123
+
124
+ If this happens, please reinstall GroundingDINO by re-cloning the repository and running all the installation steps again.
125
+
126
+ #### How to check CUDA:
127
+ ```bash
128
+ echo $CUDA_HOME
129
+ ```
130
+ If it prints nothing, it means you have not set up the path.
131
+
132
+ Run this to set the environment variable in the current shell:
133
+ ```bash
134
+ export CUDA_HOME=/path/to/cuda-11.3
135
+ ```
136
+
137
+ Note that the CUDA version should match your CUDA runtime, since multiple versions of CUDA may be installed at the same time.
138
+
139
+ If you want to set `CUDA_HOME` permanently, store it using:
140
+
141
+ ```bash
142
+ echo 'export CUDA_HOME=/path/to/cuda' >> ~/.bashrc
143
+ ```
144
+ After that, source the `.bashrc` file and check `CUDA_HOME`:
145
+ ```bash
146
+ source ~/.bashrc
147
+ echo $CUDA_HOME
148
+ ```
149
+
150
+ In this example, `/path/to/cuda-11.3` should be replaced with the path where your CUDA toolkit is installed. You can find it by running **which nvcc** in your terminal:
151
+
152
+ For instance,
153
+ if the output is /usr/local/cuda/bin/nvcc, then:
154
+ ```bash
155
+ export CUDA_HOME=/usr/local/cuda
156
+ ```
157
+ **Installation:**
158
+
159
+ 1. Clone the GroundingDINO repository from GitHub.
160
+
161
+ ```bash
162
+ git clone https://github.com/IDEA-Research/GroundingDINO.git
163
+ ```
164
+
165
+ 2. Change the current directory to the GroundingDINO folder.
166
+
167
+ ```bash
168
+ cd GroundingDINO/
169
+ ```
170
 
171
+ 3. Install the required dependencies in the current directory.
172
 
173
  ```bash
174
  pip install -e .
175
  ```
176
 
177
+ 4. Download pre-trained model weights.
178
+
179
+ ```bash
180
+ mkdir weights
181
+ cd weights
182
+ wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
183
+ cd ..
184
+ ```
185
+
186
+ ## :arrow_forward: Demo
187
+ Check your GPU ID (only if you're using a GPU)
188
+
189
+ ```bash
190
+ nvidia-smi
191
+ ```
192
+ Replace `{GPU ID}`, `image_you_want_to_detect.jpg`, and `"dir you want to save the output"` with appropriate values in the following command:
193
+ ```bash
194
+ CUDA_VISIBLE_DEVICES={GPU ID} python demo/inference_on_a_image.py \
195
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
196
+ -p weights/groundingdino_swint_ogc.pth \
197
+ -i image_you_want_to_detect.jpg \
198
+ -o "dir you want to save the output" \
199
+ -t "chair"
200
+ [--cpu-only] # open it for cpu mode
201
+ ```
202
 
203
+ If you would like to specify the phrases to detect, here is a demo:
204
  ```bash
205
+ CUDA_VISIBLE_DEVICES={GPU ID} python demo/inference_on_a_image.py \
206
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
207
+ -p ./groundingdino_swint_ogc.pth \
208
+ -i .asset/cat_dog.jpeg \
209
+ -o logs/1111 \
210
+ -t "There is a cat and a dog in the image ." \
211
+ --token_spans "[[[9, 10], [11, 14]], [[19, 20], [21, 24]]]"
212
+ [--cpu-only] # open it for cpu mode
213
  ```
214
+ The token_spans specify the start and end positions of phrases. For example, the first phrase is `[[9, 10], [11, 14]]`. `"There is a cat and a dog in the image ."[9:10] = 'a'`, `"There is a cat and a dog in the image ."[11:14] = 'cat'`. Hence it refers to the phrase `a cat`. Similarly, `[[19, 20], [21, 24]]` refers to the phrase `a dog`.
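As a quick sanity check (plain Python, independent of the repository), the spans above can be verified by slicing the prompt string directly:

```python
caption = "There is a cat and a dog in the image ."
token_spans = [[[9, 10], [11, 14]], [[19, 20], [21, 24]]]
for spans in token_spans:
    # each phrase is the concatenation of its character spans
    print(" ".join(caption[start:end] for start, end in spans))
# prints "a cat", then "a dog"
```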
215
+
216
  See the `demo/inference_on_a_image.py` for more details.
217
 
218
+ **Running with Python:**
219
+
220
+ ```python
221
+ from groundingdino.util.inference import load_model, load_image, predict, annotate
222
+ import cv2
223
+
224
+ model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
225
+ IMAGE_PATH = "weights/dog-3.jpeg"
226
+ TEXT_PROMPT = "chair . person . dog ."
227
+ BOX_TRESHOLD = 0.35
228
+ TEXT_TRESHOLD = 0.25
229
+
230
+ image_source, image = load_image(IMAGE_PATH)
231
+
232
+ boxes, logits, phrases = predict(
233
+ model=model,
234
+ image=image,
235
+ caption=TEXT_PROMPT,
236
+ box_threshold=BOX_TRESHOLD,
237
+ text_threshold=TEXT_TRESHOLD
238
+ )
239
+
240
+ annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
241
+ cv2.imwrite("annotated_image.jpg", annotated_frame)
242
+ ```
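As an optional follow-up to the snippet above (it reuses `image_source` and `boxes` from it), the normalized `cxcywh` boxes returned by `predict` can be converted to pixel `xyxy` coordinates the same way `annotate` does internally:

```python
import torch
from torchvision.ops import box_convert

h, w, _ = image_source.shape
xyxy = box_convert(boxes=boxes * torch.Tensor([w, h, w, h]),
                   in_fmt="cxcywh", out_fmt="xyxy").numpy()
```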
243
  **Web UI**
244
 
245
  We also provide a demo code to integrate Grounding DINO with Gradio Web UI. See the file `demo/gradio_app.py` for more details.
246
 
247
+ **Notebooks**
248
+
249
+ - We release [demos](demo/image_editing_with_groundingdino_gligen.ipynb) to combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [GLIGEN](https://github.com/gligen/GLIGEN) for more controllable image editings.
250
+ - We release [demos](demo/image_editing_with_groundingdino_stablediffusion.ipynb) to combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) for image editings.
251
+
252
+ ## COCO Zero-shot Evaluations
253
+
254
+ We provide an example to evaluate Grounding DINO zero-shot performance on COCO. The results should be **48.5**.
255
+
256
+ ```bash
257
+ CUDA_VISIBLE_DEVICES=0 \
258
+ python demo/test_ap_on_coco.py \
259
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
260
+ -p weights/groundingdino_swint_ogc.pth \
261
+ --anno_path /path/to/annoataions/ie/instances_val2017.json \
262
+ --image_dir /path/to/imagedir/ie/val2017
263
+ ```
264
+
265
+
266
+ ## :luggage: Checkpoints
267
 
268
  <!-- insert a table -->
269
  <table>
 
285
  <td>Swin-T</td>
286
  <td>O365,GoldG,Cap4M</td>
287
  <td>48.4 (zero-shot) / 57.2 (fine-tune)</td>
288
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth">GitHub link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth">HF link</a></td>
289
  <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinT_OGC.py">link</a></td>
290
  </tr>
291
+ <tr>
292
+ <th>2</th>
293
+ <td>GroundingDINO-B</td>
294
+ <td>Swin-B</td>
295
+ <td>COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO</td>
296
+ <td>56.7 </td>
297
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth">GitHub link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth">HF link</a>
298
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinB_cfg.py">link</a></td>
299
+ </tr>
300
  </tbody>
301
  </table>
302
 
303
+ ## :medal_military: Results
304
 
305
  <details open>
306
  <summary><font size="4">
 
320
  <summary><font size="4">
321
  Marrying Grounding DINO with <a href="https://github.com/Stability-AI/StableDiffusion">Stable Diffusion</a> for Image Editing
322
  </font></summary>
323
+ See our example <a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/demo/image_editing_with_groundingdino_stablediffusion.ipynb">notebook</a> for more details.
324
  <img src=".asset/GD_SD.png" alt="GD_SD" width="100%">
325
  </details>
326
 
327
+
328
  <details open>
329
  <summary><font size="4">
330
+ Marrying Grounding DINO with <a href="https://github.com/gligen/GLIGEN">GLIGEN</a> for more Detailed Image Editing.
331
  </font></summary>
332
+ See our example <a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/demo/image_editing_with_groundingdino_gligen.ipynb">notebook</a> for more details.
333
  <img src=".asset/GD_GLIGEN.png" alt="GD_GLIGEN" width="100%">
334
  </details>
335
 
336
+ ## :sauropod: Model: Grounding DINO
337
 
338
  Includes: a text backbone, an image backbone, a feature enhancer, a language-guided query selection, and a cross-modality decoder.
339
 
340
  ![arch](.asset/arch.png)
341
 
342
 
343
+ ## :hearts: Acknowledgement
344
 
345
  Our model is related to [DINO](https://github.com/IDEA-Research/DINO) and [GLIP](https://github.com/microsoft/GLIP). Thanks for their great work!
346
 
 
349
  Thanks [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) and [GLIGEN](https://github.com/gligen/GLIGEN) for their awesome models.
350
 
351
 
352
+ ## :black_nib: Citation
353
 
354
  If you find our work helpful for your research, please consider citing the following BibTeX entry.
355
 
356
  ```bibtex
357
+ @article{liu2023grounding,
358
+ title={Grounding dino: Marrying dino with grounded pre-training for open-set object detection},
359
+ author={Liu, Shilong and Zeng, Zhaoyang and Ren, Tianhe and Li, Feng and Zhang, Hao and Yang, Jie and Li, Chunyuan and Yang, Jianwei and Su, Hang and Zhu, Jun and others},
360
+ journal={arXiv preprint arXiv:2303.05499},
361
  year={2023}
362
  }
363
  ```
GroundingDINO/groundingdino/config/{GroundingDINO_SwinB.cfg.py → GroundingDINO_SwinB_cfg.py} RENAMED
File without changes
GroundingDINO/groundingdino/config/__init__.py ADDED
File without changes
GroundingDINO/groundingdino/datasets/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (189 Bytes). View file
 
GroundingDINO/groundingdino/datasets/__pycache__/transforms.cpython-310.pyc ADDED
Binary file (10.1 kB). View file
 
GroundingDINO/groundingdino/datasets/cocogrounding_eval.py ADDED
@@ -0,0 +1,269 @@
1
+ # ------------------------------------------------------------------------
2
+ # Grounding DINO. Modified by Shilong Liu.
3
+ # url: https://github.com/IDEA-Research/GroundingDINO
4
+ # Copyright (c) 2023 IDEA. All Rights Reserved.
5
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
6
+ # ------------------------------------------------------------------------
7
+ # Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
8
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
9
+ """
10
+ COCO evaluator that works in distributed mode.
11
+
12
+ Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py
13
+ The difference is that there is less copy-pasting from pycocotools
14
+ in the end of the file, as python3 can suppress prints with contextlib
15
+ """
16
+ import contextlib
17
+ import copy
18
+ import os
19
+
20
+ import numpy as np
21
+ import pycocotools.mask as mask_util
22
+ import torch
23
+ from pycocotools.coco import COCO
24
+ from pycocotools.cocoeval import COCOeval
25
+
26
+ from groundingdino.util.misc import all_gather
27
+
28
+
29
+ class CocoGroundingEvaluator(object):
30
+ def __init__(self, coco_gt, iou_types, useCats=True):
31
+ assert isinstance(iou_types, (list, tuple))
32
+ coco_gt = copy.deepcopy(coco_gt)
33
+ self.coco_gt = coco_gt
34
+
35
+ self.iou_types = iou_types
36
+ self.coco_eval = {}
37
+ for iou_type in iou_types:
38
+ self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
39
+ self.coco_eval[iou_type].useCats = useCats
40
+
41
+ self.img_ids = []
42
+ self.eval_imgs = {k: [] for k in iou_types}
43
+ self.useCats = useCats
44
+
45
+ def update(self, predictions):
46
+ img_ids = list(np.unique(list(predictions.keys())))
47
+ self.img_ids.extend(img_ids)
48
+
49
+ for iou_type in self.iou_types:
50
+ results = self.prepare(predictions, iou_type)
51
+
52
+ # suppress pycocotools prints
53
+ with open(os.devnull, "w") as devnull:
54
+ with contextlib.redirect_stdout(devnull):
55
+ coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO()
56
+
57
+ coco_eval = self.coco_eval[iou_type]
58
+
59
+ coco_eval.cocoDt = coco_dt
60
+ coco_eval.params.imgIds = list(img_ids)
61
+ coco_eval.params.useCats = self.useCats
62
+ img_ids, eval_imgs = evaluate(coco_eval)
63
+
64
+ self.eval_imgs[iou_type].append(eval_imgs)
65
+
66
+ def synchronize_between_processes(self):
67
+ for iou_type in self.iou_types:
68
+ self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2)
69
+ create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type])
70
+
71
+ def accumulate(self):
72
+ for coco_eval in self.coco_eval.values():
73
+ coco_eval.accumulate()
74
+
75
+ def summarize(self):
76
+ for iou_type, coco_eval in self.coco_eval.items():
77
+ print("IoU metric: {}".format(iou_type))
78
+ coco_eval.summarize()
79
+
80
+ def prepare(self, predictions, iou_type):
81
+ if iou_type == "bbox":
82
+ return self.prepare_for_coco_detection(predictions)
83
+ elif iou_type == "segm":
84
+ return self.prepare_for_coco_segmentation(predictions)
85
+ elif iou_type == "keypoints":
86
+ return self.prepare_for_coco_keypoint(predictions)
87
+ else:
88
+ raise ValueError("Unknown iou type {}".format(iou_type))
89
+
90
+ def prepare_for_coco_detection(self, predictions):
91
+ coco_results = []
92
+ for original_id, prediction in predictions.items():
93
+ if len(prediction) == 0:
94
+ continue
95
+
96
+ boxes = prediction["boxes"]
97
+ boxes = convert_to_xywh(boxes).tolist()
98
+ scores = prediction["scores"].tolist()
99
+ labels = prediction["labels"].tolist()
100
+
101
+ coco_results.extend(
102
+ [
103
+ {
104
+ "image_id": original_id,
105
+ "category_id": labels[k],
106
+ "bbox": box,
107
+ "score": scores[k],
108
+ }
109
+ for k, box in enumerate(boxes)
110
+ ]
111
+ )
112
+ return coco_results
113
+
114
+ def prepare_for_coco_segmentation(self, predictions):
115
+ coco_results = []
116
+ for original_id, prediction in predictions.items():
117
+ if len(prediction) == 0:
118
+ continue
119
+
120
+ scores = prediction["scores"]
121
+ labels = prediction["labels"]
122
+ masks = prediction["masks"]
123
+
124
+ masks = masks > 0.5
125
+
126
+ scores = prediction["scores"].tolist()
127
+ labels = prediction["labels"].tolist()
128
+
129
+ rles = [
130
+ mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0]
131
+ for mask in masks
132
+ ]
133
+ for rle in rles:
134
+ rle["counts"] = rle["counts"].decode("utf-8")
135
+
136
+ coco_results.extend(
137
+ [
138
+ {
139
+ "image_id": original_id,
140
+ "category_id": labels[k],
141
+ "segmentation": rle,
142
+ "score": scores[k],
143
+ }
144
+ for k, rle in enumerate(rles)
145
+ ]
146
+ )
147
+ return coco_results
148
+
149
+ def prepare_for_coco_keypoint(self, predictions):
150
+ coco_results = []
151
+ for original_id, prediction in predictions.items():
152
+ if len(prediction) == 0:
153
+ continue
154
+
155
+ boxes = prediction["boxes"]
156
+ boxes = convert_to_xywh(boxes).tolist()
157
+ scores = prediction["scores"].tolist()
158
+ labels = prediction["labels"].tolist()
159
+ keypoints = prediction["keypoints"]
160
+ keypoints = keypoints.flatten(start_dim=1).tolist()
161
+
162
+ coco_results.extend(
163
+ [
164
+ {
165
+ "image_id": original_id,
166
+ "category_id": labels[k],
167
+ "keypoints": keypoint,
168
+ "score": scores[k],
169
+ }
170
+ for k, keypoint in enumerate(keypoints)
171
+ ]
172
+ )
173
+ return coco_results
174
+
175
+
176
+ def convert_to_xywh(boxes):
177
+ xmin, ymin, xmax, ymax = boxes.unbind(1)
178
+ return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
179
+
180
+
181
+ def merge(img_ids, eval_imgs):
182
+ all_img_ids = all_gather(img_ids)
183
+ all_eval_imgs = all_gather(eval_imgs)
184
+
185
+ merged_img_ids = []
186
+ for p in all_img_ids:
187
+ merged_img_ids.extend(p)
188
+
189
+ merged_eval_imgs = []
190
+ for p in all_eval_imgs:
191
+ merged_eval_imgs.append(p)
192
+
193
+ merged_img_ids = np.array(merged_img_ids)
194
+ merged_eval_imgs = np.concatenate(merged_eval_imgs, 2)
195
+
196
+ # keep only unique (and in sorted order) images
197
+ merged_img_ids, idx = np.unique(merged_img_ids, return_index=True)
198
+ merged_eval_imgs = merged_eval_imgs[..., idx]
199
+
200
+ return merged_img_ids, merged_eval_imgs
201
+
202
+
203
+ def create_common_coco_eval(coco_eval, img_ids, eval_imgs):
204
+ img_ids, eval_imgs = merge(img_ids, eval_imgs)
205
+ img_ids = list(img_ids)
206
+ eval_imgs = list(eval_imgs.flatten())
207
+
208
+ coco_eval.evalImgs = eval_imgs
209
+ coco_eval.params.imgIds = img_ids
210
+ coco_eval._paramsEval = copy.deepcopy(coco_eval.params)
211
+
212
+
213
+ #################################################################
214
+ # From pycocotools, just removed the prints and fixed
215
+ # a Python3 bug about unicode not defined
216
+ #################################################################
217
+
218
+
219
+ def evaluate(self):
220
+ """
221
+ Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
222
+ :return: None
223
+ """
224
+ # tic = time.time()
225
+ # print('Running per image evaluation...')
226
+ p = self.params
227
+ # add backward compatibility if useSegm is specified in params
228
+ if p.useSegm is not None:
229
+ p.iouType = "segm" if p.useSegm == 1 else "bbox"
230
+ print("useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType))
231
+ # print('Evaluate annotation type *{}*'.format(p.iouType))
232
+ p.imgIds = list(np.unique(p.imgIds))
233
+ if p.useCats:
234
+ p.catIds = list(np.unique(p.catIds))
235
+ p.maxDets = sorted(p.maxDets)
236
+ self.params = p
237
+
238
+ self._prepare()
239
+ # loop through images, area range, max detection number
240
+ catIds = p.catIds if p.useCats else [-1]
241
+
242
+ if p.iouType == "segm" or p.iouType == "bbox":
243
+ computeIoU = self.computeIoU
244
+ elif p.iouType == "keypoints":
245
+ computeIoU = self.computeOks
246
+ self.ious = {
247
+ (imgId, catId): computeIoU(imgId, catId)
248
+ for imgId in p.imgIds
249
+ for catId in catIds}
250
+
251
+ evaluateImg = self.evaluateImg
252
+ maxDet = p.maxDets[-1]
253
+ evalImgs = [
254
+ evaluateImg(imgId, catId, areaRng, maxDet)
255
+ for catId in catIds
256
+ for areaRng in p.areaRng
257
+ for imgId in p.imgIds
258
+ ]
259
+ # this is NOT in the pycocotools code, but could be done outside
260
+ evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds))
261
+ self._paramsEval = copy.deepcopy(self.params)
262
+ # toc = time.time()
263
+ # print('DONE (t={:0.2f}s).'.format(toc-tic))
264
+ return p.imgIds, evalImgs
265
+
266
+
267
+ #################################################################
268
+ # end of straight copy from pycocotools, just removing the prints
269
+ #################################################################
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (258 Bytes). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/bertwarper.cpython-310.pyc ADDED
Binary file (7.23 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/fuse_modules.cpython-310.pyc ADDED
Binary file (7.78 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/groundingdino.cpython-310.pyc ADDED
Binary file (11.3 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/ms_deform_attn.cpython-310.pyc ADDED
Binary file (11.8 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (19.3 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/transformer_vanilla.cpython-310.pyc ADDED
Binary file (3.45 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/__pycache__/utils.cpython-310.pyc ADDED
Binary file (9.58 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (257 Bytes). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/backbone.cpython-310.pyc ADDED
Binary file (6.25 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/position_encoding.cpython-310.pyc ADDED
Binary file (5.16 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/backbone/__pycache__/swin_transformer.cpython-310.pyc ADDED
Binary file (20.6 kB). View file
 
GroundingDINO/groundingdino/models/GroundingDINO/groundingdino.py CHANGED
@@ -206,6 +206,21 @@ class GroundingDINO(nn.Module):
206
  nn.init.xavier_uniform_(proj[0].weight, gain=1)
207
  nn.init.constant_(proj[0].bias, 0)
208
209
  def init_ref_points(self, use_num_queries):
210
  self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)
211
 
@@ -228,7 +243,6 @@ class GroundingDINO(nn.Module):
228
  captions = kw["captions"]
229
  else:
230
  captions = [t["caption"] for t in targets]
231
- len(captions)
232
 
233
  # encoder texts
234
  tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to(
@@ -283,14 +297,14 @@ class GroundingDINO(nn.Module):
283
  }
284
 
285
  # import ipdb; ipdb.set_trace()
286
-
287
  if isinstance(samples, (list, torch.Tensor)):
288
  samples = nested_tensor_from_tensor_list(samples)
289
- features, poss = self.backbone(samples)
 
290
 
291
  srcs = []
292
  masks = []
293
- for l, feat in enumerate(features):
294
  src, mask = feat.decompose()
295
  srcs.append(self.input_proj[l](src))
296
  masks.append(mask)
@@ -299,7 +313,7 @@ class GroundingDINO(nn.Module):
299
  _len_srcs = len(srcs)
300
  for l in range(_len_srcs, self.num_feature_levels):
301
  if l == _len_srcs:
302
- src = self.input_proj[l](features[-1].tensors)
303
  else:
304
  src = self.input_proj[l](srcs[-1])
305
  m = samples.mask
@@ -307,11 +321,11 @@ class GroundingDINO(nn.Module):
307
  pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
308
  srcs.append(src)
309
  masks.append(mask)
310
- poss.append(pos_l)
311
 
312
  input_query_bbox = input_query_label = attn_mask = dn_meta = None
313
  hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(
314
- srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict
315
  )
316
 
317
  # deformable-detr-like anchor update
@@ -345,7 +359,9 @@ class GroundingDINO(nn.Module):
345
  # interm_class = self.transformer.enc_out_class_embed(hs_enc[-1], text_dict)
346
  # out['interm_outputs'] = {'pred_logits': interm_class, 'pred_boxes': interm_coord}
347
  # out['interm_outputs_for_matching_pre'] = {'pred_logits': interm_class, 'pred_boxes': init_box_proposal}
348
-
 
 
349
  return out
350
 
351
  @torch.jit.unused
@@ -393,3 +409,4 @@ def build_groundingdino(args):
393
  )
394
 
395
  return model
 
 
206
  nn.init.xavier_uniform_(proj[0].weight, gain=1)
207
  nn.init.constant_(proj[0].bias, 0)
208
 
209
+ def set_image_tensor(self, samples: NestedTensor):
210
+ if isinstance(samples, (list, torch.Tensor)):
211
+ samples = nested_tensor_from_tensor_list(samples)
212
+ self.features, self.poss = self.backbone(samples)
213
+
214
+ def unset_image_tensor(self):
215
+ if hasattr(self, 'features'):
216
+ del self.features
217
+ if hasattr(self,'poss'):
218
+ del self.poss
219
+
220
+ def set_image_features(self, features , poss):
221
+ self.features = features
222
+ self.poss = poss
223
+
224
  def init_ref_points(self, use_num_queries):
225
  self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim)
226
 
 
243
  captions = kw["captions"]
244
  else:
245
  captions = [t["caption"] for t in targets]
 
246
 
247
  # encoder texts
248
  tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to(
 
297
  }
298
 
299
  # import ipdb; ipdb.set_trace()
 
300
  if isinstance(samples, (list, torch.Tensor)):
301
  samples = nested_tensor_from_tensor_list(samples)
302
+ if not hasattr(self, 'features') or not hasattr(self, 'poss'):
303
+ self.set_image_tensor(samples)
304
 
305
  srcs = []
306
  masks = []
307
+ for l, feat in enumerate(self.features):
308
  src, mask = feat.decompose()
309
  srcs.append(self.input_proj[l](src))
310
  masks.append(mask)
 
313
  _len_srcs = len(srcs)
314
  for l in range(_len_srcs, self.num_feature_levels):
315
  if l == _len_srcs:
316
+ src = self.input_proj[l](self.features[-1].tensors)
317
  else:
318
  src = self.input_proj[l](srcs[-1])
319
  m = samples.mask
 
321
  pos_l = self.backbone[1](NestedTensor(src, mask)).to(src.dtype)
322
  srcs.append(src)
323
  masks.append(mask)
324
+ self.poss.append(pos_l)
325
 
326
  input_query_bbox = input_query_label = attn_mask = dn_meta = None
327
  hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer(
328
+ srcs, masks, input_query_bbox, self.poss, input_query_label, attn_mask, text_dict
329
  )
330
 
331
  # deformable-detr-like anchor update
 
359
  # interm_class = self.transformer.enc_out_class_embed(hs_enc[-1], text_dict)
360
  # out['interm_outputs'] = {'pred_logits': interm_class, 'pred_boxes': interm_coord}
361
  # out['interm_outputs_for_matching_pre'] = {'pred_logits': interm_class, 'pred_boxes': init_box_proposal}
362
+ unset_image_tensor = kw.get('unset_image_tensor', True)
363
+ if unset_image_tensor:
364
+ self.unset_image_tensor() ## If necessary
365
  return out
366
 
367
  @torch.jit.unused
 
409
  )
410
 
411
  return model
412
+
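A hedged usage sketch of the caching hooks added above (`set_image_tensor`, `unset_image_tensor`, and the `unset_image_tensor` keyword read in `forward`): the image backbone can be run once and reused for several captions. The config, checkpoint, image path, and captions are placeholders, and the call pattern `model(image[None], captions=[...])` follows `groundingdino.util.inference.predict`.

```python
import torch
from groundingdino.util.inference import load_model, load_image

device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py",
                   "weights/groundingdino_swint_ogc.pth", device=device).to(device)
_, image = load_image(".asset/cat_dog.jpeg")   # placeholder test image
image = image.to(device)

with torch.no_grad():
    model.set_image_tensor(image[None])        # backbone + position encodings run once
    for caption in ["a cat .", "a dog ."]:
        outputs = model(image[None], captions=[caption], unset_image_tensor=False)
    model.unset_image_tensor()                 # drop the cached features when done
```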
GroundingDINO/groundingdino/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (502 Bytes). View file
 
GroundingDINO/groundingdino/models/__pycache__/registry.cpython-310.pyc ADDED
Binary file (2.11 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (187 Bytes). View file
 
GroundingDINO/groundingdino/util/__pycache__/box_ops.cpython-310.pyc ADDED
Binary file (3.85 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/get_tokenlizer.cpython-310.pyc ADDED
Binary file (1.13 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/inference.cpython-310.pyc ADDED
Binary file (8 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/misc.cpython-310.pyc ADDED
Binary file (20.3 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/slconfig.cpython-310.pyc ADDED
Binary file (13.2 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/utils.cpython-310.pyc ADDED
Binary file (19.9 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/visualizer.cpython-310.pyc ADDED
Binary file (7.84 kB). View file
 
GroundingDINO/groundingdino/util/__pycache__/vl_utils.cpython-310.pyc ADDED
Binary file (3.12 kB). View file
 
GroundingDINO/groundingdino/util/get_tokenlizer.py CHANGED
@@ -1,5 +1,5 @@
1
  from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast
2
-
3
 
4
  def get_tokenlizer(text_encoder_type):
5
  if not isinstance(text_encoder_type, str):
@@ -8,6 +8,8 @@ def get_tokenlizer(text_encoder_type):
8
  text_encoder_type = text_encoder_type.text_encoder_type
9
  elif text_encoder_type.get("text_encoder_type", False):
10
  text_encoder_type = text_encoder_type.get("text_encoder_type")
 
 
11
  else:
12
  raise ValueError(
13
  "Unknown type of text_encoder_type: {}".format(type(text_encoder_type))
@@ -19,8 +21,9 @@ def get_tokenlizer(text_encoder_type):
19
 
20
 
21
  def get_pretrained_language_model(text_encoder_type):
22
- if text_encoder_type == "bert-base-uncased":
23
  return BertModel.from_pretrained(text_encoder_type)
24
  if text_encoder_type == "roberta-base":
25
  return RobertaModel.from_pretrained(text_encoder_type)
 
26
  raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type))
 
1
  from transformers import AutoTokenizer, BertModel, BertTokenizer, RobertaModel, RobertaTokenizerFast
2
+ import os
3
 
4
  def get_tokenlizer(text_encoder_type):
5
  if not isinstance(text_encoder_type, str):
 
8
  text_encoder_type = text_encoder_type.text_encoder_type
9
  elif text_encoder_type.get("text_encoder_type", False):
10
  text_encoder_type = text_encoder_type.get("text_encoder_type")
11
+ elif os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type):
12
+ pass
13
  else:
14
  raise ValueError(
15
  "Unknown type of text_encoder_type: {}".format(type(text_encoder_type))
 
21
 
22
 
23
  def get_pretrained_language_model(text_encoder_type):
24
+ if text_encoder_type == "bert-base-uncased" or (os.path.isdir(text_encoder_type) and os.path.exists(text_encoder_type)):
25
  return BertModel.from_pretrained(text_encoder_type)
26
  if text_encoder_type == "roberta-base":
27
  return RobertaModel.from_pretrained(text_encoder_type)
28
+
29
  raise ValueError("Unknown text_encoder_type {}".format(text_encoder_type))
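A hedged example of what the change above enables: `text_encoder_type` may now also be a local directory holding a saved BERT checkpoint (the path below is a placeholder), so the text backbone can be built without downloading from the Hugging Face Hub.

```python
from groundingdino.util.get_tokenlizer import get_tokenlizer, get_pretrained_language_model

local_bert_dir = "/path/to/local/bert-base-uncased"   # placeholder: directory with config.json, vocab.txt, weights
tokenizer = get_tokenlizer(local_bert_dir)            # resolved through AutoTokenizer.from_pretrained
bert = get_pretrained_language_model(local_bert_dir)  # accepted because the path is an existing directory
```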
GroundingDINO/groundingdino/util/inference.py CHANGED
@@ -6,6 +6,7 @@ import supervision as sv
6
  import torch
7
  from PIL import Image
8
  from torchvision.ops import box_convert
 
9
 
10
  import groundingdino.datasets.transforms as T
11
  from groundingdino.models import build_model
@@ -13,6 +14,10 @@ from groundingdino.util.misc import clean_state_dict
13
  from groundingdino.util.slconfig import SLConfig
14
  from groundingdino.util.utils import get_phrases_from_posmap
15
 
 
 
 
 
16
 
17
  def preprocess_caption(caption: str) -> str:
18
  result = caption.lower().strip()
@@ -51,7 +56,8 @@ def predict(
51
  caption: str,
52
  box_threshold: float,
53
  text_threshold: float,
54
- device: str = "cuda"
 
55
  ) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
56
  caption = preprocess_caption(caption=caption)
57
 
@@ -70,17 +76,40 @@ def predict(
70
 
71
  tokenizer = model.tokenizer
72
  tokenized = tokenizer(caption)
73
-
74
- phrases = [
75
- get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
76
- for logit
77
- in logits
78
- ]
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  return boxes, logits.max(dim=1)[0], phrases
81
 
82
 
83
  def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
84
  h, w, _ = image_source.shape
85
  boxes = boxes * torch.Tensor([w, h, w, h])
86
  xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
@@ -96,3 +125,147 @@ def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor
96
  annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
97
  annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
98
  return annotated_frame
6
  import torch
7
  from PIL import Image
8
  from torchvision.ops import box_convert
9
+ import bisect
10
 
11
  import groundingdino.datasets.transforms as T
12
  from groundingdino.models import build_model
 
14
  from groundingdino.util.slconfig import SLConfig
15
  from groundingdino.util.utils import get_phrases_from_posmap
16
 
17
+ # ----------------------------------------------------------------------------------------------------------------------
18
+ # OLD API
19
+ # ----------------------------------------------------------------------------------------------------------------------
20
+
21
 
22
  def preprocess_caption(caption: str) -> str:
23
  result = caption.lower().strip()
 
56
  caption: str,
57
  box_threshold: float,
58
  text_threshold: float,
59
+ device: str = "cuda",
60
+ remove_combined: bool = False
61
  ) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
62
  caption = preprocess_caption(caption=caption)
63
 
 
76
 
77
  tokenizer = model.tokenizer
78
  tokenized = tokenizer(caption)
79
+
80
+ if remove_combined:
81
+ sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
82
+
83
+ phrases = []
84
+ for logit in logits:
85
+ max_idx = logit.argmax()
86
+ insert_idx = bisect.bisect_left(sep_idx, max_idx)
87
+ right_idx = sep_idx[insert_idx]
88
+ left_idx = sep_idx[insert_idx - 1]
89
+ phrases.append(get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer, left_idx, right_idx).replace('.', ''))
90
+ else:
91
+ phrases = [
92
+ get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
93
+ for logit
94
+ in logits
95
+ ]
96
 
97
  return boxes, logits.max(dim=1)[0], phrases
98
 
99
 
100
  def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: List[str]) -> np.ndarray:
101
+ """
102
+ This function annotates an image with bounding boxes and labels.
103
+
104
+ Parameters:
105
+ image_source (np.ndarray): The source image to be annotated.
106
+ boxes (torch.Tensor): A tensor containing bounding box coordinates.
107
+ logits (torch.Tensor): A tensor containing confidence scores for each bounding box.
108
+ phrases (List[str]): A list of labels for each bounding box.
109
+
110
+ Returns:
111
+ np.ndarray: The annotated image.
112
+ """
113
  h, w, _ = image_source.shape
114
  boxes = boxes * torch.Tensor([w, h, w, h])
115
  xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
 
125
  annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
126
  annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels)
127
  return annotated_frame
128
+
129
+
130
+ # ----------------------------------------------------------------------------------------------------------------------
131
+ # NEW API
132
+ # ----------------------------------------------------------------------------------------------------------------------
133
+
134
+
135
+ class Model:
136
+
137
+ def __init__(
138
+ self,
139
+ model_config_path: str,
140
+ model_checkpoint_path: str,
141
+ device: str = "cuda"
142
+ ):
143
+ self.model = load_model(
144
+ model_config_path=model_config_path,
145
+ model_checkpoint_path=model_checkpoint_path,
146
+ device=device
147
+ ).to(device)
148
+ self.device = device
149
+
150
+ def predict_with_caption(
151
+ self,
152
+ image: np.ndarray,
153
+ caption: str,
154
+ box_threshold: float = 0.35,
155
+ text_threshold: float = 0.25
156
+ ) -> Tuple[sv.Detections, List[str]]:
157
+ """
158
+ import cv2
159
+
160
+ image = cv2.imread(IMAGE_PATH)
161
+
162
+ model = Model(model_config_path=CONFIG_PATH, model_checkpoint_path=WEIGHTS_PATH)
163
+ detections, labels = model.predict_with_caption(
164
+ image=image,
165
+ caption=caption,
166
+ box_threshold=BOX_THRESHOLD,
167
+ text_threshold=TEXT_THRESHOLD
168
+ )
169
+
170
+ import supervision as sv
171
+
172
+ box_annotator = sv.BoxAnnotator()
173
+ annotated_image = box_annotator.annotate(scene=image, detections=detections, labels=labels)
174
+ """
175
+ processed_image = Model.preprocess_image(image_bgr=image).to(self.device)
176
+ boxes, logits, phrases = predict(
177
+ model=self.model,
178
+ image=processed_image,
179
+ caption=caption,
180
+ box_threshold=box_threshold,
181
+ text_threshold=text_threshold,
182
+ device=self.device)
183
+ source_h, source_w, _ = image.shape
184
+ detections = Model.post_process_result(
185
+ source_h=source_h,
186
+ source_w=source_w,
187
+ boxes=boxes,
188
+ logits=logits)
189
+ return detections, phrases
190
+
191
+ def predict_with_classes(
192
+ self,
193
+ image: np.ndarray,
194
+ classes: List[str],
195
+ box_threshold: float,
196
+ text_threshold: float
197
+ ) -> sv.Detections:
198
+ """
199
+ import cv2
200
+
201
+ image = cv2.imread(IMAGE_PATH)
202
+
203
+ model = Model(model_config_path=CONFIG_PATH, model_checkpoint_path=WEIGHTS_PATH)
204
+ detections = model.predict_with_classes(
205
+ image=image,
206
+ classes=CLASSES,
207
+ box_threshold=BOX_THRESHOLD,
208
+ text_threshold=TEXT_THRESHOLD
209
+ )
210
+
211
+
212
+ import supervision as sv
213
+
214
+ box_annotator = sv.BoxAnnotator()
215
+ annotated_image = box_annotator.annotate(scene=image, detections=detections)
216
+ """
217
+ caption = ". ".join(classes)
218
+ processed_image = Model.preprocess_image(image_bgr=image).to(self.device)
219
+ boxes, logits, phrases = predict(
220
+ model=self.model,
221
+ image=processed_image,
222
+ caption=caption,
223
+ box_threshold=box_threshold,
224
+ text_threshold=text_threshold,
225
+ device=self.device)
226
+ source_h, source_w, _ = image.shape
227
+ detections = Model.post_process_result(
228
+ source_h=source_h,
229
+ source_w=source_w,
230
+ boxes=boxes,
231
+ logits=logits)
232
+ class_id = Model.phrases2classes(phrases=phrases, classes=classes)
233
+ detections.class_id = class_id
234
+ return detections
235
+
236
+ @staticmethod
237
+ def preprocess_image(image_bgr: np.ndarray) -> torch.Tensor:
238
+ transform = T.Compose(
239
+ [
240
+ T.RandomResize([800], max_size=1333),
241
+ T.ToTensor(),
242
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
243
+ ]
244
+ )
245
+ image_pillow = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
246
+ image_transformed, _ = transform(image_pillow, None)
247
+ return image_transformed
248
+
249
+ @staticmethod
250
+ def post_process_result(
251
+ source_h: int,
252
+ source_w: int,
253
+ boxes: torch.Tensor,
254
+ logits: torch.Tensor
255
+ ) -> sv.Detections:
256
+ boxes = boxes * torch.Tensor([source_w, source_h, source_w, source_h])
257
+ xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy()
258
+ confidence = logits.numpy()
259
+ return sv.Detections(xyxy=xyxy, confidence=confidence)
260
+
261
+ @staticmethod
262
+ def phrases2classes(phrases: List[str], classes: List[str]) -> np.ndarray:
263
+ class_ids = []
264
+ for phrase in phrases:
265
+ for class_ in classes:
266
+ if class_ in phrase:
267
+ class_ids.append(classes.index(class_))
268
+ break
269
+ else:
270
+ class_ids.append(None)
271
+ return np.array(class_ids)
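A small standalone check (the class and phrase values are made up) of the substring matching `phrases2classes` performs: each detected phrase maps to the index of the first class name it contains, or `None` when nothing matches.

```python
from groundingdino.util.inference import Model

classes = ["person", "dog", "chair"]
phrases = ["dog", "a person sitting", "stick"]
print(Model.phrases2classes(phrases=phrases, classes=classes))
# -> array([1, 0, None], dtype=object)
```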
GroundingDINO/groundingdino/util/slconfig.py CHANGED
@@ -2,13 +2,13 @@
2
  # Modified from mmcv
3
  # ==========================================================
4
  import ast
 
5
  import os.path as osp
6
  import shutil
7
  import sys
8
  import tempfile
9
  from argparse import Action
10
  from importlib import import_module
11
- import platform
12
 
13
  from addict import Dict
14
  from yapf.yapflib.yapf_api import FormatCode
@@ -81,7 +81,7 @@ class SLConfig(object):
81
  with tempfile.TemporaryDirectory() as temp_config_dir:
82
  temp_config_file = tempfile.NamedTemporaryFile(dir=temp_config_dir, suffix=".py")
83
  temp_config_name = osp.basename(temp_config_file.name)
84
- if platform.system() == 'Windows':
85
  temp_config_file.close()
86
  shutil.copyfile(filename, osp.join(temp_config_dir, temp_config_name))
87
  temp_module_name = osp.splitext(temp_config_name)[0]
 
2
  # Modified from mmcv
3
  # ==========================================================
4
  import ast
5
+ import os
6
  import os.path as osp
7
  import shutil
8
  import sys
9
  import tempfile
10
  from argparse import Action
11
  from importlib import import_module
 
12
 
13
  from addict import Dict
14
  from yapf.yapflib.yapf_api import FormatCode
 
81
  with tempfile.TemporaryDirectory() as temp_config_dir:
82
  temp_config_file = tempfile.NamedTemporaryFile(dir=temp_config_dir, suffix=".py")
83
  temp_config_name = osp.basename(temp_config_file.name)
84
+ if os.name == 'nt':
85
  temp_config_file.close()
86
  shutil.copyfile(filename, osp.join(temp_config_dir, temp_config_name))
87
  temp_module_name = osp.splitext(temp_config_name)[0]
GroundingDINO/groundingdino/util/utils.py CHANGED
@@ -597,10 +597,12 @@ def targets_to(targets: List[Dict[str, Any]], device):
597
 
598
 
599
  def get_phrases_from_posmap(
600
- posmap: torch.BoolTensor, tokenized: Dict, tokenizer: AutoTokenizer
601
  ):
602
  assert isinstance(posmap, torch.Tensor), "posmap must be torch.Tensor"
603
  if posmap.dim() == 1:
 
 
604
  non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist()
605
  token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
606
  return tokenizer.decode(token_ids)
 
597
 
598
 
599
  def get_phrases_from_posmap(
600
+ posmap: torch.BoolTensor, tokenized: Dict, tokenizer: AutoTokenizer, left_idx: int = 0, right_idx: int = 255
601
  ):
602
  assert isinstance(posmap, torch.Tensor), "posmap must be torch.Tensor"
603
  if posmap.dim() == 1:
604
+ posmap[0: left_idx + 1] = False
605
+ posmap[right_idx:] = False
606
  non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist()
607
  token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
608
  return tokenizer.decode(token_ids)
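A toy illustration (standalone tensor, not the repository's tokenizer) of what the new `left_idx` / `right_idx` arguments do: positions up to and including `left_idx` and from `right_idx` onward are masked out before the token ids are decoded, so a phrase cannot bleed across the separators chosen by the caller.

```python
import torch

posmap = torch.tensor([False, True, True, True, True, False])
left_idx, right_idx = 1, 4
posmap[0: left_idx + 1] = False   # same masking as in get_phrases_from_posmap
posmap[right_idx:] = False
print(posmap.nonzero(as_tuple=True)[0].tolist())   # -> [2, 3]
```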
GroundingDINO/requirements.txt CHANGED
@@ -6,5 +6,5 @@ yapf
6
  timm
7
  numpy
8
  opencv-python
9
- supervision==0.3.2
10
- pycocotools
 
6
  timm
7
  numpy
8
  opencv-python
9
+ supervision
10
+ pycocotools
GroundingDINO/setup.py CHANGED
@@ -24,6 +24,18 @@ import glob
24
  import os
25
  import subprocess
26
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  import torch
28
  from setuptools import find_packages, setup
29
  from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
@@ -70,7 +82,7 @@ def get_extensions():
70
  extra_compile_args = {"cxx": []}
71
  define_macros = []
72
 
73
- if torch.cuda.is_available() and CUDA_HOME is not None:
74
  print("Compiling with CUDA")
75
  extension = CUDAExtension
76
  sources += source_cuda
 
24
  import os
25
  import subprocess
26
 
27
+ import subprocess
28
+ import sys
29
+
30
+ def install_torch():
31
+ try:
32
+ import torch
33
+ except ImportError:
34
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "torch"])
35
+
36
+ # Call the function to ensure torch is installed
37
+ install_torch()
38
+
39
  import torch
40
  from setuptools import find_packages, setup
41
  from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
 
82
  extra_compile_args = {"cxx": []}
83
  define_macros = []
84
 
85
+ if CUDA_HOME is not None and (torch.cuda.is_available() or "TORCH_CUDA_ARCH_LIST" in os.environ):
86
  print("Compiling with CUDA")
87
  extension = CUDAExtension
88
  sources += source_cuda