FantasticGNU committed on
Commit
5908b15
·
1 Parent(s): 778f99c
UniVAD/models/GroundingDINO/.gitignore ADDED
@@ -0,0 +1,146 @@
1
+ # IDE
2
+ .idea/
3
+ .vscode/
4
+
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ pip-wheel-metadata/
28
+ share/python-wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .nox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py,cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99
+ __pypackages__/
100
+
101
+ # Celery stuff
102
+ celerybeat-schedule
103
+ celerybeat.pid
104
+
105
+ # SageMath parsed files
106
+ *.sage.py
107
+
108
+ # Environments
109
+ .env
110
+ .venv
111
+ env/
112
+ venv/
113
+ ENV/
114
+ env.bak/
115
+ venv.bak/
116
+
117
+ # Spyder project settings
118
+ .spyderproject
119
+ .spyproject
120
+
121
+ # Rope project settings
122
+ .ropeproject
123
+
124
+ # mkdocs documentation
125
+ /site
126
+
127
+ # mypy
128
+ .mypy_cache/
129
+ .dmypy.json
130
+ dmypy.json
131
+
132
+ # Pyre type checker
133
+ .pyre/
134
+
135
+ # vscode
136
+ .vscode/
137
+ output/
138
+ outputs/
139
+ subs/
140
+ logs/
141
+
142
+ grounding/config/configs
143
+ grounding/version.py
144
+
145
+ vis/
146
+ tmp/
UniVAD/models/GroundingDINO/Dockerfile ADDED
@@ -0,0 +1,35 @@
1
+ FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-runtime
2
+ ARG DEBIAN_FRONTEND=noninteractive
3
+
4
+ ENV CUDA_HOME=/usr/local/cuda \
5
+ TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" \
6
+ SETUPTOOLS_USE_DISTUTILS=stdlib
7
+
8
+ RUN conda update conda -y
9
+
10
+ # Install libraries in the brand new image.
11
+ RUN apt-get -y update && apt-get install -y --no-install-recommends \
12
+ wget \
13
+ build-essential \
14
+ git \
15
+ python3-opencv \
16
+ ca-certificates && \
17
+ rm -rf /var/lib/apt/lists/*
18
+
19
+ # Set the working directory for all the subsequent Dockerfile instructions.
20
+ WORKDIR /opt/program
21
+
22
+ RUN git clone https://github.com/IDEA-Research/GroundingDINO.git
23
+
24
+ RUN mkdir weights ; cd weights ; wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth ; cd ..
25
+
26
+ RUN conda install -c "nvidia/label/cuda-12.1.1" cuda -y
27
+ ENV CUDA_HOME=$CONDA_PREFIX
28
+
29
+ ENV PATH=/usr/local/cuda/bin:$PATH
30
+
31
+ RUN cd GroundingDINO/ && python -m pip install .
32
+
33
+ COPY docker_test.py docker_test.py
34
+
35
+ CMD [ "python", "docker_test.py" ]
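The `docker_test.py` copied above is not part of this section; the sketch below is only an illustration of what such a smoke test might look like, reusing the `groundingdino.util.inference` API shown in the README later in this commit. The config, weight, and image paths are assumptions about the container layout produced by this Dockerfile.

```python
# Hypothetical docker_test.py-style smoke test (illustrative only).
# Paths assume the Dockerfile layout: the repo cloned into /opt/program/GroundingDINO
# and the checkpoint downloaded into /opt/program/weights.
from groundingdino.util.inference import load_model, load_image, predict

model = load_model(
    "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
    "weights/groundingdino_swint_ogc.pth",
)
image_source, image = load_image("GroundingDINO/.asset/cat_dog.jpeg")

boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption="cat . dog .",
    box_threshold=0.35,
    text_threshold=0.25,
)
print(f"detected {len(boxes)} boxes:", phrases)
```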
UniVAD/models/GroundingDINO/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 - present, IDEA Research.
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
UniVAD/models/GroundingDINO/README.md ADDED
@@ -0,0 +1,370 @@
1
+ <div align="center">
2
+ <img src="./.asset/grounding_dino_logo.png" width="30%">
3
+ </div>
4
+
5
+ # :sauropod: Grounding DINO
6
+
7
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-mscoco)](https://paperswithcode.com/sota/zero-shot-object-detection-on-mscoco?p=grounding-dino-marrying-dino-with-grounded) [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/zero-shot-object-detection-on-odinw)](https://paperswithcode.com/sota/zero-shot-object-detection-on-odinw?p=grounding-dino-marrying-dino-with-grounded) \
8
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco-minival)](https://paperswithcode.com/sota/object-detection-on-coco-minival?p=grounding-dino-marrying-dino-with-grounded) [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/grounding-dino-marrying-dino-with-grounded/object-detection-on-coco)](https://paperswithcode.com/sota/object-detection-on-coco?p=grounding-dino-marrying-dino-with-grounded)
9
+
10
+
11
+ **[IDEA-CVR, IDEA-Research](https://github.com/IDEA-Research)**
12
+
13
+ [Shilong Liu](http://www.lsl.zone/), [Zhaoyang Zeng](https://scholar.google.com/citations?user=U_cvvUwAAAAJ&hl=zh-CN&oi=ao), [Tianhe Ren](https://rentainhe.github.io/), [Feng Li](https://scholar.google.com/citations?user=ybRe9GcAAAAJ&hl=zh-CN), [Hao Zhang](https://scholar.google.com/citations?user=B8hPxMQAAAAJ&hl=zh-CN), [Jie Yang](https://github.com/yangjie-cv), [Chunyuan Li](https://scholar.google.com/citations?user=Zd7WmXUAAAAJ&hl=zh-CN&oi=ao), [Jianwei Yang](https://jwyang.github.io/), [Hang Su](https://scholar.google.com/citations?hl=en&user=dxN1_X0AAAAJ&view_op=list_works&sortby=pubdate), [Jun Zhu](https://scholar.google.com/citations?hl=en&user=axsP38wAAAAJ), [Lei Zhang](https://www.leizhang.org/)<sup>:email:</sup>.
14
+
15
+
16
+ [[`Paper`](https://arxiv.org/abs/2303.05499)] [[`Demo`](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)] [[`BibTex`](#black_nib-citation)]
17
+
18
+
19
+ PyTorch implementation and pretrained models for Grounding DINO. For details, see the paper **[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)**.
20
+
21
+ - 🔥 **[Grounding DINO 1.5](https://github.com/IDEA-Research/Grounding-DINO-1.5-API)** is now released; it is IDEA Research's **most capable** open-world object detection model!
22
+ - 🔥 **[Grounding DINO](https://arxiv.org/abs/2303.05499)** and **[Grounded SAM](https://arxiv.org/abs/2401.14159)** are now supported in Hugging Face Transformers. For more convenient use, you can refer to [this documentation](https://huggingface.co/docs/transformers/model_doc/grounding-dino).
23
+
24
+ ## :sun_with_face: Helpful Tutorial
25
+
26
+ - :grapes: [[Read our arXiv Paper](https://arxiv.org/abs/2303.05499)]
27
+ - :apple: [[Watch our simple introduction video on YouTube](https://youtu.be/wxWDt5UiwY8)]
28
+ - :blossom: &nbsp;[[Try the Colab Demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb)]
29
+ - :sunflower: [[Try our Official Huggingface Demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)]
30
+ - :maple_leaf: [[Watch the Step by Step Tutorial about GroundingDINO by Roboflow AI](https://youtu.be/cMa77r3YrDk)]
31
+ - :mushroom: [[GroundingDINO: Automated Dataset Annotation and Evaluation by Roboflow AI](https://youtu.be/C4NqaRBz_Kw)]
32
+ - :hibiscus: [[Accelerate Image Annotation with SAM and GroundingDINO by Roboflow AI](https://youtu.be/oEQYStnF2l8)]
33
+ - :white_flower: [[Autodistill: Train YOLOv8 with ZERO Annotations based on Grounding-DINO and Grounded-SAM by Roboflow AI](https://github.com/autodistill/autodistill)]
34
+
35
+ <!-- Grounding DINO Methods |
36
+ [![arXiv](https://img.shields.io/badge/arXiv-2303.05499-b31b1b.svg)](https://arxiv.org/abs/2303.05499)
37
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/wxWDt5UiwY8) -->
38
+
39
+ <!-- Grounding DINO Demos |
40
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) -->
41
+ <!-- [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/cMa77r3YrDk)
42
+ [![HuggingFace space](https://img.shields.io/badge/πŸ€—-HuggingFace%20Space-cyan.svg)](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo)
43
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/oEQYStnF2l8)
44
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/C4NqaRBz_Kw) -->
45
+
46
+ ## :sparkles: Highlight Projects
47
+
48
+ - [Semantic-SAM: a universal image segmentation model to enable segment and recognize anything at any desired granularity.](https://github.com/UX-Decoder/Semantic-SAM),
49
+ - [DetGPT: Detect What You Need via Reasoning](https://github.com/OptimalScale/DetGPT)
50
+ - [Grounded-SAM: Marrying Grounding DINO with Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything)
51
+ - [Grounding DINO with Stable Diffusion](demo/image_editing_with_groundingdino_stablediffusion.ipynb)
52
+ - [Grounding DINO with GLIGEN for Controllable Image Editing](demo/image_editing_with_groundingdino_gligen.ipynb)
53
+ - [OpenSeeD: A Simple and Strong Openset Segmentation Model](https://github.com/IDEA-Research/OpenSeeD)
54
+ - [SEEM: Segment Everything Everywhere All at Once](https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once)
55
+ - [X-GPT: Conversational Visual Agent supported by X-Decoder](https://github.com/microsoft/X-Decoder/tree/xgpt)
56
+ - [GLIGEN: Open-Set Grounded Text-to-Image Generation](https://github.com/gligen/GLIGEN)
57
+ - [LLaVA: Large Language and Vision Assistant](https://github.com/haotian-liu/LLaVA)
58
+
59
+ <!-- Extensions | [Grounding DINO with Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything); [Grounding DINO with Stable Diffusion](demo/image_editing_with_groundingdino_stablediffusion.ipynb); [Grounding DINO with GLIGEN](demo/image_editing_with_groundingdino_gligen.ipynb) -->
60
+
61
+
62
+
63
+ <!-- Official PyTorch implementation of [Grounding DINO](https://arxiv.org/abs/2303.05499), a stronger open-set object detector. Code is available now! -->
64
+
65
+
66
+ ## :bulb: Highlight
67
+
68
+ - **Open-Set Detection.** Detect **everything** with language!
69
+ - **High Performance.** COCO zero-shot **52.5 AP** (training without COCO data!). COCO fine-tune **63.0 AP**.
70
+ - **Flexible.** Collaboration with Stable Diffusion for Image Editing.
71
+
72
+
73
+
74
+
75
+ ## :fire: News
76
+ - **`2023/07/18`**: We release [Semantic-SAM](https://github.com/UX-Decoder/Semantic-SAM), a universal image segmentation model that can segment and recognize anything at any desired granularity. **Code** and **checkpoint** are available!
77
+ - **`2023/06/17`**: We provide an example to evaluate Grounding DINO on COCO zero-shot performance.
78
+ - **`2023/04/15`**: Refer to [CV in the Wild Readings](https://github.com/Computer-Vision-in-the-Wild/CVinW_Readings) for those who are interested in open-set recognition!
79
+ - **`2023/04/08`**: We release [demos](demo/image_editing_with_groundingdino_gligen.ipynb) that combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [GLIGEN](https://github.com/gligen/GLIGEN) for more controllable image editing.
80
+ - **`2023/04/08`**: We release [demos](demo/image_editing_with_groundingdino_stablediffusion.ipynb) that combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) for image editing.
81
+ - **`2023/04/06`**: We build a new demo by marrying GroundingDINO with [Segment-Anything](https://github.com/facebookresearch/segment-anything), named **[Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything)**, which aims to support segmentation in GroundingDINO.
82
+ - **`2023/03/28`**: A YouTube [video](https://youtu.be/cMa77r3YrDk) about Grounding DINO and basic object detection prompt engineering. [[SkalskiP](https://github.com/SkalskiP)]
83
+ - **`2023/03/28`**: Add a [demo](https://huggingface.co/spaces/ShilongLiu/Grounding_DINO_demo) on Hugging Face Space!
84
+ - **`2023/03/27`**: Support CPU-only mode. Now the model can run on machines without GPUs.
85
+ - **`2023/03/25`**: A [demo](https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/zero-shot-object-detection-with-grounding-dino.ipynb) for Grounding DINO is available at Colab. [[SkalskiP](https://github.com/SkalskiP)]
86
+ - **`2023/03/22`**: Code is available Now!
87
+
88
+ <details open>
89
+ <summary><font size="4">
90
+ Description
91
+ </font></summary>
92
+ <a href="https://arxiv.org/abs/2303.05499">Paper</a> introduction.
93
+ <img src=".asset/hero_figure.png" alt="ODinW" width="100%">
94
+ Marrying <a href="https://github.com/IDEA-Research/GroundingDINO">Grounding DINO</a> and <a href="https://github.com/gligen/GLIGEN">GLIGEN</a>
95
+ <img src="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/GD_GLIGEN.png" alt="gd_gligen" width="100%">
96
+ </details>
97
+
98
+ ## :star: Explanations/Tips for Grounding DINO Inputs and Outputs
99
+ - Grounding DINO accepts an `(image, text)` pair as inputs.
100
+ - It outputs `900` (by default) object boxes. Each box has similarity scores across all input words (as shown in the figures below).
101
+ - By default, we keep the boxes whose highest similarity is above the `box_threshold`.
102
+ - We extract the words whose similarities are higher than the `text_threshold` as predicted labels.
103
+ - If you want to obtain objects for specific phrases, like the `dogs` in the sentence `two dogs with a stick.`, you can select the boxes with the highest text similarity to `dogs` as the final outputs (see the sketch below the figures).
104
+ - Note that each word can be split into **more than one** token by different tokenizers. The number of words in a sentence may not equal the number of text tokens.
105
+ - We suggest separating different category names with `.` for Grounding DINO.
106
+ ![model_explain1](.asset/model_explan1.PNG)
107
+ ![model_explain2](.asset/model_explan2.PNG)
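As a concrete illustration of the phrase-selection tip above, here is a minimal sketch that keeps only the boxes whose predicted phrase contains a target word. It assumes `boxes` (a tensor of shape `[N, 4]`) and `phrases` (a list of `N` strings) come from the `predict` helper shown in the Python example further below.

```python
# Minimal sketch: keep only the boxes whose predicted phrase contains a target word,
# e.g. the "dogs" boxes for the caption "two dogs with a stick."
def filter_by_phrase(boxes, phrases, target="dogs"):
    keep = [i for i, phrase in enumerate(phrases) if target in phrase]
    return boxes[keep], [phrases[i] for i in keep]
```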
108
+
109
+ ## :label: TODO
110
+
111
+ - [x] Release inference code and demo.
112
+ - [x] Release checkpoints.
113
+ - [x] Grounding DINO with Stable Diffusion and GLIGEN demos.
114
+ - [ ] Release training codes.
115
+
116
+ ## :hammer_and_wrench: Install
117
+
118
+ **Note:**
119
+
120
+ 0. If you have a CUDA environment, please make sure the environment variable `CUDA_HOME` is set. The extension will be compiled in CPU-only mode if CUDA is not available.
121
+
122
+ Please make sure you follow the installation steps strictly; otherwise the program may produce:
123
+ ```bash
124
+ NameError: name '_C' is not defined
125
+ ```
126
+
127
+ If this happens, please reinstall GroundingDINO: re-clone the repository and go through all of the installation steps again.
128
+
129
+ #### How to check CUDA:
130
+ ```bash
131
+ echo $CUDA_HOME
132
+ ```
133
+ If it prints nothing, it means you haven't set up the path.
134
+
135
+ Run this so that the environment variable is set in the current shell.
136
+ ```bash
137
+ export CUDA_HOME=/path/to/cuda-11.3
138
+ ```
139
+
140
+ Note that this CUDA version should match your CUDA runtime, since multiple CUDA installations may coexist on the same machine.
141
+
142
+ If you want to set `CUDA_HOME` permanently, store it using:
143
+
144
+ ```bash
145
+ echo 'export CUDA_HOME=/path/to/cuda' >> ~/.bashrc
146
+ ```
147
+ After that, source the bashrc file and check `CUDA_HOME`:
148
+ ```bash
149
+ source ~/.bashrc
150
+ echo $CUDA_HOME
151
+ ```
152
+
153
+ In this example, /path/to/cuda-11.3 should be replaced with the path where your CUDA toolkit is installed. You can find this by typing **which nvcc** in your terminal:
154
+
155
+ For instance,
156
+ if the output is /usr/local/cuda/bin/nvcc, then:
157
+ ```bash
158
+ export CUDA_HOME=/usr/local/cuda
159
+ ```
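As a quick sanity check (not an official install step), you can also ask PyTorch which toolkit it will compile against and whether a GPU is visible; `torch.utils.cpp_extension.CUDA_HOME` is simply where PyTorch records the toolkit path it found.

```python
# Optional sanity check before building the GroundingDINO extension.
import os
import torch
from torch.utils import cpp_extension

print("CUDA_HOME (env):  ", os.environ.get("CUDA_HOME"))   # should point at your CUDA toolkit
print("CUDA_HOME (torch):", cpp_extension.CUDA_HOME)       # what PyTorch will compile against
print("GPU available:    ", torch.cuda.is_available())     # False -> extension is built CPU-only
```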
160
+ **Installation:**
161
+
162
+ 1. Clone the GroundingDINO repository from GitHub.
163
+
164
+ ```bash
165
+ git clone https://github.com/IDEA-Research/GroundingDINO.git
166
+ ```
167
+
168
+ 2. Change the current directory to the GroundingDINO folder.
169
+
170
+ ```bash
171
+ cd GroundingDINO/
172
+ ```
173
+
174
+ 3. Install the required dependencies in the current directory.
175
+
176
+ ```bash
177
+ pip install -e .
178
+ ```
179
+
180
+ 4. Download pre-trained model weights.
181
+
182
+ ```bash
183
+ mkdir weights
184
+ cd weights
185
+ wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
186
+ cd ..
187
+ ```
188
+
189
+ ## :arrow_forward: Demo
190
+ Check your GPU ID (only if you're using a GPU)
191
+
192
+ ```bash
193
+ nvidia-smi
194
+ ```
195
+ Replace `{GPU ID}`, `image_you_want_to_detect.jpg`, and `"dir you want to save the output"` with appropriate values in the following command:
196
+ ```bash
197
+ CUDA_VISIBLE_DEVICES={GPU ID} python demo/inference_on_a_image.py \
198
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
199
+ -p weights/groundingdino_swint_ogc.pth \
200
+ -i image_you_want_to_detect.jpg \
201
+ -o "dir you want to save the output" \
202
+ -t "chair"
203
+ [--cpu-only] # add this flag for CPU-only mode
204
+ ```
205
+
206
+ If you would like to specify the phrases to detect, here is a demo:
207
+ ```bash
208
+ CUDA_VISIBLE_DEVICES={GPU ID} python demo/inference_on_a_image.py \
209
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
210
+ -p ./groundingdino_swint_ogc.pth \
211
+ -i .asset/cat_dog.jpeg \
212
+ -o logs/1111 \
213
+ -t "There is a cat and a dog in the image ." \
214
+ --token_spans "[[[9, 10], [11, 14]], [[19, 20], [21, 24]]]"
215
+ [--cpu-only] # add this flag for CPU-only mode
216
+ ```
217
+ The `token_spans` argument specifies the start and end character positions of the phrases. For example, the first phrase is `[[9, 10], [11, 14]]`. `"There is a cat and a dog in the image ."[9:10] = 'a'`, `"There is a cat and a dog in the image ."[11:14] = 'cat'`. Hence it refers to the phrase `a cat`. Similarly, `[[19, 20], [21, 24]]` refers to the phrase `a dog`.
218
+
219
+ See the `demo/inference_on_a_image.py` for more details.
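To see how these character spans map onto the caption, you can slice the string yourself; the snippet below just illustrates the indexing convention described above.

```python
# Each phrase in --token_spans is a list of [start, end) character spans into the caption.
caption = "There is a cat and a dog in the image ."
token_spans = [[[9, 10], [11, 14]], [[19, 20], [21, 24]]]

for spans in token_spans:
    phrase = " ".join(caption[start:end] for start, end in spans)
    print(spans, "->", phrase)   # -> "a cat", then "a dog"
```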
220
+
221
+ **Running with Python:**
222
+
223
+ ```python
224
+ from groundingdino.util.inference import load_model, load_image, predict, annotate
225
+ import cv2
226
+
227
+ model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
228
+ IMAGE_PATH = "weights/dog-3.jpeg"
229
+ TEXT_PROMPT = "chair . person . dog ."
230
+ BOX_TRESHOLD = 0.35
231
+ TEXT_TRESHOLD = 0.25
232
+
233
+ image_source, image = load_image(IMAGE_PATH)
234
+
235
+ boxes, logits, phrases = predict(
236
+ model=model,
237
+ image=image,
238
+ caption=TEXT_PROMPT,
239
+ box_threshold=BOX_TRESHOLD,
240
+ text_threshold=TEXT_TRESHOLD
241
+ )
242
+
243
+ annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
244
+ cv2.imwrite("annotated_image.jpg", annotated_frame)
245
+ ```
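The `boxes` returned above are normalized `cxcywh` coordinates (this is how the `annotate` helper interprets them; treat the format as an assumption if your version differs). A short sketch for turning them into pixel `xyxy` boxes, e.g. for cropping detections:

```python
# Sketch: convert normalized cxcywh boxes from predict() into pixel xyxy boxes.
import torch
from torchvision.ops import box_convert

h, w, _ = image_source.shape   # image_source is the array returned by load_image above
scaled = boxes * torch.tensor([w, h, w, h])
xyxy = box_convert(scaled, in_fmt="cxcywh", out_fmt="xyxy")
print(xyxy.int().tolist(), phrases)
```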
246
+ **Web UI**
247
+
248
+ We also provide demo code to integrate Grounding DINO with a Gradio Web UI. See the file `demo/gradio_app.py` for more details.
249
+
250
+ **Notebooks**
251
+
252
+ - We release [demos](demo/image_editing_with_groundingdino_gligen.ipynb) that combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [GLIGEN](https://github.com/gligen/GLIGEN) for more controllable image editing.
253
+ - We release [demos](demo/image_editing_with_groundingdino_stablediffusion.ipynb) that combine [Grounding DINO](https://arxiv.org/abs/2303.05499) with [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) for image editing.
254
+
255
+ ## COCO Zero-shot Evaluations
256
+
257
+ We provide an example to evaluate Grounding DINO zero-shot performance on COCO. The results should be **48.5**.
258
+
259
+ ```bash
260
+ CUDA_VISIBLE_DEVICES=0 \
261
+ python demo/test_ap_on_coco.py \
262
+ -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
263
+ -p weights/groundingdino_swint_ogc.pth \
264
+  --anno_path /path/to/annotations/instances_val2017.json \
265
+  --image_dir /path/to/val2017
266
+ ```
267
+
268
+
269
+ ## :luggage: Checkpoints
270
+
271
+ <!-- insert a table -->
272
+ <table>
273
+ <thead>
274
+ <tr style="text-align: right;">
275
+ <th></th>
276
+ <th>name</th>
277
+ <th>backbone</th>
278
+ <th>Data</th>
279
+ <th>box AP on COCO</th>
280
+ <th>Checkpoint</th>
281
+ <th>Config</th>
282
+ </tr>
283
+ </thead>
284
+ <tbody>
285
+ <tr>
286
+ <th>1</th>
287
+ <td>GroundingDINO-T</td>
288
+ <td>Swin-T</td>
289
+ <td>O365,GoldG,Cap4M</td>
290
+ <td>48.4 (zero-shot) / 57.2 (fine-tune)</td>
291
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth">GitHub link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth">HF link</a></td>
292
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinT_OGC.py">link</a></td>
293
+ </tr>
294
+ <tr>
295
+ <th>2</th>
296
+ <td>GroundingDINO-B</td>
297
+ <td>Swin-B</td>
298
+ <td>COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO</td>
299
+ <td>56.7 </td>
300
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth">GitHub link</a> | <a href="https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth">HF link</a>
301
+ <td><a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/groundingdino/config/GroundingDINO_SwinB_cfg.py">link</a></td>
302
+ </tr>
303
+ </tbody>
304
+ </table>
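The Hugging Face links in the table can also be fetched programmatically; a small sketch using `huggingface_hub`, assuming the `ShilongLiu/GroundingDINO` repo id implied by the HF links above:

```python
# Sketch: download the Swin-T checkpoint from the Hugging Face Hub instead of GitHub.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="ShilongLiu/GroundingDINO",
    filename="groundingdino_swint_ogc.pth",
)
print("checkpoint saved to", ckpt_path)
```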
305
+
306
+ ## :medal_military: Results
307
+
308
+ <details open>
309
+ <summary><font size="4">
310
+ COCO Object Detection Results
311
+ </font></summary>
312
+ <img src=".asset/COCO.png" alt="COCO" width="100%">
313
+ </details>
314
+
315
+ <details open>
316
+ <summary><font size="4">
317
+ ODinW Object Detection Results
318
+ </font></summary>
319
+ <img src=".asset/ODinW.png" alt="ODinW" width="100%">
320
+ </details>
321
+
322
+ <details open>
323
+ <summary><font size="4">
324
+ Marrying Grounding DINO with <a href="https://github.com/Stability-AI/StableDiffusion">Stable Diffusion</a> for Image Editing
325
+ </font></summary>
326
+ See our example <a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/demo/image_editing_with_groundingdino_stablediffusion.ipynb">notebook</a> for more details.
327
+ <img src=".asset/GD_SD.png" alt="GD_SD" width="100%">
328
+ </details>
329
+
330
+
331
+ <details open>
332
+ <summary><font size="4">
333
+ Marrying Grounding DINO with <a href="https://github.com/gligen/GLIGEN">GLIGEN</a> for more Detailed Image Editing.
334
+ </font></summary>
335
+ See our example <a href="https://github.com/IDEA-Research/GroundingDINO/blob/main/demo/image_editing_with_groundingdino_gligen.ipynb">notebook</a> for more details.
336
+ <img src=".asset/GD_GLIGEN.png" alt="GD_GLIGEN" width="100%">
337
+ </details>
338
+
339
+ ## :sauropod: Model: Grounding DINO
340
+
341
+ The model includes a text backbone, an image backbone, a feature enhancer, a language-guided query selection module, and a cross-modality decoder.
342
+
343
+ ![arch](.asset/arch.png)
344
+
345
+
346
+ ## :hearts: Acknowledgement
347
+
348
+ Our model is related to [DINO](https://github.com/IDEA-Research/DINO) and [GLIP](https://github.com/microsoft/GLIP). Thanks for their great work!
349
+
350
+ We also thank great previous work including DETR, Deformable DETR, SMCA, Conditional DETR, Anchor DETR, Dynamic DETR, DAB-DETR, DN-DETR, etc. More related work is available at [Awesome Detection Transformer](https://github.com/IDEACVR/awesome-detection-transformer). A new toolbox, [detrex](https://github.com/IDEA-Research/detrex), is available as well.
351
+
352
+ Thanks [Stable Diffusion](https://github.com/Stability-AI/StableDiffusion) and [GLIGEN](https://github.com/gligen/GLIGEN) for their awesome models.
353
+
354
+
355
+ ## :black_nib: Citation
356
+
357
+ If you find our work helpful for your research, please consider citing the following BibTeX entry.
358
+
359
+ ```bibtex
360
+ @article{liu2023grounding,
361
+ title={Grounding dino: Marrying dino with grounded pre-training for open-set object detection},
362
+ author={Liu, Shilong and Zeng, Zhaoyang and Ren, Tianhe and Li, Feng and Zhang, Hao and Yang, Jie and Li, Chunyuan and Yang, Jianwei and Su, Hang and Zhu, Jun and others},
363
+ journal={arXiv preprint arXiv:2303.05499},
364
+ year={2023}
365
+ }
366
+ ```
367
+
368
+
369
+
370
+
UniVAD/models/GroundingDINO/environment.yaml ADDED
@@ -0,0 +1,248 @@
1
+ name: dino
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - conda-forge
6
+ - defaults
7
+ dependencies:
8
+ - addict=2.4.0=pyhd8ed1ab_2
9
+ - aiohttp=3.8.5=py39ha55989b_0
10
+ - aiosignal=1.3.1=pyhd8ed1ab_0
11
+ - asttokens=2.0.5=pyhd3eb1b0_0
12
+ - async-timeout=4.0.3=pyhd8ed1ab_0
13
+ - attrs=23.1.0=pyh71513ae_1
14
+ - aws-c-auth=0.7.0=h6f3c987_2
15
+ - aws-c-cal=0.6.0=h6ba3258_0
16
+ - aws-c-common=0.8.23=hcfcfb64_0
17
+ - aws-c-compression=0.2.17=h420beca_1
18
+ - aws-c-event-stream=0.3.1=had47b81_1
19
+ - aws-c-http=0.7.11=h72ba615_0
20
+ - aws-c-io=0.13.28=ha35c040_0
21
+ - aws-c-mqtt=0.8.14=h4941efa_2
22
+ - aws-c-s3=0.3.13=he04eaa7_2
23
+ - aws-c-sdkutils=0.1.11=h420beca_1
24
+ - aws-checksums=0.1.16=h420beca_1
25
+ - aws-crt-cpp=0.20.3=h247a981_4
26
+ - aws-sdk-cpp=1.10.57=h1a0519f_17
27
+ - backcall=0.2.0=pyhd3eb1b0_0
28
+ - blas=2.118=mkl
29
+ - blas-devel=3.9.0=18_win64_mkl
30
+ - brotli=1.0.9=hcfcfb64_9
31
+ - brotli-bin=1.0.9=hcfcfb64_9
32
+ - brotli-python=1.0.9=py39h99910a6_9
33
+ - bzip2=1.0.8=h8ffe710_4
34
+ - c-ares=1.19.1=hcfcfb64_0
35
+ - ca-certificates=2023.08.22=haa95532_0
36
+ - certifi=2023.7.22=py39haa95532_0
37
+ - charset-normalizer=3.2.0=pyhd8ed1ab_0
38
+ - click=8.1.7=win_pyh7428d3b_0
39
+ - colorama=0.4.6=pyhd8ed1ab_0
40
+ - comm=0.1.2=py39haa95532_0
41
+ - contourpy=1.1.1=py39h1f6ef14_1
42
+ - cuda-cccl=12.2.140=0
43
+ - cuda-cudart=11.8.89=0
44
+ - cuda-cudart-dev=11.8.89=0
45
+ - cuda-cupti=11.8.87=0
46
+ - cuda-libraries=11.8.0=0
47
+ - cuda-libraries-dev=11.8.0=0
48
+ - cuda-nvrtc=11.8.89=0
49
+ - cuda-nvrtc-dev=11.8.89=0
50
+ - cuda-nvtx=11.8.86=0
51
+ - cuda-profiler-api=12.2.140=0
52
+ - cuda-runtime=11.8.0=0
53
+ - cycler=0.11.0=pyhd8ed1ab_0
54
+ - cython=3.0.0=py39h2bbff1b_0
55
+ - dataclasses=0.8=pyhc8e2a94_3
56
+ - datasets=2.14.5=pyhd8ed1ab_0
57
+ - debugpy=1.6.7=py39hd77b12b_0
58
+ - decorator=5.1.1=pyhd3eb1b0_0
59
+ - dill=0.3.7=pyhd8ed1ab_0
60
+ - exceptiongroup=1.0.4=py39haa95532_0
61
+ - executing=0.8.3=pyhd3eb1b0_0
62
+ - filelock=3.12.4=pyhd8ed1ab_0
63
+ - fonttools=4.42.1=py39ha55989b_0
64
+ - freeglut=3.2.2=h63175ca_2
65
+ - freetype=2.12.1=hdaf720e_2
66
+ - frozenlist=1.4.0=py39ha55989b_1
67
+ - fsspec=2023.6.0=pyh1a96a4e_0
68
+ - gettext=0.21.1=h5728263_0
69
+ - glib=2.78.0=h12be248_0
70
+ - glib-tools=2.78.0=h12be248_0
71
+ - gst-plugins-base=1.22.6=h001b923_1
72
+ - gstreamer=1.22.6=hb4038d2_1
73
+ - huggingface_hub=0.17.3=pyhd8ed1ab_0
74
+ - icu=70.1=h0e60522_0
75
+ - idna=3.4=pyhd8ed1ab_0
76
+ - importlib-metadata=6.8.0=pyha770c72_0
77
+ - importlib-resources=6.1.0=pyhd8ed1ab_0
78
+ - importlib_metadata=6.8.0=hd8ed1ab_0
79
+ - importlib_resources=6.1.0=pyhd8ed1ab_0
80
+ - intel-openmp=2023.2.0=h57928b3_49503
81
+ - ipykernel=6.25.0=py39h9909e9c_0
82
+ - ipython=8.15.0=py39haa95532_0
83
+ - jasper=2.0.33=hc2e4405_1
84
+ - jedi=0.18.1=py39haa95532_1
85
+ - jinja2=3.1.2=pyhd8ed1ab_1
86
+ - joblib=1.3.2=pyhd8ed1ab_0
87
+ - jpeg=9e=hcfcfb64_3
88
+ - jupyter_client=8.1.0=py39haa95532_0
89
+ - jupyter_core=5.3.0=py39haa95532_0
90
+ - kiwisolver=1.4.5=py39h1f6ef14_1
91
+ - krb5=1.20.1=heb0366b_0
92
+ - lcms2=2.14=h90d422f_0
93
+ - lerc=4.0.0=h63175ca_0
94
+ - libabseil=20230125.3=cxx17_h63175ca_0
95
+ - libarrow=12.0.1=h12e5d06_5_cpu
96
+ - libblas=3.9.0=18_win64_mkl
97
+ - libbrotlicommon=1.0.9=hcfcfb64_9
98
+ - libbrotlidec=1.0.9=hcfcfb64_9
99
+ - libbrotlienc=1.0.9=hcfcfb64_9
100
+ - libcblas=3.9.0=18_win64_mkl
101
+ - libclang=15.0.7=default_h77d9078_3
102
+ - libclang13=15.0.7=default_h77d9078_3
103
+ - libcrc32c=1.1.2=h0e60522_0
104
+ - libcublas=11.11.3.6=0
105
+ - libcublas-dev=11.11.3.6=0
106
+ - libcufft=10.9.0.58=0
107
+ - libcufft-dev=10.9.0.58=0
108
+ - libcurand=10.3.3.141=0
109
+ - libcurand-dev=10.3.3.141=0
110
+ - libcurl=8.1.2=h68f0423_0
111
+ - libcusolver=11.4.1.48=0
112
+ - libcusolver-dev=11.4.1.48=0
113
+ - libcusparse=11.7.5.86=0
114
+ - libcusparse-dev=11.7.5.86=0
115
+ - libdeflate=1.14=hcfcfb64_0
116
+ - libevent=2.1.12=h3671451_1
117
+ - libffi=3.4.2=h8ffe710_5
118
+ - libglib=2.78.0=he8f3873_0
119
+ - libgoogle-cloud=2.12.0=h00b2bdc_1
120
+ - libgrpc=1.54.3=ha177ca7_0
121
+ - libhwloc=2.9.3=default_haede6df_1009
122
+ - libiconv=1.17=h8ffe710_0
123
+ - liblapack=3.9.0=18_win64_mkl
124
+ - liblapacke=3.9.0=18_win64_mkl
125
+ - libnpp=11.8.0.86=0
126
+ - libnpp-dev=11.8.0.86=0
127
+ - libnvjpeg=11.9.0.86=0
128
+ - libnvjpeg-dev=11.9.0.86=0
129
+ - libogg=1.3.4=h8ffe710_1
130
+ - libopencv=4.5.3=py39h488c12c_8
131
+ - libpng=1.6.39=h19919ed_0
132
+ - libprotobuf=3.21.12=h12be248_2
133
+ - libsodium=1.0.18=h62dcd97_0
134
+ - libsqlite=3.43.0=hcfcfb64_0
135
+ - libssh2=1.11.0=h7dfc565_0
136
+ - libthrift=0.18.1=h06f6336_2
137
+ - libtiff=4.4.0=hc4f729c_5
138
+ - libutf8proc=2.8.0=h82a8f57_0
139
+ - libuv=1.44.2=hcfcfb64_1
140
+ - libvorbis=1.3.7=h0e60522_0
141
+ - libwebp-base=1.3.2=hcfcfb64_0
142
+ - libxcb=1.13=hcd874cb_1004
143
+ - libxml2=2.11.5=hc3477c8_1
144
+ - libzlib=1.2.13=hcfcfb64_5
145
+ - lz4-c=1.9.4=hcfcfb64_0
146
+ - m2w64-gcc-libgfortran=5.3.0=6
147
+ - m2w64-gcc-libs=5.3.0=7
148
+ - m2w64-gcc-libs-core=5.3.0=7
149
+ - m2w64-gmp=6.1.0=2
150
+ - m2w64-libwinpthread-git=5.0.0.4634.697f757=2
151
+ - markupsafe=2.1.3=py39ha55989b_1
152
+ - matplotlib-base=3.8.0=py39hf19769e_1
153
+ - matplotlib-inline=0.1.6=py39haa95532_0
154
+ - mkl=2022.1.0=h6a75c08_874
155
+ - mkl-devel=2022.1.0=h57928b3_875
156
+ - mkl-include=2022.1.0=h6a75c08_874
157
+ - mpmath=1.3.0=pyhd8ed1ab_0
158
+ - msys2-conda-epoch=20160418=1
159
+ - multidict=6.0.4=py39ha55989b_0
160
+ - multiprocess=0.70.15=py39ha55989b_1
161
+ - munkres=1.1.4=pyh9f0ad1d_0
162
+ - nest-asyncio=1.5.6=py39haa95532_0
163
+ - networkx=3.1=pyhd8ed1ab_0
164
+ - numpy=1.26.0=py39hddb5d58_0
165
+ - opencv=4.5.3=py39hcbf5309_8
166
+ - openjpeg=2.5.0=hc9384bd_1
167
+ - openssl=3.1.3=hcfcfb64_0
168
+ - orc=1.9.0=hada7b9e_1
169
+ - packaging=23.1=pyhd8ed1ab_0
170
+ - pandas=2.1.1=py39h32e6231_0
171
+ - parso=0.8.3=pyhd3eb1b0_0
172
+ - pcre2=10.40=h17e33f8_0
173
+ - pickleshare=0.7.5=pyhd3eb1b0_1003
174
+ - pillow=9.2.0=py39h595c93f_3
175
+ - pip=23.2.1=pyhd8ed1ab_0
176
+ - platformdirs=3.10.0=pyhd8ed1ab_0
177
+ - prompt-toolkit=3.0.36=py39haa95532_0
178
+ - psutil=5.9.0=py39h2bbff1b_0
179
+ - pthread-stubs=0.4=hcd874cb_1001
180
+ - pthreads-win32=2.9.1=hfa6e2cd_3
181
+ - pure_eval=0.2.2=pyhd3eb1b0_0
182
+ - py-opencv=4.5.3=py39h00e5391_8
183
+ - pyarrow=12.0.1=py39hca4e8af_5_cpu
184
+ - pycocotools=2.0.6=py39hc266a54_1
185
+ - pygments=2.15.1=py39haa95532_1
186
+ - pyparsing=3.1.1=pyhd8ed1ab_0
187
+ - pysocks=1.7.1=pyh0701188_6
188
+ - python=3.9.18=h4de0772_0_cpython
189
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
190
+ - python-tzdata=2023.3=pyhd8ed1ab_0
191
+ - python-xxhash=3.3.0=py39ha55989b_1
192
+ - python_abi=3.9=4_cp39
193
+ - pytorch=2.0.1=py3.9_cuda11.8_cudnn8_0
194
+ - pytorch-cuda=11.8=h24eeafa_5
195
+ - pytorch-mutex=1.0=cuda
196
+ - pytz=2023.3.post1=pyhd8ed1ab_0
197
+ - pywin32=305=py39h2bbff1b_0
198
+ - pyyaml=6.0.1=py39ha55989b_1
199
+ - pyzmq=25.1.0=py39hd77b12b_0
200
+ - qt-main=5.15.8=h720456b_6
201
+ - re2=2023.03.02=hd4eee63_0
202
+ - regex=2023.8.8=py39ha55989b_1
203
+ - requests=2.31.0=pyhd8ed1ab_0
204
+ - sacremoses=0.0.53=pyhd8ed1ab_0
205
+ - safetensors=0.3.3=py39hf21820d_1
206
+ - setuptools=68.2.2=pyhd8ed1ab_0
207
+ - six=1.16.0=pyh6c4a22f_0
208
+ - snappy=1.1.10=hfb803bf_0
209
+ - stack_data=0.2.0=pyhd3eb1b0_0
210
+ - sympy=1.12=pyh04b8f61_3
211
+ - tbb=2021.10.0=h91493d7_1
212
+ - timm=0.9.7=pyhd8ed1ab_0
213
+ - tk=8.6.13=hcfcfb64_0
214
+ - tokenizers=0.13.3=py39hca44cb7_0
215
+ - tomli=2.0.1=pyhd8ed1ab_0
216
+ - tornado=6.3.2=py39h2bbff1b_0
217
+ - tqdm=4.66.1=pyhd8ed1ab_0
218
+ - traitlets=5.7.1=py39haa95532_0
219
+ - transformers=4.33.2=pyhd8ed1ab_0
220
+ - typing-extensions=4.8.0=hd8ed1ab_0
221
+ - typing_extensions=4.8.0=pyha770c72_0
222
+ - tzdata=2023c=h71feb2d_0
223
+ - ucrt=10.0.22621.0=h57928b3_0
224
+ - unicodedata2=15.0.0=py39ha55989b_1
225
+ - urllib3=2.0.5=pyhd8ed1ab_0
226
+ - vc=14.3=h64f974e_17
227
+ - vc14_runtime=14.36.32532=hdcecf7f_17
228
+ - vs2015_runtime=14.36.32532=h05e6639_17
229
+ - wcwidth=0.2.5=pyhd3eb1b0_0
230
+ - wheel=0.41.2=pyhd8ed1ab_0
231
+ - win_inet_pton=1.1.0=pyhd8ed1ab_6
232
+ - xorg-libxau=1.0.11=hcd874cb_0
233
+ - xorg-libxdmcp=1.1.3=hcd874cb_0
234
+ - xxhash=0.8.2=hcfcfb64_0
235
+ - xz=5.2.6=h8d14728_0
236
+ - yaml=0.2.5=h8ffe710_2
237
+ - yapf=0.40.1=pyhd8ed1ab_0
238
+ - yarl=1.9.2=py39ha55989b_0
239
+ - zeromq=4.3.4=hd77b12b_0
240
+ - zipp=3.17.0=pyhd8ed1ab_0
241
+ - zlib=1.2.13=hcfcfb64_5
242
+ - zstd=1.5.5=h12be248_0
243
+ - pip:
244
+ - opencv-python==4.8.0.76
245
+ - supervision==0.6.0
246
+ - torchaudio==2.0.2
247
+ - torchvision==0.15.2
248
+ prefix: C:\Users\Makoto\miniconda3\envs\dino
UniVAD/models/GroundingDINO/requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch
2
+ torchvision
3
+ transformers
4
+ addict
5
+ yapf
6
+ timm
7
+ numpy
8
+ opencv-python
9
+ supervision
10
+ pycocotools
UniVAD/models/GroundingDINO/test.ipynb ADDED
@@ -0,0 +1,114 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "final text_encoder_type: bert-base-uncased\n"
13
+ ]
14
+ },
15
+ {
16
+ "data": {
17
+ "application/json": {
18
+ "ascii": false,
19
+ "bar_format": null,
20
+ "colour": null,
21
+ "elapsed": 0.014210224151611328,
22
+ "initial": 0,
23
+ "n": 0,
24
+ "ncols": null,
25
+ "nrows": null,
26
+ "postfix": null,
27
+ "prefix": "Downloading model.safetensors",
28
+ "rate": null,
29
+ "total": 440449768,
30
+ "unit": "B",
31
+ "unit_divisor": 1000,
32
+ "unit_scale": true
33
+ },
34
+ "application/vnd.jupyter.widget-view+json": {
35
+ "model_id": "5922f34578364d36afa13de9f01254bd",
36
+ "version_major": 2,
37
+ "version_minor": 0
38
+ },
39
+ "text/plain": [
40
+ "Downloading model.safetensors: 0%| | 0.00/440M [00:00<?, ?B/s]"
41
+ ]
42
+ },
43
+ "metadata": {},
44
+ "output_type": "display_data"
45
+ },
46
+ {
47
+ "name": "stderr",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "/root/miniconda3/lib/python3.8/site-packages/transformers/modeling_utils.py:881: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
51
+ " warnings.warn(\n",
52
+ "/root/miniconda3/lib/python3.8/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
53
+ " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n"
54
+ ]
55
+ },
56
+ {
57
+ "data": {
58
+ "text/plain": [
59
+ "True"
60
+ ]
61
+ },
62
+ "execution_count": 2,
63
+ "metadata": {},
64
+ "output_type": "execute_result"
65
+ }
66
+ ],
67
+ "source": [
68
+ "from groundingdino.util.inference import load_model, load_image, predict, annotate\n",
69
+ "import cv2\n",
70
+ "\n",
71
+ "model = load_model(\"groundingdino/config/GroundingDINO_SwinT_OGC.py\", \"../04-06-segment-anything/weights/groundingdino_swint_ogc.pth\")\n",
72
+ "IMAGE_PATH = \".asset/cat_dog.jpeg\"\n",
73
+ "TEXT_PROMPT = \"chair . person . dog .\"\n",
74
+ "BOX_TRESHOLD = 0.35\n",
75
+ "TEXT_TRESHOLD = 0.25\n",
76
+ "\n",
77
+ "image_source, image = load_image(IMAGE_PATH)\n",
78
+ "\n",
79
+ "boxes, logits, phrases = predict(\n",
80
+ " model=model,\n",
81
+ " image=image,\n",
82
+ " caption=TEXT_PROMPT,\n",
83
+ " box_threshold=BOX_TRESHOLD,\n",
84
+ " text_threshold=TEXT_TRESHOLD\n",
85
+ ")\n",
86
+ "\n",
87
+ "annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)\n",
88
+ "cv2.imwrite(\"annotated_image.jpg\", annotated_frame)"
89
+ ]
90
+ }
91
+ ],
92
+ "metadata": {
93
+ "kernelspec": {
94
+ "display_name": "base",
95
+ "language": "python",
96
+ "name": "python3"
97
+ },
98
+ "language_info": {
99
+ "codemirror_mode": {
100
+ "name": "ipython",
101
+ "version": 3
102
+ },
103
+ "file_extension": ".py",
104
+ "mimetype": "text/x-python",
105
+ "name": "python",
106
+ "nbconvert_exporter": "python",
107
+ "pygments_lexer": "ipython3",
108
+ "version": "3.8.10"
109
+ },
110
+ "orig_nbformat": 4
111
+ },
112
+ "nbformat": 4,
113
+ "nbformat_minor": 2
114
+ }