phiph committed on
Commit
7382c66
·
verified ·
1 Parent(s): 6dcfb17

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +20 -0
  2. .gitignore +6 -0
  3. DA-2-repo/LICENSE +201 -0
  4. DA-2-repo/README.md +103 -0
  5. DA-2-repo/app.py +165 -0
  6. DA-2-repo/assets/badges/icon2.png +0 -0
  7. DA-2-repo/assets/badges/teaser.jpg +3 -0
  8. DA-2-repo/assets/demos/a0.png +3 -0
  9. DA-2-repo/assets/demos/a1.png +3 -0
  10. DA-2-repo/assets/demos/a10.png +3 -0
  11. DA-2-repo/assets/demos/a11.png +3 -0
  12. DA-2-repo/assets/demos/a2.png +3 -0
  13. DA-2-repo/assets/demos/a3.png +3 -0
  14. DA-2-repo/assets/demos/a4.png +3 -0
  15. DA-2-repo/assets/demos/a5.png +3 -0
  16. DA-2-repo/assets/demos/a6.png +3 -0
  17. DA-2-repo/assets/demos/a7.png +3 -0
  18. DA-2-repo/assets/demos/a8.png +3 -0
  19. DA-2-repo/assets/demos/a9.png +3 -0
  20. DA-2-repo/assets/demos/b0.png +3 -0
  21. DA-2-repo/assets/demos/b1.png +3 -0
  22. DA-2-repo/assets/demos/b2.png +3 -0
  23. DA-2-repo/assets/demos/b3.png +3 -0
  24. DA-2-repo/assets/demos/b4.png +3 -0
  25. DA-2-repo/assets/demos/b5.png +3 -0
  26. DA-2-repo/assets/masks/b0.png +0 -0
  27. DA-2-repo/assets/masks/b1.png +0 -0
  28. DA-2-repo/assets/masks/b2.png +0 -0
  29. DA-2-repo/assets/masks/b3.png +0 -0
  30. DA-2-repo/assets/masks/b4.png +0 -0
  31. DA-2-repo/assets/masks/b5.png +0 -0
  32. DA-2-repo/configs/accelerate/0.yaml +16 -0
  33. DA-2-repo/configs/accelerate/1.yaml +16 -0
  34. DA-2-repo/configs/accelerate/2.yaml +16 -0
  35. DA-2-repo/configs/accelerate/3.yaml +16 -0
  36. DA-2-repo/configs/accelerate/4.yaml +16 -0
  37. DA-2-repo/configs/accelerate/5.yaml +16 -0
  38. DA-2-repo/configs/accelerate/6.yaml +16 -0
  39. DA-2-repo/configs/accelerate/7.yaml +16 -0
  40. DA-2-repo/configs/eval.json +76 -0
  41. DA-2-repo/configs/infer.json +39 -0
  42. DA-2-repo/eval.py +29 -0
  43. DA-2-repo/eval.sh +7 -0
  44. DA-2-repo/eval/__init__.py +0 -0
  45. DA-2-repo/eval/datasets/__init__.py +35 -0
  46. DA-2-repo/eval/datasets/base_depth_dataset.py +268 -0
  47. DA-2-repo/eval/datasets/matterport3d_dataset.py +25 -0
  48. DA-2-repo/eval/datasets/panosuncg_dataset.py +26 -0
  49. DA-2-repo/eval/datasets/splits/2d3ds.txt +0 -0
  50. DA-2-repo/eval/datasets/splits/matterport3d.txt +0 -0
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ DA-2-repo/assets/badges/teaser.jpg filter=lfs diff=lfs merge=lfs -text
37
+ DA-2-repo/assets/demos/a0.png filter=lfs diff=lfs merge=lfs -text
38
+ DA-2-repo/assets/demos/a1.png filter=lfs diff=lfs merge=lfs -text
39
+ DA-2-repo/assets/demos/a10.png filter=lfs diff=lfs merge=lfs -text
40
+ DA-2-repo/assets/demos/a11.png filter=lfs diff=lfs merge=lfs -text
41
+ DA-2-repo/assets/demos/a2.png filter=lfs diff=lfs merge=lfs -text
42
+ DA-2-repo/assets/demos/a3.png filter=lfs diff=lfs merge=lfs -text
43
+ DA-2-repo/assets/demos/a4.png filter=lfs diff=lfs merge=lfs -text
44
+ DA-2-repo/assets/demos/a5.png filter=lfs diff=lfs merge=lfs -text
45
+ DA-2-repo/assets/demos/a6.png filter=lfs diff=lfs merge=lfs -text
46
+ DA-2-repo/assets/demos/a7.png filter=lfs diff=lfs merge=lfs -text
47
+ DA-2-repo/assets/demos/a8.png filter=lfs diff=lfs merge=lfs -text
48
+ DA-2-repo/assets/demos/a9.png filter=lfs diff=lfs merge=lfs -text
49
+ DA-2-repo/assets/demos/b0.png filter=lfs diff=lfs merge=lfs -text
50
+ DA-2-repo/assets/demos/b1.png filter=lfs diff=lfs merge=lfs -text
51
+ DA-2-repo/assets/demos/b2.png filter=lfs diff=lfs merge=lfs -text
52
+ DA-2-repo/assets/demos/b3.png filter=lfs diff=lfs merge=lfs -text
53
+ DA-2-repo/assets/demos/b4.png filter=lfs diff=lfs merge=lfs -text
54
+ DA-2-repo/assets/demos/b5.png filter=lfs diff=lfs merge=lfs -text
55
+ model.onnx.data filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .DS_Store
5
+ *.safetensors
6
+ .vscode/
DA-2-repo/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
DA-2-repo/README.md ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # <img src="assets/badges/icon2.png" alt="lotus" style="height:1.2em; vertical-align:bottom;"/>&nbsp;DA<sup>2</sup>: Depth Anything in Any Direction
2
+
3
+ [![Page](https://img.shields.io/badge/Project-Website-pink?logo=googlechrome&logoColor=white)](https://depth-any-in-any-dir.github.io/)
4
+ [![Paper](https://img.shields.io/badge/arXiv-Paper-b31b1b?logo=arxiv&logoColor=white)](http://arxiv.org/abs/2509.26618)
5
+ [![HuggingFace Demo](https://img.shields.io/badge/🤗%20HuggingFace-Demo%20-yellow)](https://huggingface.co/spaces/haodongli/DA-2)
6
+ [![Data](https://img.shields.io/badge/📂%20HuggingFace-Data-green)](https://huggingface.co/datasets/haodongli/DA-2)
7
+ [![Slides](https://img.shields.io/badge/Google-Slides-blue?logo=slideshare&logoColor=white)](https://docs.google.com/presentation/d/1QUonqLuYGEh0qcqY72pbTXsZimINlyN4rOogy7qX4GY/edit?usp=sharing)
8
+ [![BibTeX](https://img.shields.io/badge/BibTeX-grey?logo=googlescholar&logoColor=white)](https://depth-any-in-any-dir.github.io/bibtex.txt)
9
+
10
+ [Haodong Li](https://haodong2000.github.io/)<sup>123&sect;</sup>,
11
+ [Wangguangdong Zheng](https://wangguandongzheng.github.io/)<sup>1</sup>,
12
+ [Jing He](https://jingheya.github.io/)<sup>3</sup>,
13
+ [Yuhao Liu](https://yuhaoliu7456.github.io/)<sup>1</sup>,
14
+ [Xin Lin](https://linxin0.github.io/)<sup>2</sup>,
15
+ [Xin Yang](https://abnervictor.github.io/2023/06/12/Academic-Self-Intro.html)<sup>34</sup>,<br>
16
+ [Ying-Cong Chen](https://www.yingcong.me/)<sup>34&#9993;</sup>,
17
+ [Chunchao Guo]()<sup>1&#9993;</sup>
18
+
19
+ <span class="author-block"><sup>1</sup>Tencent Hunyuan</span>
20
+ <span class="author-block"><sup>2</sup>UC San Diego</span>
21
+ <span class="author-block"><sup>3</sup>HKUST(GZ)</span>
22
+ <span class="author-block"><sup>4</sup>HKUST</span><br>
23
+ <span class="author-block">
24
+ <sup>&sect;</sup>Work primarily done during an internship at Tencent Hunyuan.
25
+ <sup>&#9993;</sup>Corresponding author.
26
+ </span>
27
+
28
+ ![teaser](assets/badges/teaser.jpg)
29
+
30
+ <strong>DA<sup>2</sup> predicts dense, scale-invariant distance from a single 360&deg; panorama in an end-to-end manner, with remarkable geometric fidelity and strong zero-shot generalization.</strong>
31
+
32
+ ## 📢 News
33
+ - 2025-10-10 The curated panoramic data is released on [huggingface](https://huggingface.co/datasets/haodongli/DA-2)!
34
+ - 2025-10-10 The evaluation code and the [testing data](https://huggingface.co/datasets/haodongli/DA-2-Evaluation) are released!
35
+ - 2025-10-04 The 🤗Huggingface Gradio demo ([online](https://huggingface.co/spaces/haodongli/DA-2) and [local](https://github.com/EnVision-Research/DA-2?tab=readme-ov-file#-gradio-demo)) are released!
36
+ - 2025-10-04 The inference code and the [model](https://huggingface.co/haodongli/DA-2) are released!
37
+ - 2025-10-01 [Paper](https://arxiv.org/abs/2509.26618) released on arXiv!
38
+
39
+ ## 🛠️ Setup
40
+ > This installation was tested on: Ubuntu 20.04 LTS, Python 3.12, CUDA 12.2, NVIDIA GeForce RTX 3090.
41
+
42
+ 1. Clone the repository:
43
+ ```
44
+ git clone https://github.com/EnVision-Research/DA-2.git
45
+ cd DA-2
46
+ ```
47
+ 2. Install dependencies using conda:
48
+ ```
49
+ conda create -n da-2 python=3.12 -y
50
+ conda activate da-2
51
+ pip install -e src
52
+ ```
53
+ > For macOS users: Please remove `xformers==0.0.28.post2` (line 16) from `src/pyproject.toml` before `pip install -e src`, as [xFormers does not support macOS](https://github.com/facebookresearch/xformers/issues/775#issuecomment-1611284979).
54
+
55
+ ## 🤗 Gradio Demo
56
+ 1. Online demo: [Huggingface Space](https://huggingface.co/spaces/haodongli/DA-2)
57
+ 2. Local demo:
58
+ ```
59
+ python app.py
60
+ ```
61
+
62
+ ## 🕹️ Inference
63
+ > We've pre-uploaded the cases that appear in the [project page](https://depth-any-in-any-dir.github.io/). So you can proceed directly to step 3.
64
+
65
+ 1. Images are placed in a directory, e.g., `assets/demos`.
66
+ 2. (Optional) Masks (e.g., sky masks for outdoor images) in another directory, e.g., `assets/masks`. The filenames under both directories should be consistent.
67
+ 3. Run the inference command:
68
+ ```
69
+ sh infer.sh
70
+ ```
71
+ 4. The visualized distance and normal maps will be saved at `output/infer/vis_all.png`. The projected 3D point clouds will be saved at `output/infer/3dpc`.
72
+
73
+ ## 🚗 Evaluation
74
+ 1. Download the evaluation datasets from [huggingface](https://huggingface.co/datasets/haodongli/DA-2-Evaluation):
75
+ ```
76
+ cd [YOUR_DATA_DIR]
77
+ huggingface-cli login
78
+ hf download --repo-type dataset haodongli/DA-2-Evaluation --local-dir [YOUR_DATA_DIR]
79
+ ```
80
+ 2. Unzip the downloaded datasets:
81
+ ```
82
+ tar -zxvf [DATA_NAME].tar.gz
83
+ ```
84
+ 3. Set the `datasets_dir` (line 20) in `configs/eval.json` with `YOUR_DATA_DIR`.
85
+ 4. Run the evaluation command:
86
+ ```
87
+ sh eval.sh
88
+ ```
89
+ 5. The results will be saved at `output/eval`.
90
+
91
+ ## 🎓 Citation
92
+ If you find our work useful in your research, please consider citing our paper🌹:
93
+ ```bibtex
94
+ @article{li2025depth,
95
+ title={DA$^{2}$: Depth Anything in Any Direction},
96
+ author={Li, Haodong and Zheng, Wangguangdong and He, Jing and Liu, Yuhao and Lin, Xin and Yang, Xin and Chen, Ying-Cong and Guo, Chunchao},
97
+ journal={arXiv preprint arXiv:2509.26618},
98
+ year={2025}
99
+ }
100
+ ```
101
+
102
+ ## 🤝 Acknowledgement
103
+ This implementation is impossible without the awesome contributions of [MoGe](https://wangrc.site/MoGePage/), [UniK3D](https://lpiccinelli-eth.github.io/pub/unik3d/), [Lotus](https://lotus3d.github.io/), [Marigold](https://marigoldmonodepth.github.io/), [DINOv2](https://github.com/facebookresearch/dinov2), [Accelerate](https://github.com/huggingface/accelerate), [Gradio](https://github.com/gradio-app/gradio), [HuggingFace Hub](https://github.com/huggingface/huggingface_hub), and [PyTorch](https://pytorch.org/) to the open-source community.
DA-2-repo/app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from accelerate import Accelerator
3
+ from accelerate.logging import get_logger
4
+ from accelerate.utils import (
5
+ InitProcessGroupKwargs,
6
+ ProjectConfiguration,
7
+ set_seed
8
+ )
9
+ import torch
10
+ from contextlib import nullcontext
11
+ import trimesh
12
+ import gradio as gr
13
+ from gradio_imageslider import ImageSlider
14
+ from da2.utils.base import load_config
15
+ from da2.utils.model import load_model
16
+ from da2.utils.io import (
17
+ read_cv2_image,
18
+ torch_transform,
19
+ tensorize
20
+ )
21
+ from da2.utils.vis import colorize_distance
22
+ from da2.utils.d2pc import distance2pointcloud
23
+ from datetime import (
24
+ timedelta,
25
+ datetime
26
+ )
27
+ import cv2
28
+ import numpy as np
29
+
30
+ last_glb_path = None
31
+
32
+ def prepare_to_run_demo():
33
+ config = load_config('configs/infer.json')
34
+ kwargs = InitProcessGroupKwargs(timeout=timedelta(seconds=config['accelerator']['timeout']))
35
+ output_dir = f'output/infer'
36
+ if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True)
37
+ accu_steps = config['accelerator']['accumulation_nsteps']
38
+ accelerator = Accelerator(
39
+ gradient_accumulation_steps=accu_steps,
40
+ mixed_precision=config['accelerator']['mixed_precision'],
41
+ log_with=config['accelerator']['report_to'],
42
+ project_config=ProjectConfiguration(project_dir=output_dir),
43
+ kwargs_handlers=[kwargs]
44
+ )
45
+ logger = get_logger(__name__, log_level='INFO')
46
+ config['env']['logger'] = logger
47
+ set_seed(config['env']['seed'])
48
+ return config, accelerator
49
+
50
+ def read_mask_demo(mask_path, shape):
51
+ if mask_path is None:
52
+ return np.ones(shape[1:]) > 0
53
+ mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
54
+ mask = mask > 0
55
+ return mask
56
+
57
+ def load_infer_data_demo(image, mask, model_dtype, device):
58
+ cv2_image = read_cv2_image(image)
59
+ image = torch_transform(cv2_image)
60
+ mask = read_mask_demo(mask, image.shape)
61
+ image = tensorize(image, model_dtype, device)
62
+ return image, cv2_image, mask
63
+
64
+ def ply2glb(ply_path, glb_path):
65
+ pcd = trimesh.load(ply_path)
66
+ points = np.asarray(pcd.vertices)
67
+ colors = np.asarray(pcd.visual.vertex_colors)
68
+ cloud = trimesh.points.PointCloud(vertices=points, colors=colors)
69
+ cloud.export(glb_path)
70
+ os.remove(ply_path)
71
+
72
+ def fn(image_path, mask_path):
73
+ global last_glb_path
74
+ config, accelerator = prepare_to_run_demo()
75
+ model = load_model(config, accelerator)
76
+ image, cv2_image, mask = load_infer_data_demo(image_path, mask_path,
77
+ model_dtype=config['spherevit']['dtype'], device=accelerator.device)
78
+ if torch.backends.mps.is_available():
79
+ autocast_ctx = nullcontext()
80
+ else:
81
+ autocast_ctx = torch.autocast(accelerator.device.type)
82
+ with autocast_ctx, torch.no_grad():
83
+ distance = model(image).cpu().numpy()[0]
84
+ if last_glb_path is not None:
85
+ os.remove(last_glb_path)
86
+ distance_vis = colorize_distance(distance, mask)
87
+ save_path = f'cache/tmp_{datetime.now().strftime("%Y%m%d_%H%M%S")}.glb'
88
+ last_glb_path = save_path
89
+ normal_image = distance2pointcloud(distance, cv2_image, mask, save_path=save_path.replace('.glb', '.ply'), return_normal=True, save_distance=False)
90
+ ply2glb(save_path.replace('.glb', '.ply'), save_path)
91
+ return save_path, [distance_vis, normal_image]
92
+
93
+ inputs = [
94
+ gr.Image(label="Input Image", type="filepath"),
95
+ gr.Image(label="Input Mask", type="filepath"),
96
+ ]
97
+ outputs = [
98
+ gr.Model3D(clear_color=[0.0, 0.0, 0.0, 0.0], label="3D Point Cloud"),
99
+ gr.ImageSlider(
100
+ label="Output Depth / Normal (transformed from the depth)",
101
+ type="pil",
102
+ slider_position=75,
103
+ )
104
+ ]
105
+
106
+ demo = gr.Interface(
107
+ fn=fn,
108
+ title="DA<sup>2</sup>: <u>D</u>epth <u>A</u>nything in <u>A</u>ny <u>D</u>irection",
109
+ description="""
110
+ <p align="center">
111
+ <a title="Project Page" href="https://depth-any-in-any-dir.github.io/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
112
+ <img src="https://img.shields.io/badge/Project-Website-pink?logo=googlechrome&logoColor=white">
113
+ </a>
114
+ <a title="arXiv" href="http://arxiv.org/abs/2509.26618" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
115
+ <img src="https://img.shields.io/badge/arXiv-Paper-b31b1b?logo=arxiv&logoColor=white">
116
+ </a>
117
+ <a title="Github" href="https://github.com/EnVision-Research/DA-2" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
118
+ <img src="https://img.shields.io/github/stars/EnVision-Research/DA-2?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
119
+ </a>
120
+ <a title="Social" href="https://x.com/_akhaliq/status/1973283687652606411" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
121
+ <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
122
+ </a>
123
+ <a title="Social" href="https://x.com/haodongli00/status/1973287870317338747" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
124
+ <img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
125
+ </a>
126
+ <br>
127
+ <strong>Please consider starring <span style="color: orange">&#9733;</span> our <a href="https://github.com/EnVision-Research/DA-2" target="_blank" rel="noopener noreferrer">GitHub Repo</a> if you find this demo useful!</strong>
128
+ </p>
129
+ <p><strong>Note: the "Input Mask" is optional, all pixels are assumed to be valid if mask is None.</strong></p>
130
+ """,
131
+ inputs=inputs,
132
+ outputs=outputs,
133
+ examples=[
134
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a1.png"), None],
135
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a2.png"), None],
136
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a3.png"), None],
137
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a4.png"), None],
138
+ [os.path.join(os.path.dirname(__file__), "assets/demos/b0.png"),
139
+ os.path.join(os.path.dirname(__file__), "assets/masks/b0.png")],
140
+ [os.path.join(os.path.dirname(__file__), "assets/demos/b1.png"),
141
+ os.path.join(os.path.dirname(__file__), "assets/masks/b1.png")],
142
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a5.png"), None],
143
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a6.png"), None],
144
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a7.png"), None],
145
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a8.png"), None],
146
+ [os.path.join(os.path.dirname(__file__), "assets/demos/b2.png"),
147
+ os.path.join(os.path.dirname(__file__), "assets/masks/b2.png")],
148
+ [os.path.join(os.path.dirname(__file__), "assets/demos/b3.png"),
149
+ os.path.join(os.path.dirname(__file__), "assets/masks/b3.png")],
150
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a9.png"), None],
151
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a10.png"), None],
152
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a11.png"), None],
153
+ [os.path.join(os.path.dirname(__file__), "assets/demos/a0.png"), None],
154
+ [os.path.join(os.path.dirname(__file__), "assets/demos/b4.png"),
155
+ os.path.join(os.path.dirname(__file__), "assets/masks/b4.png")],
156
+ [os.path.join(os.path.dirname(__file__), "assets/demos/b5.png"),
157
+ os.path.join(os.path.dirname(__file__), "assets/masks/b5.png")],
158
+ ],
159
+ examples_per_page=20
160
+ )
161
+
162
+ demo.launch(
163
+ server_name="0.0.0.0",
164
+ server_port=6381,
165
+ )
DA-2-repo/assets/badges/icon2.png ADDED
DA-2-repo/assets/badges/teaser.jpg ADDED

Git LFS Details

  • SHA256: 5c6786218d0a17115e6ed1320434b2b47101290a7e244f2eed1ebe70e4822464
  • Pointer size: 132 Bytes
  • Size of remote file: 1.2 MB
DA-2-repo/assets/demos/a0.png ADDED

Git LFS Details

  • SHA256: eedc66f98cf0a949602f691c3eed51511ae520cf8f63674abe542741ba6090b8
  • Pointer size: 131 Bytes
  • Size of remote file: 744 kB
DA-2-repo/assets/demos/a1.png ADDED

Git LFS Details

  • SHA256: 906f336ab4c6561ee85b9cb883a6aa34cf11289fc86b6a4e4382baed56981aa7
  • Pointer size: 131 Bytes
  • Size of remote file: 822 kB
DA-2-repo/assets/demos/a10.png ADDED

Git LFS Details

  • SHA256: d6d058aef9322964f5d36de90ab91470e283acab248604bcd488a43c680a9e7d
  • Pointer size: 131 Bytes
  • Size of remote file: 882 kB
DA-2-repo/assets/demos/a11.png ADDED

Git LFS Details

  • SHA256: 45af8c71b8d44880503b5da1b5f67a0d5638860b9f9149cae7d16a3a3975d090
  • Pointer size: 131 Bytes
  • Size of remote file: 848 kB
DA-2-repo/assets/demos/a2.png ADDED

Git LFS Details

  • SHA256: 6fa931d70c6220cec0b56a9cdf651f12fa35436d937cd2cf481d10dddb2a114e
  • Pointer size: 131 Bytes
  • Size of remote file: 810 kB
DA-2-repo/assets/demos/a3.png ADDED

Git LFS Details

  • SHA256: a85573ac5d51a261d82b23475488e769bd9b3e392948e60e6dc73f0c7ace762b
  • Pointer size: 131 Bytes
  • Size of remote file: 854 kB
DA-2-repo/assets/demos/a4.png ADDED

Git LFS Details

  • SHA256: d0a544ec4b542c59f1fbfaf99f86eb60b4c0dbce7c8e4b1bac9e6e23e889c7ec
  • Pointer size: 131 Bytes
  • Size of remote file: 813 kB
DA-2-repo/assets/demos/a5.png ADDED

Git LFS Details

  • SHA256: 7e36ed78b74223eae24f8c85f1cdab00d1a3a5b494fec807240cb7d3427fad87
  • Pointer size: 131 Bytes
  • Size of remote file: 848 kB
DA-2-repo/assets/demos/a6.png ADDED

Git LFS Details

  • SHA256: e48031fcd3e5a84e4ea4513a23e2ec8150f8ec3fbdae1d4b2d51fc67ac588fe6
  • Pointer size: 131 Bytes
  • Size of remote file: 818 kB
DA-2-repo/assets/demos/a7.png ADDED

Git LFS Details

  • SHA256: 12b99fdddea8eefb6885114bd386fc4fad0484e13c85c88364a43396f9cef3f9
  • Pointer size: 131 Bytes
  • Size of remote file: 905 kB
DA-2-repo/assets/demos/a8.png ADDED

Git LFS Details

  • SHA256: 5b29df5b6294742acc43d8ce41073b335e98024459273b77d9b943fd3583ac35
  • Pointer size: 131 Bytes
  • Size of remote file: 784 kB
DA-2-repo/assets/demos/a9.png ADDED

Git LFS Details

  • SHA256: ba92bf3adf1d1b2a775d5b0f895a16876159fc1a43d98328c923fdc994d6e346
  • Pointer size: 131 Bytes
  • Size of remote file: 910 kB
DA-2-repo/assets/demos/b0.png ADDED

Git LFS Details

  • SHA256: 3b610ae826372778853553810ef0e07e4f91d8507549dc0f5f32eca038348a37
  • Pointer size: 131 Bytes
  • Size of remote file: 850 kB
DA-2-repo/assets/demos/b1.png ADDED

Git LFS Details

  • SHA256: 2df3207be859cf8524e9a00a76efb606e626ca4cc9dbd81178fe24de43a6b97b
  • Pointer size: 131 Bytes
  • Size of remote file: 798 kB
DA-2-repo/assets/demos/b2.png ADDED

Git LFS Details

  • SHA256: 790218133cd507f1f9ca65fcdff60f74325df39ebd0df1d5b6e6261a8dfd29a8
  • Pointer size: 131 Bytes
  • Size of remote file: 863 kB
DA-2-repo/assets/demos/b3.png ADDED

Git LFS Details

  • SHA256: 843b680077e114451285efc6536e811739cbbab07ade423459a5bc24e747455f
  • Pointer size: 131 Bytes
  • Size of remote file: 651 kB
DA-2-repo/assets/demos/b4.png ADDED

Git LFS Details

  • SHA256: 5615e49fa1bea5ee049a66bbe577d48dd63f441e86a4ae5b225136e7e2295187
  • Pointer size: 131 Bytes
  • Size of remote file: 804 kB
DA-2-repo/assets/demos/b5.png ADDED

Git LFS Details

  • SHA256: 7957ee9e54dd6b61b74014412ece3de7bbe999ae0c0be41c4d762d62d8352656
  • Pointer size: 131 Bytes
  • Size of remote file: 669 kB
DA-2-repo/assets/masks/b0.png ADDED
DA-2-repo/assets/masks/b1.png ADDED
DA-2-repo/assets/masks/b2.png ADDED
DA-2-repo/assets/masks/b3.png ADDED
DA-2-repo/assets/masks/b4.png ADDED
DA-2-repo/assets/masks/b5.png ADDED
DA-2-repo/configs/accelerate/0.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: '0'
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'no'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
DA-2-repo/configs/accelerate/1.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: '1'
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'no'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
DA-2-repo/configs/accelerate/2.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: '2'
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'no'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
DA-2-repo/configs/accelerate/3.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: '3'
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'no'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
DA-2-repo/configs/accelerate/4.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: '4'
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'no'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
DA-2-repo/configs/accelerate/5.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: '5'
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'no'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
DA-2-repo/configs/accelerate/6.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: '6'
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'no'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
DA-2-repo/configs/accelerate/7.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: 'NO'
4
+ downcast_bf16: 'no'
5
+ gpu_ids: '7'
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: 'no'
9
+ num_machines: 1
10
+ num_processes: 1
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
DA-2-repo/configs/eval.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "env": {
3
+ "seed": 42,
4
+ "verbose": true
5
+ },
6
+ "accelerator": {
7
+ "report_to": ["tensorboard"],
8
+ "mixed_precision": "fp16",
9
+ "accumulation_nsteps": 4,
10
+ "timeout": 36000
11
+ },
12
+ "inference": {
13
+ "images": "assets/demos",
14
+ "masks": "assets/masks",
15
+ "min_pixels": 580000,
16
+ "max_pixels": 620000
17
+ },
18
+ "evaluation": {
19
+ "alignment": "median",
20
+ "datasets_dir": "/home/haodong/data/DA-2/Evaluation",
21
+ "metric_names": [
22
+ "abs_relative_difference",
23
+ "squared_relative_difference",
24
+ "rmse_linear",
25
+ "rmse_log",
26
+ "log10",
27
+ "delta1_acc",
28
+ "delta2_acc",
29
+ "delta3_acc",
30
+ "i_rmse",
31
+ "silog_rmse"
32
+ ],
33
+ "metric_show": {
34
+ "abs_relative_difference": "AbsRel",
35
+ "delta1_acc": "δ_1"
36
+ },
37
+ "datasets": {
38
+ "2d3ds": {
39
+ "dir": "2D3DS/wo_xyz",
40
+ "filenames": "eval/datasets/splits/2d3ds.txt",
41
+ "alignment_max_res": 2048
42
+ },
43
+ "matterport3d": {
44
+ "dir": "Matterport3D",
45
+ "filenames": "eval/datasets/splits/matterport3d.txt",
46
+ "alignment_max_res": 2048
47
+ },
48
+ "panosuncg": {
49
+ "dir": "PanoSUNCG/rotated",
50
+ "filenames": "eval/datasets/splits/panosuncg.txt",
51
+ "alignment_max_res": 1024
52
+ }
53
+ }
54
+ },
55
+ "spherevit": {
56
+ "vit_w_esphere": {
57
+ "input_dims": [1024, 1024, 1024, 1024],
58
+ "hidden_dim": 512,
59
+ "num_heads": 8,
60
+ "expansion": 4,
61
+ "num_layers_head": [2, 2, 2],
62
+ "dropout": 0.0,
63
+ "layer_scale": 0.0001,
64
+ "out_dim": 64,
65
+ "kernel_size": 3,
66
+ "num_prompt_blocks": 1,
67
+ "use_norm": false
68
+ },
69
+ "sphere": {
70
+ "width": 1092,
71
+ "height": 546,
72
+ "hfov": 6.2832,
73
+ "vfov": 3.1416
74
+ }
75
+ }
76
+ }
DA-2-repo/configs/infer.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "env": {
3
+ "seed": 42,
4
+ "verbose": true
5
+ },
6
+ "accelerator": {
7
+ "report_to": ["tensorboard"],
8
+ "mixed_precision": "fp16",
9
+ "accumulation_nsteps": 4,
10
+ "timeout": 36000
11
+ },
12
+ "inference": {
13
+ "images": "assets/demos",
14
+ "masks": "assets/masks",
15
+ "min_pixels": 580000,
16
+ "max_pixels": 620000
17
+ },
18
+ "spherevit": {
19
+ "vit_w_esphere": {
20
+ "input_dims": [1024, 1024, 1024, 1024],
21
+ "hidden_dim": 512,
22
+ "num_heads": 8,
23
+ "expansion": 4,
24
+ "num_layers_head": [2, 2, 2],
25
+ "dropout": 0.0,
26
+ "layer_scale": 0.0001,
27
+ "out_dim": 64,
28
+ "kernel_size": 3,
29
+ "num_prompt_blocks": 1,
30
+ "use_norm": false
31
+ },
32
+ "sphere": {
33
+ "width": 1092,
34
+ "height": 546,
35
+ "hfov": 6.2832,
36
+ "vfov": 3.1416
37
+ }
38
+ }
39
+ }
DA-2-repo/eval.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from contextlib import nullcontext
4
+ from tqdm import tqdm
5
+ from da2 import (
6
+ prepare_to_run,
7
+ load_model
8
+ )
9
+ from eval.utils import run_evaluation
10
+
11
+
12
def eval(model, config, accelerator, output_dir):
    """Evaluate *model* on every dataset listed under config['evaluation'].

    Runs only on the main process; inference happens under autocast (skipped
    on Apple MPS, which does not support it here) and with gradients disabled.
    Per-dataset metrics selected by config['evaluation']['metric_show'] are
    written to the configured logger.
    """
    model = model.eval()
    eval_cfg = config['evaluation']
    if accelerator.is_main_process:
        # MPS has no autocast support for this path — use a no-op context there.
        autocast_ctx = (
            nullcontext()
            if torch.backends.mps.is_available()
            else torch.autocast(accelerator.device.type)
        )
        with autocast_ctx, torch.no_grad():
            for dataset_name in eval_cfg['datasets']:
                metrics = run_evaluation(model, config, dataset_name, output_dir, accelerator.device)
                for metric_name, shown_name in eval_cfg['metric_show'].items():
                    config['env']['logger'].info(
                        f"\033[92mEVAL --> {dataset_name}: {shown_name} = {metrics[metric_name]}.\033[0m"
                    )
25
+
26
if __name__ == '__main__':
    # Entry point: build config + accelerator, load weights, then evaluate.
    config, accelerator, output_dir = prepare_to_run()
    model = load_model(config, accelerator)
    eval(model, config, accelerator, output_dir)
DA-2-repo/eval.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch single-GPU evaluation through HuggingFace Accelerate.
# CUDA selects which configs/accelerate/<id>.yaml (and thus which GPU) is used.
export CUDA=0
export CONFIG_PATH="configs/eval.json"
# Quote all expansions so paths containing spaces cannot split the command line.
accelerate launch \
    --config_file="configs/accelerate/${CUDA}.yaml" \
    --mixed_precision="fp16" \
    --main_process_port="12345" \
    eval.py --config_path="$CONFIG_PATH"
DA-2-repo/eval/__init__.py ADDED
File without changes
DA-2-repo/eval/datasets/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Authors: Bingxin Ke, Haodong Li
2
+ # Last modified: 2025-05-25
3
+ # Note: Add PanoSUNCGDataset, Matterport3DDataset, Stanford2D3DSDataset for 360° depth (or distance) evaluation.
4
+
5
+ import os
6
+
7
+ from .base_depth_dataset import BaseDepthDataset, get_pred_name, DatasetMode
8
+ from .stanford2d3ds_dataset import Stanford2D3DSDataset
9
+ from .matterport3d_dataset import Matterport3DDataset
10
+ from .panosuncg_dataset import PanoSUNCGDataset
11
+
12
# Registry mapping the dataset key used in eval configs (configs/eval.json,
# "evaluation.datasets") to its Dataset implementation.
dataset_name_class_dict = {
    "2d3ds": Stanford2D3DSDataset,
    "matterport3d": Matterport3DDataset,
    "panosuncg": PanoSUNCGDataset
}
17
+
18
+
19
def get_dataset(
    cfg_data_split, dataset_name, base_data_dir: str, mode: DatasetMode, **kwargs
) -> BaseDepthDataset:
    """Instantiate the evaluation dataset registered under *dataset_name*.

    Args:
        cfg_data_split: Per-dataset config mapping; must contain 'filenames'
            (path to the split list) and 'dir' (subdirectory under
            *base_data_dir*). All of its keys are also forwarded to the
            dataset constructor as keyword arguments.
        dataset_name: Key into ``dataset_name_class_dict``.
        base_data_dir: Root directory holding all evaluation datasets.
        mode: DatasetMode controlling which rasters the dataset returns.
        **kwargs: Extra keyword arguments forwarded to the dataset class.

    Returns:
        A constructed BaseDepthDataset subclass instance.

    Raises:
        NotImplementedError: If *dataset_name* is not registered.
    """
    if dataset_name not in dataset_name_class_dict:
        # Name the offending key so config typos are easy to diagnose.
        raise NotImplementedError(f"Unknown dataset: {dataset_name!r}")
    dataset_class = dataset_name_class_dict[dataset_name]
    return dataset_class(
        mode=mode,
        filename_ls_path=cfg_data_split['filenames'],
        dataset_dir=os.path.join(base_data_dir, cfg_data_split['dir']),
        disp_name=dataset_name,
        **cfg_data_split,
        **kwargs,
    )
DA-2-repo/eval/datasets/base_depth_dataset.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Author: Bingxin Ke
2
+ # Last modified: 2024-04-15
3
+
4
+ import io
5
+ import os
6
+ import random
7
+ import tarfile
8
+ from enum import Enum
9
+
10
+ import numpy as np
11
+ import cv2
12
+ import torch
13
+ from PIL import Image
14
+ from torch.utils.data import Dataset
15
+ from torchvision.transforms import InterpolationMode, Resize
16
+
17
+
18
class DatasetMode(Enum):
    # Controls what BaseDepthDataset.__getitem__ returns:
    RGB_ONLY = "rgb_only"  # RGB rasters only; depth loading is skipped
    EVAL = "evaluate"      # RGB + raw/filled depth + valid masks
    TRAIN = "train"        # as EVAL, plus the training preprocessing pipeline
+
23
+
24
def read_image_from_tar(tar_obj, img_rel_path):
    """Read one image from an open tarfile and return it as a PIL Image.

    Members are addressed with a "./" prefix, matching how the archives
    were created (see BaseDepthDataset._read_image).
    """
    member = tar_obj.extractfile("./" + img_rel_path)
    data = member.read()
    # BUG FIX: the original version opened the image but never returned it,
    # so every call produced None.
    return Image.open(io.BytesIO(data))
+
29
+
30
class BaseDepthDataset(Dataset):
    """Base class for depth-evaluation datasets.

    Handles split-file loading, RGB/depth reading (from a directory or a tar
    archive), valid-mask computation, and — in TRAIN mode — augmentation,
    normalization, and resizing. Subclasses override ``_read_depth_file`` to
    decode their dataset-specific depth encoding.
    """

    def __init__(
        self,
        mode: DatasetMode,
        filename_ls_path: str,
        dataset_dir: str,
        disp_name: str,
        min_depth,
        max_depth,
        has_filled_depth,
        name_mode,
        depth_transform=None,
        augmentation_args: dict = None,
        resize_to_hw=None,
        move_invalid_to_far_plane: bool = True,
        rgb_transform=lambda x: x / 255.0 * 2 - 1,  # [0, 255] -> [-1, 1]
        **kwargs,
    ) -> None:
        super().__init__()
        self.mode = mode
        # Dataset info
        self.filename_ls_path = filename_ls_path
        self.dataset_dir = dataset_dir
        self.disp_name = disp_name
        self.has_filled_depth = has_filled_depth
        self.name_mode: DepthFileNameMode = name_mode
        # Depth values outside (min_depth, max_depth) are masked as invalid.
        self.min_depth = min_depth
        self.max_depth = max_depth

        # Training arguments
        self.depth_transform = depth_transform
        self.augm_args = augmentation_args
        self.resize_to_hw = resize_to_hw
        self.rgb_transform = rgb_transform
        self.move_invalid_to_far_plane = move_invalid_to_far_plane

        # Load filenames: each split line lists the RGB path followed by the
        # depth path(s), e.g. [['rgb.png', 'depth.tif'], ...]
        with open(self.filename_ls_path, "r") as f:
            self.filenames = [s.split() for s in f.readlines()]

        # Tar dataset: dataset_dir may point at a tar archive instead of a
        # directory; the archive is opened lazily in _read_image.
        self.tar_obj = None
        self.is_tar = os.path.isfile(dataset_dir) and tarfile.is_tarfile(dataset_dir)

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        rasters, other = self._get_data_item(index)
        if DatasetMode.TRAIN == self.mode:
            rasters = self._training_preprocess(rasters)
        # Merge raster tensors with metadata into a single output dict.
        outputs = rasters
        outputs.update(other)
        return outputs

    def _get_data_item(self, index):
        """Load the RGB (and, unless RGB_ONLY, depth + valid masks) for one sample."""
        rgb_rel_path, depth_rel_path, filled_rel_path = self._get_data_path(index=index)

        rasters = {}

        # RGB data
        rasters.update(self._load_rgb_data(rgb_rel_path=rgb_rel_path))

        # Depth data
        if DatasetMode.RGB_ONLY != self.mode:
            depth_data = self._load_depth_data(
                depth_rel_path=depth_rel_path, filled_rel_path=filled_rel_path
            )
            rasters.update(depth_data)
            # Valid masks: pixels within the (min_depth, max_depth) range.
            rasters["valid_mask_raw"] = self._get_valid_mask(
                rasters["depth_raw_linear"]
            ).clone()
            rasters["valid_mask_filled"] = self._get_valid_mask(
                rasters["depth_filled_linear"]
            ).clone()

        other = {"index": index, "rgb_relative_path": rgb_rel_path}

        return rasters, other

    def _load_rgb_data(self, rgb_rel_path):
        """Return {'rgb_int': [3, H, W] int tensor} for the given relative path."""
        rgb = self._read_rgb_file(rgb_rel_path)
        return {
            "rgb_int": torch.from_numpy(rgb).int(),
        }

    def _load_depth_data(self, depth_rel_path, filled_rel_path):
        """Return raw (and filled, when available) depth as [1, H, W] float tensors.

        When the dataset has no filled depth, 'depth_filled_linear' is a copy
        of the raw depth so downstream code can rely on the key existing.
        """
        outputs = {}
        depth_raw = self._read_depth_file(depth_rel_path).squeeze()
        depth_raw_linear = torch.from_numpy(depth_raw).float().unsqueeze(0)  # [1, H, W]
        outputs["depth_raw_linear"] = depth_raw_linear.clone()

        if self.has_filled_depth:
            depth_filled = self._read_depth_file(filled_rel_path).squeeze()
            depth_filled_linear = torch.from_numpy(depth_filled).float().unsqueeze(0)
            outputs["depth_filled_linear"] = depth_filled_linear
        else:
            outputs["depth_filled_linear"] = depth_raw_linear.clone()

        return outputs

    def _get_data_path(self, index):
        """Split one filename-list line into (rgb, depth, filled) relative paths."""
        filename_line = self.filenames[index]

        rgb_rel_path = filename_line[0]

        depth_rel_path, filled_rel_path = None, None
        if DatasetMode.RGB_ONLY != self.mode:
            depth_rel_path = filename_line[1]
            if self.has_filled_depth:
                filled_rel_path = filename_line[2]
        return rgb_rel_path, depth_rel_path, filled_rel_path

    def _read_image(self, img_rel_path) -> np.ndarray:
        """Read an image, transparently supporting tar-archived datasets."""
        if self.is_tar:
            if self.tar_obj is None:
                # Open lazily so each DataLoader worker gets its own handle.
                self.tar_obj = tarfile.open(self.dataset_dir)
            image = self.tar_obj.extractfile("./" + img_rel_path)
            image = image.read()
            image = Image.open(io.BytesIO(image))  # [H, W, rgb]
        else:
            img_path = os.path.join(self.dataset_dir, img_rel_path)
            image = Image.open(img_path).convert('RGB')
        image = np.asarray(image)
        return image

    def _read_depth_cv2(self, img_rel_path) -> np.ndarray:
        """Read a depth map with OpenCV, preserving the original bit depth."""
        depth_path = os.path.join(self.dataset_dir, img_rel_path)
        depth_in = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
        # BUG FIX: guard on ndim first — single-channel depth maps load as
        # 2-D arrays, and indexing shape[2] on them raised IndexError.
        if depth_in.ndim == 3 and depth_in.shape[2] == 3:  # 3-channel image
            depth_in = depth_in[..., 0]  # PANO: all channels carry the same value
        depth_in = depth_in.astype(np.float32)
        return depth_in

    def _read_rgb_file(self, rel_path) -> np.ndarray:
        """Read an RGB image as a [3, H, W] int array (alpha channel dropped)."""
        rgb = self._read_image(rel_path)
        # Handle RGBA images by converting to RGB
        if rgb.shape[2] == 4:  # 4 channels (RGBA)
            rgb = rgb[:, :, :3]  # keep only the RGB channels
        rgb = np.transpose(rgb, (2, 0, 1)).astype(int)  # [rgb, H, W]
        return rgb

    def _read_depth_file(self, rel_path):
        """Decode a depth file. Subclasses override this for dataset-specific encodings."""
        depth_in = self._read_image(rel_path)
        # Base class performs no decoding.
        depth_decoded = depth_in

        return depth_decoded

    def _get_valid_mask(self, depth: torch.Tensor):
        """Boolean mask of pixels strictly inside (min_depth, max_depth)."""
        valid_mask = torch.logical_and(
            (depth > self.min_depth), (depth < self.max_depth)
        ).bool()
        return valid_mask

    def _training_preprocess(self, rasters):
        """Apply augmentation, depth normalization, far-plane fill, and resizing."""
        # Augmentation
        if self.augm_args is not None:
            rasters = self._augment_data(rasters)

        # Normalization
        rasters["depth_raw_norm"] = self.depth_transform(
            rasters["depth_raw_linear"], rasters["valid_mask_raw"]
        ).clone()
        rasters["depth_filled_norm"] = self.depth_transform(
            rasters["depth_filled_linear"], rasters["valid_mask_filled"]
        ).clone()

        # Set invalid pixels to the far plane so they carry a well-defined value.
        if self.move_invalid_to_far_plane:
            if self.depth_transform.far_plane_at_max:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_max
                )
            else:
                rasters["depth_filled_norm"][~rasters["valid_mask_filled"]] = (
                    self.depth_transform.norm_min
                )

        # Resize (nearest-exact keeps depth values and masks unblended).
        if self.resize_to_hw is not None:
            resize_transform = Resize(
                size=self.resize_to_hw, interpolation=InterpolationMode.NEAREST_EXACT
            )
            rasters = {k: resize_transform(v) for k, v in rasters.items()}

        return rasters

    def _augment_data(self, rasters_dict):
        """Random left-right flip applied consistently to every raster."""
        # NOTE(review): augm_args is accessed by attribute (e.g. an OmegaConf
        # node), although the signature annotates it as dict — confirm callers.
        lr_flip_p = self.augm_args.lr_flip_p
        if random.random() < lr_flip_p:
            rasters_dict = {k: v.flip(-1) for k, v in rasters_dict.items()}

        return rasters_dict

    def __del__(self):
        # Close a lazily opened tar handle, if any.
        if self.tar_obj is not None:
            self.tar_obj.close()
            self.tar_obj = None
+
245
+
246
# Prediction file naming modes
class DepthFileNameMode(Enum):
    id = 1  # id.png
    rgb_id = 2  # rgb_id.png
    i_d_rgb = 3  # i_d_1_rgb.png
    rgb_i_d = 4


def get_pred_name(rgb_basename, name_mode, suffix=".png"):
    """Derive the prediction filename for *rgb_basename* under *name_mode*.

    The derived name always ends with *suffix*, regardless of the RGB
    file's original extension. Raises NotImplementedError for an
    unrecognized naming mode.
    """
    if name_mode == DepthFileNameMode.id:
        stem = "pred_" + rgb_basename
    elif name_mode == DepthFileNameMode.rgb_id:
        stem = "pred_" + rgb_basename.split("_")[1]
    elif name_mode == DepthFileNameMode.rgb_i_d:
        stem = "pred_" + "_".join(rgb_basename.split("_")[1:])
    elif name_mode == DepthFileNameMode.i_d_rgb:
        stem = rgb_basename.replace("_rgb.", "_pred.")
    else:
        raise NotImplementedError
    # Swap whatever extension remains for the requested suffix.
    return os.path.splitext(stem)[0] + suffix
DA-2-repo/eval/datasets/matterport3d_dataset.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Author: Haodong Li
2
+ # Last modified: 2025-05-25
3
+
4
+ from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
5
+ import cv2
6
+ import os
7
+
8
class Matterport3DDataset(BaseDepthDataset):
    """Matterport3D 360° evaluation dataset.

    Depth is stored as 16-bit PNGs scaled by 2560 units per metre; pixels
    outside (1e-3, 5) metres are treated as invalid by the base class.
    """

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(
            # Matterport3D dataset parameters
            min_depth=1e-3,
            max_depth=5,
            has_filled_depth=False,
            name_mode=DepthFileNameMode.id,
            **kwargs,
        )

    def _read_depth_file(self, rel_path):
        """Decode a Matterport3D depth PNG into metres."""
        img_path = os.path.join(self.dataset_dir, rel_path)
        depth_in = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
        # cv2.imread returns None on a missing/unreadable file; fail loudly
        # instead of raising an opaque TypeError on the division below.
        if depth_in is None:
            raise FileNotFoundError(f"Cannot read depth file: {img_path}")
        depth_decoded = depth_in / 2560.0  # stored units -> metres
        return depth_decoded
DA-2-repo/eval/datasets/panosuncg_dataset.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Author: Haodong Li
2
+ # Last modified: 2025-05-25
3
+
4
+ from .base_depth_dataset import BaseDepthDataset, DepthFileNameMode
5
+ import cv2
6
+ import os
7
+
8
class PanoSUNCGDataset(BaseDepthDataset):
    """PanoSUNCG 360° evaluation dataset.

    Depth PNGs carry the same value in every channel; channel 0 is taken and
    divided by 20 to recover metres. Pixels outside (1e-3, 5) metres are
    treated as invalid by the base class.
    """

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(
            # PanoSUNCG dataset parameters
            min_depth=1e-3,
            max_depth=5,
            has_filled_depth=False,
            name_mode=DepthFileNameMode.id,
            **kwargs,
        )

    def _read_depth_file(self, rel_path):
        """Decode a PanoSUNCG depth PNG into metres."""
        img_path = os.path.join(self.dataset_dir, rel_path)
        depth_in = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)
        # cv2.imread returns None on a missing/unreadable file; fail loudly
        # instead of raising an opaque TypeError on the indexing below.
        if depth_in is None:
            raise FileNotFoundError(f"Cannot read depth file: {img_path}")
        depth_in = depth_in[..., 0]  # all channels are identical; keep one
        depth_decoded = depth_in / 20.0  # stored units -> metres
        return depth_decoded
DA-2-repo/eval/datasets/splits/2d3ds.txt ADDED
The diff for this file is too large to render. See raw diff
 
DA-2-repo/eval/datasets/splits/matterport3d.txt ADDED
The diff for this file is too large to render. See raw diff