Spaces:

akhaliq
/

depth-pro

Running on Zero

App Files Files Community

akhaliq HF Staff committed on Oct 4, 2024

Commit

de1b1de

verified ·

1 Parent(s): 5795657

Upload folder using huggingface_hub

Browse files

Files changed (23) hide show

.gitattributes +1 -0
ACKNOWLEDGEMENTS.md +418 -0
CODE_OF_CONDUCT.md +71 -0
CONTRIBUTING.md +11 -0
LICENSE +47 -0
README.md +97 -12
data/depth-pro-teaser.jpg +0 -0
data/example.jpg +3 -0
get_pretrained_models.sh +8 -0
pyproject.toml +59 -0
src/depth_pro/__init__.py +5 -0
src/depth_pro/cli/__init__.py +4 -0
src/depth_pro/cli/run.py +149 -0
src/depth_pro/depth_pro.py +298 -0
src/depth_pro/eval/boundary_metrics.py +332 -0
src/depth_pro/eval/dis5k_sample_list.txt +200 -0
src/depth_pro/network/__init__.py +2 -0
src/depth_pro/network/decoder.py +206 -0
src/depth_pro/network/encoder.py +332 -0
src/depth_pro/network/fov.py +82 -0
src/depth_pro/network/vit.py +123 -0
src/depth_pro/network/vit_factory.py +124 -0
src/depth_pro/utils.py +112 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/example.jpg filter=lfs diff=lfs merge=lfs -text

ACKNOWLEDGEMENTS.md ADDED Viewed

	@@ -0,0 +1,418 @@

+Acknowledgements
+Portions of this Software may utilize the following copyrighted
+material, the use of which is hereby acknowledged.
+------------------------------------------------
+PyTorch Image Models (timm)
+Ross Wightman
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2019 Ross Wightman
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+------------------------------------------------
+DINOv2: Learning Robust Visual Features without Supervision
+Github source: https://github.com/facebookresearch/dinov2
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,71 @@

+# Code of Conduct
+## Our Pledge
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+## Our Standards
+Examples of behavior that contributes to creating a positive environment
+include:
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+Examples of unacceptable behavior by participants include:
+* The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Our Responsibilities
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+## Scope
+This Code of Conduct applies within all project spaces, and it also applies when
+an individual is representing the project or its community in public spaces.
+Examples of representing a project or community include using an official
+project e-mail address, posting via an official social media account, or acting
+as an appointed representative at an online or offline event. Representation of
+a project may be further defined and clarified by project maintainers.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the open source team at [opensource-conduct@group.apple.com](mailto:opensource-conduct@group.apple.com). All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4,
+available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html)

CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,11 @@

+# Contribution Guide
+Thanks for your interest in contributing. This project was released to accompany a research paper for purposes of reproducibility, and beyond its publication there are limited plans for future development of the repository.
+While we welcome new pull requests and issues, please note that our response may be limited. Forks and out-of-tree improvements are strongly encouraged.
+## Before you get started
+By submitting a pull request, you represent that you have the right to license your contribution to Apple and the community, and agree by submitting the patch that your contributions are licensed under the [LICENSE](LICENSE).
+We ask that all community members read and observe our [Code of Conduct](CODE_OF_CONDUCT.md).

LICENSE ADDED Viewed

	@@ -0,0 +1,47 @@

+Copyright (C) 2024 Apple Inc. All Rights Reserved.
+Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple
+Inc. ("Apple") in consideration of your agreement to the following
+terms, and your use, installation, modification or redistribution of
+this Apple software constitutes acceptance of these terms.  If you do
+not agree with these terms, please do not use, install, modify or
+redistribute this Apple software.
+In consideration of your agreement to abide by the following terms, and
+subject to these terms, Apple grants you a personal, non-exclusive
+license, under Apple's copyrights in this original Apple software (the
+"Apple Software"), to use, reproduce, modify and redistribute the Apple
+Software, with or without modifications, in source and/or binary forms;
+provided that if you redistribute the Apple Software in its entirety and
+without modifications, you must retain this notice and the following
+text and disclaimers in all such redistributions of the Apple Software.
+Neither the name, trademarks, service marks or logos of Apple Inc. may
+be used to endorse or promote products derived from the Apple Software
+without specific prior written permission from Apple.  Except as
+expressly stated in this notice, no other rights or licenses, express or
+implied, are granted by Apple herein, including but not limited to any
+patent rights that may be infringed by your derivative works or by other
+works in which the Apple Software may be incorporated.
+The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
+MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
+THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
+OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
+OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
+MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
+AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
+STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+-------------------------------------------------------------------------------
+SOFTWARE DISTRIBUTED IN THIS REPOSITORY:
+This software includes a number of subcomponents with separate
+copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.
+-------------------------------------------------------------------------------

README.md CHANGED Viewed

@@ -1,12 +1,97 @@
----
-title: Depth Pro
-emoji: 🐠
-colorFrom: red
-colorTo: green
-sdk: gradio
-sdk_version: 4.44.1
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+## Depth Pro: Sharp Monocular Metric Depth in Less Than a Second
+This software project accompanies the research paper:
+**[Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073)**,
+*Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, and Vladlen Koltun*.
+![](data/depth-pro-teaser.jpg)
+We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image.
+The model in this repository is a reference implementation, which has been re-trained. Its performance is close to the model reported in the paper but does not match it exactly.
+## Getting Started
+We recommend setting up a virtual environment. Using e.g. miniconda, the `depth_pro` package can be installed via:
+```bash
+conda create -n depth-pro -y python=3.9
+conda activate depth-pro
+pip install -e .
+```
+To download pretrained checkpoints follow the code snippet below:
+```bash
+source get_pretrained_models.sh   # Files will be downloaded to `checkpoints` directory.
+```
+### Running from commandline
+We provide a helper script to directly run the model on a single image:
+```bash
+# Run prediction on a single image:
+depth-pro-run -i ./data/example.jpg
+# Run `depth-pro-run -h` for available options.
+```
+### Running from python
+```python
+from PIL import Image
+import depth_pro
+# Load model and preprocessing transform
+model, transform = depth_pro.create_model_and_transforms()
+model.eval()
+# Load and preprocess an image.
+image, _, f_px = depth_pro.load_rgb(image_path)
+image = transform(image)
+# Run inference.
+prediction = model.infer(image, f_px=f_px)
+depth = prediction["depth"]  # Depth in [m].
+focallength_px = prediction["focallength_px"]  # Focal length in pixels.
+```
+### Evaluation (boundary metrics)
+Our boundary metrics can be found under `src/depth_pro/eval/boundary_metrics.py` and can be used as follows:
+```python
+# for a depth-based dataset
+boundary_f1 = SI_boundary_F1(predicted_depth, target_depth)
+# for a mask-based dataset (image matting / segmentation)
+boundary_recall = SI_boundary_Recall(predicted_depth, target_mask)
+```
+## Citation
+If you find our work useful, please cite the following paper:
+```bibtex
+@article{Bochkovskii2024:arxiv,
+  author     = {Aleksei Bochkovskii and Ama\"{e}l Delaunoy and Hugo Germain and Marcel Santos and
+               Yichao Zhou and Stephan R. Richter and Vladlen Koltun},
+  title      = {Depth Pro: Sharp Monocular Metric Depth in Less Than a Second},
+  journal    = {arXiv},
+  year       = {2024},
+  url        = {https://arxiv.org/abs/2410.02073},
+}
+```
+## License
+This sample code is released under the [LICENSE](LICENSE) terms.
+The model weights are released under the [LICENSE](LICENSE) terms.
+## Acknowledgements
+Our codebase is built using multiple open-source contributions; please see [Acknowledgements](ACKNOWLEDGEMENTS.md) for more details.
+Please check the paper for a complete list of references and datasets used in this work.

data/depth-pro-teaser.jpg ADDED Viewed

data/example.jpg ADDED Viewed

Git LFS Details

SHA256: 7b07a583db12943bc90c2429afeca0aca63450e8eb7a2f29314ffbb1acd8d710
Pointer size: 132 Bytes
Size of remote file: 2.33 MB

get_pretrained_models.sh ADDED Viewed

	@@ -0,0 +1,8 @@

#!/usr/bin/env bash
#
# For licensing see accompanying LICENSE file.
# Copyright (C) 2024 Apple Inc. All Rights Reserved.
#
# Downloads the pretrained Depth Pro checkpoint into ./checkpoints
# (relative to the current working directory).

# Create the target directory; no error if it already exists.
mkdir -p checkpoints
# Place final weights here:
wget -P checkpoints https://ml-site.cdn-apple.com/models/depth-pro/depth_pro.pt

pyproject.toml ADDED Viewed

	@@ -0,0 +1,59 @@

+[project]
+name = "depth_pro"
+version = "0.1"
+description = "Inference/Network/Model code for Apple Depth Pro monocular depth estimation."
+readme = "README.md"
+dependencies = [
+    "torch",
+    "torchvision",
+    "timm",
+    "numpy<2",
+    "pillow_heif",
+    "matplotlib",
+]
+[project.scripts]
+depth-pro-run = "depth_pro.cli:run_main"
+[project.urls]
+Homepage = "https://github.com/apple/ml-depth-pro"
+Repository = "https://github.com/apple/ml-depth-pro"
+[build-system]
+requires = ["setuptools", "setuptools-scm"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.pyright]
+include = ["src"]
+exclude = [
+    "**/node_modules",
+    "**/__pycache__",
+]
+pythonVersion = "3.9"
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = "-ra -q"
+testpaths = [
+    "tests"
+]
+filterwarnings = [
+    "ignore::DeprecationWarning"
+]
+[tool.lint.per-file-ignores]
+"__init__.py" = ["F401", "D100", "D104"]
+[tool.ruff]
+line-length = 100
+lint.select = ["E", "F", "D", "I"]
+lint.ignore = ["D100", "D105"]
+extend-exclude = [
+    "*external*",
+    "third_party",
+]
+src = ["depth_pro", "tests"]
+target-version = "py39"

src/depth_pro/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+"""Depth Pro package."""
+from .depth_pro import create_model_and_transforms  # noqa
+from .utils import load_rgb  # noqa

src/depth_pro/cli/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+"""Depth Pro CLI and tools."""
+from .run import main as run_main  # noqa

src/depth_pro/cli/run.py ADDED Viewed

	@@ -0,0 +1,149 @@

+#!/usr/bin/env python3
+"""Sample script to run DepthPro.
+Copyright (C) 2024 Apple Inc. All Rights Reserved.
+"""
+import argparse
+import logging
+from pathlib import Path
+import numpy as np
+import PIL.Image
+import torch
+from matplotlib import pyplot as plt
+from tqdm import tqdm
+from depth_pro import create_model_and_transforms, load_rgb
+LOGGER = logging.getLogger(__name__)
def get_torch_device() -> torch.device:
    """Select the Torch device to run on.

    Returns
    -------
        `cuda:0` when CUDA is available, otherwise `mps` when the MPS backend is
        available, otherwise `cpu`.

    """
    if torch.cuda.is_available():
        return torch.device("cuda:0")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
def run(args):
    """Run Depth Pro on one image, or on every file found below a directory.

    For each input the depth map is predicted, optionally written to disk (as a
    compressed `.npz` plus a color-mapped `.jpg`) and optionally displayed with
    matplotlib.

    Args:
    ----
        args: Parsed command line arguments providing `image_path`, `output_path`,
            `skip_display` and `verbose` (see `main`).

    """
    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    # Load model.
    model, transform = create_model_and_transforms(
        device=get_torch_device(),
        precision=torch.half,
    )
    model.eval()

    if args.image_path.is_dir():
        # The recursive glob also yields sub-directories; keep regular files only so
        # that directories are not reported as image loading errors.
        image_paths = [
            path for path in args.image_path.glob("**/*") if path.is_file()
        ]
        relative_path = args.image_path
    else:
        image_paths = [args.image_path]
        relative_path = args.image_path.parent

    if not args.skip_display:
        plt.ion()
        fig = plt.figure()
        ax_rgb = fig.add_subplot(121)
        ax_disp = fig.add_subplot(122)

    for image_path in tqdm(image_paths):
        # Load image and focal length from exif info (if found.).
        try:
            LOGGER.info(f"Loading image {image_path} ...")
            image, _, f_px = load_rgb(image_path)
        except Exception as e:
            # Best effort: a file that cannot be loaded is logged and skipped.
            LOGGER.error(str(e))
            continue

        # Run prediction. If `f_px` is provided, it is used to estimate the final metric depth,
        # otherwise the model estimates `f_px` to compute the depth metricness.
        prediction = model.infer(transform(image), f_px=f_px)

        # Extract the depth and focal length.
        depth = prediction["depth"].detach().cpu().numpy().squeeze()
        if f_px is not None:
            LOGGER.debug(f"Focal length (from exif): {f_px:0.2f}")
        elif prediction["focallength_px"] is not None:
            focallength_px = prediction["focallength_px"].detach().cpu().item()
            LOGGER.info(f"Estimated focal length: {focallength_px}")

        # Save Depth as npz file.
        if args.output_path is not None:
            # Mirror the input directory layout below the output directory.
            output_file = (
                args.output_path
                / image_path.relative_to(relative_path).parent
                / image_path.stem
            )
            LOGGER.info(f"Saving depth map to: {str(output_file)}")
            output_file.parent.mkdir(parents=True, exist_ok=True)
            np.savez_compressed(output_file, depth=depth)

            # Save as color-mapped "turbo" jpg image.
            cmap = plt.get_cmap("turbo_r")
            depth_min = depth.min()
            depth_range = depth.max() - depth_min
            if depth_range > 0:
                normalized_depth = (depth - depth_min) / depth_range
            else:
                # A constant depth map has no range; avoid dividing by zero.
                normalized_depth = np.zeros_like(depth)
            color_depth = (cmap(normalized_depth)[..., :3] * 255).astype(
                np.uint8
            )
            color_map_output_file = str(output_file) + ".jpg"
            LOGGER.info(f"Saving color-mapped depth to: : {color_map_output_file}")
            PIL.Image.fromarray(color_depth).save(
                color_map_output_file, format="JPEG", quality=90
            )

        # Display the image and estimated depth map.
        if not args.skip_display:
            ax_rgb.imshow(image)
            ax_disp.imshow(depth, cmap="turbo_r")
            fig.canvas.draw()
            fig.canvas.flush_events()

    LOGGER.info("Done predicting depth!")
    if not args.skip_display:
        # Keep the window open until the user closes it.
        plt.show(block=True)
def main():
    """Parse the command line options and run the DepthPro inference example."""
    parser = argparse.ArgumentParser(
        description="Inference scripts of DepthPro with PyTorch models."
    )
    add_option = parser.add_argument

    # Input image (or directory of images) and optional output directory.
    add_option(
        "-i",
        "--image-path",
        type=Path,
        default="./data/example.jpg",
        help="Path to input image.",
    )
    add_option("-o", "--output-path", type=Path, help="Path to store output files.")

    # Behavior switches.
    add_option("--skip-display", action="store_true", help="Skip matplotlib display.")
    add_option("-v", "--verbose", action="store_true", help="Show verbose output.")

    parsed_args = parser.parse_args()
    run(parsed_args)
+if __name__ == "__main__":
+    main()

src/depth_pro/depth_pro.py ADDED Viewed

	@@ -0,0 +1,298 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# Depth Pro: Sharp Monocular Metric Depth in Less Than a Second
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Mapping, Optional, Tuple, Union
+import torch
+from torch import nn
+from torchvision.transforms import (
+    Compose,
+    ConvertImageDtype,
+    Lambda,
+    Normalize,
+    ToTensor,
+)
+from .network.decoder import MultiresConvDecoder
+from .network.encoder import DepthProEncoder
+from .network.fov import FOVNetwork
+from .network.vit_factory import VIT_CONFIG_DICT, ViTPreset, create_vit
@dataclass
class DepthProConfig:
    """Settings used by `create_model_and_transforms` to assemble a DepthPro model."""

    # Backbone preset for the patch encoder.
    patch_encoder_preset: ViTPreset
    # Backbone preset for the image encoder.
    image_encoder_preset: ViTPreset
    # Feature width passed to both the encoder and the decoder.
    decoder_features: int

    # Checkpoint to load the weights from; `None` skips loading.
    checkpoint_uri: Optional[str] = None
    # Backbone preset for the field-of-view encoder; only used when `use_fov_head` is set.
    fov_encoder_preset: Optional[ViTPreset] = None
    # Whether the model gets a field-of-view estimation head.
    use_fov_head: bool = True
# Default configuration (a `DepthProConfig` instance, despite the `_DICT` suffix):
# the same backbone preset for all three encoders, weights read from ./checkpoints.
DEFAULT_MONODEPTH_CONFIG_DICT = DepthProConfig(
    patch_encoder_preset="dinov2l16_384",
    image_encoder_preset="dinov2l16_384",
    decoder_features=256,
    checkpoint_uri="./checkpoints/depth_pro.pt",
    fov_encoder_preset="dinov2l16_384",
    use_fov_head=True,
)
def create_backbone_model(
    preset: ViTPreset
) -> Tuple[nn.Module, ViTPreset]:
    """Build the backbone registered under `preset`.

    The backbone is created with `use_pretrained=False`.

    Args:
    ----
        preset: Name of a backbone preset; must be a key of `VIT_CONFIG_DICT`.

    Returns:
    -------
        The backbone module and the `VIT_CONFIG_DICT` entry for `preset`.

    Raises:
    ------
        KeyError: If `preset` is not a key of `VIT_CONFIG_DICT`.

    """
    if preset not in VIT_CONFIG_DICT:
        raise KeyError(f"Preset {preset} not found.")

    preset_config = VIT_CONFIG_DICT[preset]
    backbone = create_vit(preset=preset, use_pretrained=False)
    return backbone, preset_config
def create_model_and_transforms(
    config: DepthProConfig = DEFAULT_MONODEPTH_CONFIG_DICT,
    device: torch.device = torch.device("cpu"),
    precision: torch.dtype = torch.float32,
) -> Tuple[DepthPro, Compose]:
    """Create a DepthPro model and load weights from `config.checkpoint_uri`.

    Args:
    ----
        config: The configuration for the DPT model architecture.
        device: The optional Torch device to load the model onto, default runs on "cpu".
        precision: The optional floating point precision used for the model weights and
            for the tensors produced by the transform, default is FP32.

    Returns:
    -------
        The Torch DepthPro model and associated Transform.

    Raises:
    ------
        KeyError: If a configured preset is unknown, or if keys are missing from or
            unexpected in the checkpoint.

    """
    patch_encoder, patch_encoder_config = create_backbone_model(
        preset=config.patch_encoder_preset
    )
    image_encoder, _ = create_backbone_model(
        preset=config.image_encoder_preset
    )

    # The separate FOV encoder is only built when the FOV head is enabled.
    fov_encoder = None
    if config.use_fov_head and config.fov_encoder_preset is not None:
        fov_encoder, _ = create_backbone_model(preset=config.fov_encoder_preset)

    dims_encoder = patch_encoder_config.encoder_feature_dims
    hook_block_ids = patch_encoder_config.encoder_feature_layer_ids
    encoder = DepthProEncoder(
        dims_encoder=dims_encoder,
        patch_encoder=patch_encoder,
        image_encoder=image_encoder,
        hook_block_ids=hook_block_ids,
        decoder_features=config.decoder_features,
    )
    decoder = MultiresConvDecoder(
        dims_encoder=[config.decoder_features] + list(encoder.dims_encoder),
        dim_decoder=config.decoder_features,
    )
    # Cast the weights to `precision` for every requested dtype (not only FP16), so the
    # model always matches the dtype of the tensors produced by the transform below.
    model = DepthPro(
        encoder=encoder,
        decoder=decoder,
        last_dims=(32, 1),
        use_fov_head=config.use_fov_head,
        fov_encoder=fov_encoder,
    ).to(device=device, dtype=precision)

    transform = Compose(
        [
            ToTensor(),
            Lambda(lambda x: x.to(device)),
            Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
            ConvertImageDtype(precision),
        ]
    )

    if config.checkpoint_uri is not None:
        # NOTE(review): `torch.load` unpickles the file; only load checkpoints from a
        # trusted source.
        state_dict = torch.load(config.checkpoint_uri, map_location="cpu")
        # NOTE(review): with `strict=True`, `load_state_dict` presumably already raises on
        # any key mismatch, which would make the checks below a second line of defense.
        missing_keys, unexpected_keys = model.load_state_dict(
            state_dict=state_dict, strict=True
        )

        if len(unexpected_keys) != 0:
            raise KeyError(
                f"Found unexpected keys when loading monodepth: {unexpected_keys}"
            )

        # fc_norm is only for the classification head,
        # which we would not use. We only use the encoding.
        missing_keys = [key for key in missing_keys if "fc_norm" not in key]
        if len(missing_keys) != 0:
            raise KeyError(f"Keys are missing when loading monodepth: {missing_keys}")

    return model, transform
class DepthPro(nn.Module):
    """DepthPro network: encoder, multi-resolution decoder, depth head and optional FOV head."""

    def __init__(
        self,
        encoder: DepthProEncoder,
        decoder: MultiresConvDecoder,
        last_dims: tuple[int, int],
        use_fov_head: bool = True,
        fov_encoder: Optional[nn.Module] = None,
    ):
        """Initialize DepthPro.

        Args:
        ----
            encoder: The DepthProEncoder backbone.
            decoder: The MultiresConvDecoder decoder.
            last_dims: The dimension for the last convolution layers.
            use_fov_head: Whether to use the field-of-view head.
            fov_encoder: A separate encoder for the field of view.

        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        dim_decoder = decoder.dim_decoder
        # Depth head: 3x3 conv, 2x transposed-conv upsampling, 3x3 conv, 1x1 conv.
        self.head = nn.Sequential(
            nn.Conv2d(
                dim_decoder, dim_decoder // 2, kernel_size=3, stride=1, padding=1
            ),
            nn.ConvTranspose2d(
                in_channels=dim_decoder // 2,
                out_channels=dim_decoder // 2,
                kernel_size=2,
                stride=2,
                padding=0,
                bias=True,
            ),
            nn.Conv2d(
                dim_decoder // 2,
                last_dims[0],
                kernel_size=3,
                stride=1,
                padding=1,
            ),
            nn.ReLU(True),
            nn.Conv2d(last_dims[0], last_dims[1], kernel_size=1, stride=1, padding=0),
            nn.ReLU(),
        )

        # Set the final convolution layer's bias to be 0.
        self.head[4].bias.data.fill_(0)

        # Set the FOV estimation head. The attribute only exists when the head is enabled.
        if use_fov_head:
            self.fov = FOVNetwork(num_features=dim_decoder, fov_encoder=fov_encoder)

    @property
    def img_size(self) -> int:
        """Return the internal image size of the network."""
        return self.encoder.img_size

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Decode by projection and fusion of multi-resolution encodings.

        Args:
        ----
            x (torch.Tensor): Input image; its height and width must both equal `img_size`.

        Returns:
        -------
            The canonical inverse depth map [m] and the optional estimated field of view [deg]
            (`None` when the model has no FOV head).

        """
        _, _, H, W = x.shape
        assert H == self.img_size and W == self.img_size

        encodings = self.encoder(x)
        features, features_0 = self.decoder(encodings)
        canonical_inverse_depth = self.head(features)

        fov_deg = None
        if hasattr(self, "fov"):
            # The FOV head receives detached decoder features.
            fov_deg = self.fov.forward(x, features_0.detach())

        return canonical_inverse_depth, fov_deg

    @torch.no_grad()
    def infer(
        self,
        x: torch.Tensor,
        f_px: Optional[Union[float, torch.Tensor]] = None,
        interpolation_mode="bilinear",
    ) -> Mapping[str, torch.Tensor]:
        """Infer depth and fov for a given image.

        If the image is not at network resolution, it is resized to the network
        resolution (`img_size` x `img_size`) and the estimated depth is resized back to
        the original image resolution.

        Note: if the focal length is given, the estimated value is ignored and the provided
        focal length is used to generate the metric depth values.

        Args:
        ----
            x (torch.Tensor): Input image, with or without a leading batch dimension.
            f_px (float or torch.Tensor): Optional focal length in pixels corresponding to `x`.
            interpolation_mode (str): Interpolation function for downsampling/upsampling.

        Returns:
        -------
            Tensor dictionary (torch.Tensor): depth [m], focallength [pixels].

        Raises:
        ------
            ValueError: If `f_px` is not given and the model has no FOV head to estimate it.

        """
        if f_px is None and not hasattr(self, "fov"):
            raise ValueError(
                "f_px must be provided when the model has no field-of-view head."
            )

        if len(x.shape) == 3:
            x = x.unsqueeze(0)
        _, _, H, W = x.shape
        resize = H != self.img_size or W != self.img_size

        if resize:
            x = nn.functional.interpolate(
                x,
                size=(self.img_size, self.img_size),
                mode=interpolation_mode,
                align_corners=False,
            )

        canonical_inverse_depth, fov_deg = self.forward(x)
        if f_px is None:
            # Derive the focal length from the estimated field of view.
            f_px = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_deg.to(torch.float)))
        elif not isinstance(f_px, torch.Tensor):
            # Accept plain Python/NumPy numbers: the code below (and the returned
            # dictionary) needs a tensor.
            f_px = torch.as_tensor(f_px, dtype=torch.float, device=x.device)

        inverse_depth = canonical_inverse_depth * (W / f_px)
        f_px = f_px.squeeze()

        if resize:
            inverse_depth = nn.functional.interpolate(
                inverse_depth, size=(H, W), mode=interpolation_mode, align_corners=False
            )

        # Clamp before inverting so that the depth stays within [1e-4, 1e4].
        depth = 1.0 / torch.clamp(inverse_depth, min=1e-4, max=1e4)

        return {
            "depth": depth.squeeze(),
            "focallength_px": f_px,
        }

src/depth_pro/eval/boundary_metrics.py ADDED Viewed

	@@ -0,0 +1,332 @@

+from typing import List, Tuple
+import numpy as np
def connected_component(r: np.ndarray, c: np.ndarray) -> List[List[int]]:
    """Group consecutive positions that form horizontal runs.

    Position `i` joins the current run when it lies in the same row as the previous
    position of the run and exactly one column to its right. This is a generator;
    for empty input it yields nothing.

    Args:
    ----
        r (np.ndarray): Row indices.
        c (np.ndarray): Column indices.

    Yields:
    ------
        List[int]: Positions (into `r` / `c`) belonging to one run.

    """
    if r.size == 0:
        # Nothing to group; index 0 would not be a valid position.
        return

    indices = [0]
    for i in range(1, r.size):
        last = indices[-1]
        if r[i] == r[last] and c[i] == c[last] + 1:
            indices.append(i)
        else:
            yield indices
            indices = [i]
    yield indices
def nms_horizontal(ratio: np.ndarray, threshold: float) -> np.ndarray:
    """Keep only the strongest response of every horizontal run above `threshold`.

    Args:
    ----
        ratio (np.ndarray): Input ratio matrix.
        threshold (float): Responses must exceed this value to be considered.

    Returns:
    -------
        np.ndarray: Boolean mask shaped like `ratio`, True at the maximum of each run.

    """
    keep = np.zeros_like(ratio, dtype=bool)
    rows, cols = np.nonzero(ratio > threshold)
    if rows.size == 0:
        return keep

    for run in connected_component(rows, cols):
        run_rows, run_cols = rows[run], cols[run]
        # First maximum within the run wins.
        best = np.argmax(ratio[run_rows, run_cols])
        keep[run_rows[best], run_cols[best]] = True
    return keep
def nms_vertical(ratio: np.ndarray, threshold: float) -> np.ndarray:
    """Apply `nms_horizontal` along columns by transposing the input and the result.

    Args:
    ----
        ratio (np.ndarray): Input ratio matrix.
        threshold (float): Threshold for NMS.

    Returns:
    -------
        np.ndarray: Binary mask after applying NMS.

    """
    transposed_mask = nms_horizontal(np.transpose(ratio), threshold)
    return np.transpose(transposed_mask)
def fgbg_depth(
    d: np.ndarray, t: float
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Compare the ratio of neighboring pixels against a threshold.

    Args:
    ----
        d (np.ndarray): Depth matrix.
        t (float): The ratio between two neighbors must exceed this value.

    Returns:
    -------
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Boolean matrices for the
        left, top, right, and bottom relations (in that order).

    """
    # Views of each pixel pair along the last axis (left/right) and the
    # second-to-last axis (upper/lower).
    left_px, right_px = d[..., :, :-1], d[..., :, 1:]
    upper_px, lower_px = d[..., :-1, :], d[..., 1:, :]

    right_is_big_enough = (right_px / left_px) > t
    left_is_big_enough = (left_px / right_px) > t
    bottom_is_big_enough = (lower_px / upper_px) > t
    top_is_big_enough = (upper_px / lower_px) > t

    return (
        left_is_big_enough,
        top_is_big_enough,
        right_is_big_enough,
        bottom_is_big_enough,
    )
def fgbg_depth_thinned(
    d: np.ndarray, t: float
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Like `fgbg_depth`, but thin each relation with Non-Maximum Suppression.

    Horizontal ratios go through `nms_horizontal`, vertical ratios through
    `nms_vertical`.

    Args:
    ----
        d (np.ndarray): Depth matrix.
        t (float): Threshold for NMS.

    Returns:
    -------
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Boolean matrices for the
        left, top, right, and bottom relations (in that order) with NMS applied.

    """
    left_px, right_px = d[..., :, :-1], d[..., :, 1:]
    upper_px, lower_px = d[..., :-1, :], d[..., 1:, :]

    right_is_big_enough = nms_horizontal(right_px / left_px, t)
    left_is_big_enough = nms_horizontal(left_px / right_px, t)
    bottom_is_big_enough = nms_vertical(lower_px / upper_px, t)
    top_is_big_enough = nms_vertical(upper_px / lower_px, t)

    return (
        left_is_big_enough,
        top_is_big_enough,
        right_is_big_enough,
        bottom_is_big_enough,
    )
def fgbg_binary_mask(
    d: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Find mask transitions between neighboring pixels.

    A relation is True where one pixel of the pair is set and its neighbor is not.

    Args:
    ----
        d (np.ndarray): Boolean mask.

    Returns:
    -------
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Boolean matrices for the
        left, top, right, and bottom relations (in that order).

    """
    assert d.dtype == bool

    left_px, right_px = d[..., :, :-1], d[..., :, 1:]
    upper_px, lower_px = d[..., :-1, :], d[..., 1:, :]

    right_is_big_enough = right_px & ~left_px
    left_is_big_enough = left_px & ~right_px
    bottom_is_big_enough = lower_px & ~upper_px
    top_is_big_enough = upper_px & ~lower_px

    return (
        left_is_big_enough,
        top_is_big_enough,
        right_is_big_enough,
        bottom_is_big_enough,
    )
def edge_recall_matting(pr: np.ndarray, gt: np.ndarray, t: float) -> float:
    """Average the edge recall over the four neighbor directions.

    Predicted edges come from `fgbg_depth_thinned`, target edges from
    `fgbg_binary_mask`. A direction without target edges contributes a recall of 0.

    Args:
    ----
        pr (np.ndarray): Predicted depth matrix.
        gt (np.ndarray): Ground truth binary mask.
        t (float): Threshold for NMS.

    Returns:
    -------
        float: Edge recall value.

    """
    assert gt.dtype == bool

    predicted_edges = fgbg_depth_thinned(pr, t)
    target_edges = fgbg_binary_mask(gt)

    recall_sum = 0.0
    for predicted, target in zip(predicted_edges, target_edges):
        hits = np.count_nonzero(predicted & target)
        # max(..., 1) guards against dividing by zero.
        recall_sum += hits / max(np.count_nonzero(target), 1)
    return 0.25 * recall_sum
def boundary_f1(
    pr: np.ndarray,
    gt: np.ndarray,
    t: float,
    return_p: bool = False,
    return_r: bool = False,
) -> float:
    """Compute the boundary F1 score (or its precision / recall) at one threshold.

    Precision and recall are each averaged over the four neighbor directions returned
    by `fgbg_depth`.

    Args:
    ----
        pr (np.ndarray): Predicted depth matrix.
        gt (np.ndarray): Ground truth depth matrix.
        t (float): Threshold for comparison.
        return_p (bool, optional): If True, return precision. Defaults to False.
        return_r (bool, optional): If True, return recall. Defaults to False.

    Returns:
    -------
        float: 0.0 when precision and recall are both zero; otherwise precision if
        `return_p`, else recall if `return_r`, else the F1 score.

    """
    predicted_edges = fgbg_depth(pr, t)
    target_edges = fgbg_depth(gt, t)

    recall_sum = 0.0
    precision_sum = 0.0
    for predicted, target in zip(predicted_edges, target_edges):
        hits = np.count_nonzero(predicted & target)
        # max(..., 1) guards against dividing by zero.
        recall_sum += hits / max(np.count_nonzero(target), 1)
        precision_sum += hits / max(np.count_nonzero(predicted), 1)

    r = 0.25 * recall_sum
    p = 0.25 * precision_sum

    # The zero check takes precedence over the return flags.
    if r + p == 0:
        return 0.0
    if return_p:
        return p
    if return_r:
        return r
    return 2 * (r * p) / (r + p)
def get_thresholds_and_weights(
    t_min: float, t_max: float, N: int
) -> Tuple[np.ndarray, np.ndarray]:
    """Build `N` evenly spaced thresholds and weights proportional to them.

    Args:
    ----
        t_min (float): Minimum threshold.
        t_max (float): Maximum threshold.
        N (int): Number of thresholds.

    Returns:
    -------
        Tuple[np.ndarray, np.ndarray]: The thresholds, and the thresholds divided by
        their sum.

    """
    thresholds = np.linspace(t_min, t_max, N)
    normalizer = thresholds.sum()
    return thresholds, thresholds / normalizer
def invert_depth(depth: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Return the reciprocal of a depth map, clipping small values first.

    Args:
    ----
        depth (np.ndarray): Depth map to be inverted.
        eps (float): Values below this are raised to `eps` before inverting, to avoid
            division by zero (default is 1e-6).

    Returns:
    -------
        np.ndarray: Inverted depth map.

    """
    clipped_depth = depth.clip(min=eps)
    return 1.0 / clipped_depth
def SI_boundary_F1(
    predicted_depth: np.ndarray,
    target_depth: np.ndarray,
    t_min: float = 1.05,
    t_max: float = 1.25,
    N: int = 10,
) -> float:
    """Calculate Scale-Invariant Boundary F1 Score for depth-based ground-truth.

    The boundary F1 score is evaluated on the inverted depth maps at `N` thresholds
    and combined as a weighted sum (see `get_thresholds_and_weights`).

    Args:
    ----
        predicted_depth (np.ndarray): Predicted depth matrix (2-D).
        target_depth (np.ndarray): Ground truth depth matrix (2-D).
        t_min (float, optional): Minimum threshold. Defaults to 1.05.
        t_max (float, optional): Maximum threshold. Defaults to 1.25.
        N (int, optional): Number of thresholds. Defaults to 10.

    Returns:
    -------
        float: Scale-Invariant Boundary F1 Score.

    """
    assert predicted_depth.ndim == target_depth.ndim == 2
    thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)

    # The inverted maps do not depend on the threshold: compute them once.
    predicted_inverse = invert_depth(predicted_depth)
    target_inverse = invert_depth(target_depth)

    f1_scores = np.array(
        [boundary_f1(predicted_inverse, target_inverse, t) for t in thresholds]
    )
    return np.sum(f1_scores * weights)
def SI_boundary_Recall(
    predicted_depth: np.ndarray,
    target_mask: np.ndarray,
    t_min: float = 1.05,
    t_max: float = 1.25,
    N: int = 10,
    alpha_threshold: float = 0.1,
) -> float:
    """Calculate Scale-Invariant Boundary Recall Score for mask-based ground-truth.

    The target mask is binarized with `alpha_threshold`; the edge recall of the
    inverted predicted depth is evaluated at `N` thresholds and combined as a
    weighted sum (see `get_thresholds_and_weights`).

    Args:
    ----
        predicted_depth (np.ndarray): Predicted depth matrix (2-D).
        target_mask (np.ndarray): Ground truth mask (2-D).
        t_min (float, optional): Minimum threshold. Defaults to 1.05.
        t_max (float, optional): Maximum threshold. Defaults to 1.25.
        N (int, optional): Number of thresholds. Defaults to 10.
        alpha_threshold (float, optional): Threshold for alpha masking. Defaults to 0.1.

    Returns:
    -------
        float: Scale-Invariant Boundary Recall Score.

    """
    assert predicted_depth.ndim == target_mask.ndim == 2
    thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)
    thresholded_target = target_mask > alpha_threshold

    # The inverted map does not depend on the threshold: compute it once.
    predicted_inverse = invert_depth(predicted_depth)

    recall_scores = np.array(
        [
            edge_recall_matting(predicted_inverse, thresholded_target, t=float(t))
            for t in thresholds
        ]
    )
    weighted_recall = np.sum(recall_scores * weights)
    return weighted_recall

src/depth_pro/eval/dis5k_sample_list.txt ADDED Viewed

	@@ -0,0 +1,200 @@

+DIS5K/DIS-TE1/im/12#Graphics#4#TrafficSign#8245751856_821be14f86_o.jpg
+DIS5K/DIS-TE1/im/13#Insect#4#Butterfly#16023994688_7ff8cdccb1_o.jpg
+DIS5K/DIS-TE1/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205538.jpg
+DIS5K/DIS-TE1/im/14#Kitchenware#8#SweetStand#4848284981_fc90f54b50_o.jpg
+DIS5K/DIS-TE1/im/17#Non-motor Vehicle#4#Cart#15012855035_d10b57014f_o.jpg
+DIS5K/DIS-TE1/im/2#Aircraft#5#Kite#13104545564_5afceec9bd_o.jpg
+DIS5K/DIS-TE1/im/20#Sports#10#Skateboarding#8472763540_bb2390e928_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#14#Sword#32473146960_dcc6b77848_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#15#Tapeline#9680492386_2d2020f282_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#4#Flag#507752845_ef852100f0_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#6#Key#11966089533_3becd78b44_o.jpg
+DIS5K/DIS-TE1/im/21#Tool#8#Scale#31946428472_d28def471b_o.jpg
+DIS5K/DIS-TE1/im/22#Weapon#4#Rifle#8472656430_3eb908b211_o.jpg
+DIS5K/DIS-TE1/im/8#Electronics#3#Earphone#1177468301_641df8c267_o.jpg
+DIS5K/DIS-TE1/im/8#Electronics#9#MusicPlayer#2235782872_7d47847bb4_o.jpg
+DIS5K/DIS-TE2/im/11#Furniture#13#Ladder#3878434417_2ed740586e_o.jpg
+DIS5K/DIS-TE2/im/13#Insect#1#Ant#27047700955_3b3a1271f8_o.jpg
+DIS5K/DIS-TE2/im/13#Insect#11#Spider#5567179191_38d1f65589_o.jpg
+DIS5K/DIS-TE2/im/13#Insect#8#Locust#5237933769_e6687c05e4_o.jpg
+DIS5K/DIS-TE2/im/14#Kitchenware#2#DishRack#70838854_40cf689da7_o.jpg
+DIS5K/DIS-TE2/im/14#Kitchenware#8#SweetStand#8467929412_fef7f4275d_o.jpg
+DIS5K/DIS-TE2/im/16#Music Instrument#2#Harp#28058219806_28e05ff24a_o.jpg
+DIS5K/DIS-TE2/im/17#Non-motor Vehicle#1#BabyCarriage#29794777180_2e1695a0cf_o.jpg
+DIS5K/DIS-TE2/im/19#Ship#3#Sailboat#22442908623_5977e3becf_o.jpg
+DIS5K/DIS-TE2/im/2#Aircraft#5#Kite#44654358051_1400e71cc4_o.jpg
+DIS5K/DIS-TE2/im/21#Tool#11#Stand#IMG_20210520_205442.jpg
+DIS5K/DIS-TE2/im/21#Tool#17#Tripod#9318977876_34615ec9a0_o.jpg
+DIS5K/DIS-TE2/im/5#Artifact#3#Handcraft#50860882577_8482143b1b_o.jpg
+DIS5K/DIS-TE2/im/8#Electronics#10#Robot#3093360210_fee54dc5c5_o.jpg
+DIS5K/DIS-TE2/im/8#Electronics#6#Microphone#47411477652_6da66cbc10_o.jpg
+DIS5K/DIS-TE3/im/14#Kitchenware#4#Kitchenware#2451122898_ef883175dd_o.jpg
+DIS5K/DIS-TE3/im/15#Machine#4#SewingMachine#9311164128_97ba1d3947_o.jpg
+DIS5K/DIS-TE3/im/16#Music Instrument#2#Harp#7670920550_59e992fd7b_o.jpg
+DIS5K/DIS-TE3/im/17#Non-motor Vehicle#1#BabyCarriage#8389984877_1fddf8715c_o.jpg
+DIS5K/DIS-TE3/im/17#Non-motor Vehicle#3#Carriage#5947122724_98e0fc3d1f_o.jpg
+DIS5K/DIS-TE3/im/2#Aircraft#2#Balloon#2487168092_641505883f_o.jpg
+DIS5K/DIS-TE3/im/2#Aircraft#4#Helicopter#8401177591_06c71c8df2_o.jpg
+DIS5K/DIS-TE3/im/20#Sports#1#Archery#12520003103_faa43ea3e0_o.jpg
+DIS5K/DIS-TE3/im/21#Tool#11#Stand#IMG_20210709_221507.jpg
+DIS5K/DIS-TE3/im/21#Tool#2#Clip#5656649687_63d0c6696d_o.jpg
+DIS5K/DIS-TE3/im/21#Tool#6#Key#12878459244_6387a140ea_o.jpg
+DIS5K/DIS-TE3/im/3#Aquatic#1#Lobster#109214461_f52b4b6093_o.jpg
+DIS5K/DIS-TE3/im/4#Architecture#19#Windmill#20195851863_2627117e0e_o.jpg
+DIS5K/DIS-TE3/im/5#Artifact#2#Cage#5821476369_ea23927487_o.jpg
+DIS5K/DIS-TE3/im/8#Electronics#7#MobileHolder#49732997896_7f53c290b5_o.jpg
+DIS5K/DIS-TE4/im/13#Insect#6#Centipede#15302179708_a267850881_o.jpg
+DIS5K/DIS-TE4/im/17#Non-motor Vehicle#11#Tricycle#5771069105_a3aef6f665_o.jpg
+DIS5K/DIS-TE4/im/17#Non-motor Vehicle#2#Bicycle#4245936196_fdf812dcb7_o.jpg
+DIS5K/DIS-TE4/im/17#Non-motor Vehicle#9#ShoppingCart#4674052920_a5b7a2b236_o.jpg
+DIS5K/DIS-TE4/im/18#Plant#1#Bonsai#3539420884_ca8973e2c0_o.jpg
+DIS5K/DIS-TE4/im/2#Aircraft#6#Parachute#33590416634_9d6f2325e7_o.jpg
+DIS5K/DIS-TE4/im/20#Sports#1#Archery#46924476515_0be1caa684_o.jpg
+DIS5K/DIS-TE4/im/20#Sports#8#Racket#19337607166_dd1985fb59_o.jpg
+DIS5K/DIS-TE4/im/21#Tool#6#Key#3193329588_839b0c74ce_o.jpg
+DIS5K/DIS-TE4/im/5#Artifact#2#Cage#5821886526_0573ba2d0d_o.jpg
+DIS5K/DIS-TE4/im/5#Artifact#3#Handcraft#50105138282_3c1d02c968_o.jpg
+DIS5K/DIS-TE4/im/8#Electronics#1#Antenna#4305034305_874f21a701_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#1#Bag#15554964549_3105e51b6f_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#1#Bag#41104261980_098a6c4a56_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#2#Clothes#2284764037_871b2e8ca4_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#1824643784_70d0134156_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#3590020230_37b09a29b3_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#4809652879_4da8a69f3b_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#792204934_f9b28f99b4_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#5#Jewelry#13909132974_c4750c5fb7_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#7#Shoe#2483391615_9199ece8d6_o.jpg
+DIS5K/DIS-TR/im/1#Accessories#8#Watch#4343266960_f6633b029b_o.jpg
+DIS5K/DIS-TR/im/10#Frame#2#BicycleFrame#17897573_42964dd104_o.jpg
+DIS5K/DIS-TR/im/10#Frame#5#Rack#15898634812_64807069ff_o.jpg
+DIS5K/DIS-TR/im/10#Frame#5#Rack#23928546819_c184cb0b60_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#19#Shower#6189119596_77bcfe80ee_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#2#Bench#3263647075_9306e280b5_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#5#CoatHanger#12774091054_cd5ff520ef_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#6#DentalChair#13878156865_d0439dcb32_o.jpg
+DIS5K/DIS-TR/im/11#Furniture#9#Easel#5861024714_2070cd480c_o.jpg
+DIS5K/DIS-TR/im/12#Graphics#4#TrafficSign#40621867334_f3c32ec189_o.jpg
+DIS5K/DIS-TR/im/13#Insect#1#Ant#3295038190_db5dd0d4f4_o.jpg
+DIS5K/DIS-TR/im/13#Insect#10#Mosquito#24341339_a88a1dad4c_o.jpg
+DIS5K/DIS-TR/im/13#Insect#11#Spider#27171518270_63b78069ff_o.jpg
+DIS5K/DIS-TR/im/13#Insect#11#Spider#49925050281_fa727c154e_o.jpg
+DIS5K/DIS-TR/im/13#Insect#2#Beatle#279616486_2f1e64f591_o.jpg
+DIS5K/DIS-TR/im/13#Insect#3#Bee#43892067695_82cf3e536b_o.jpg
+DIS5K/DIS-TR/im/13#Insect#6#Centipede#20874281788_3e15c90a1c_o.jpg
+DIS5K/DIS-TR/im/13#Insect#7#Dragonfly#14106671120_1b824d77e4_o.jpg
+DIS5K/DIS-TR/im/13#Insect#8#Locust#21637491048_676ef7c9f7_o.jpg
+DIS5K/DIS-TR/im/13#Insect#9#Mantis#1381120202_9dff6987b2_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#1#Cup#12812517473_327d6474b8_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#10#WineGlass#6402491641_389275d4d1_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#3#Hydrovalve#3129932040_8c05825004_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#2881934780_87d5218ebb_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205527.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#6#Spoon#32989113501_b69eccf0df_o.jpg
+DIS5K/DIS-TR/im/14#Kitchenware#8#SweetStand#2867322189_c56d1e0b87_o.jpg
+DIS5K/DIS-TR/im/15#Machine#1#Gear#19217846720_f5f2807475_o.jpg
+DIS5K/DIS-TR/im/15#Machine#2#Machine#1620160659_9571b7a7ab_o.jpg
+DIS5K/DIS-TR/im/16#Music Instrument#2#Harp#6012801603_1a6e2c16a6_o.jpg
+DIS5K/DIS-TR/im/16#Music Instrument#5#Trombone#8683292118_d223c17ccb_o.jpg
+DIS5K/DIS-TR/im/16#Music Instrument#6#Trumpet#8393262740_b8c216142c_o.jpg
+DIS5K/DIS-TR/im/16#Music Instrument#8#Violin#1511267391_40e4949d68_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#1#BabyCarriage#6989512997_38b3dbc88b_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#14627183228_b2d68cf501_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#2932226475_1b2403e549_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#5420155648_86459905b8_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#2#Bicycle#IMG_20210513_134904.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#3#Carriage#3311962551_6f211b7bd6_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#4#Cart#2609732026_baf7fff3a1_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#5#Handcart#5821282211_201cefeaf2_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#7#Mower#5779003232_3bb3ae531a_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#10051622843_ace07e32b8_o.jpg
+DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#8075259294_f23e243849_o.jpg
+DIS5K/DIS-TR/im/18#Plant#2#Tree#44800999741_e377e16dbb_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#2631761913_3ac67d0223_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#37707911566_e908a261b6_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#3#HangGlider#2557220131_b8506920c5_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#4#Helicopter#6215659280_5dbd9b4546_o.jpg
+DIS5K/DIS-TR/im/2#Aircraft#6#Parachute#20185790493_e56fcaf8c6_o.jpg
+DIS5K/DIS-TR/im/20#Sports#1#Archery#3871269982_ae4c59a7eb_o.jpg
+DIS5K/DIS-TR/im/20#Sports#9#RockClimbing#9662433268_51299bc50e_o.jpg
+DIS5K/DIS-TR/im/21#Tool#14#Sword#26258479365_2950d7fa37_o.jpg
+DIS5K/DIS-TR/im/21#Tool#15#Tapeline#15505703447_e0fdeaa5a6_o.jpg
+DIS5K/DIS-TR/im/21#Tool#4#Flag#26678602024_9b665742de_o.jpg
+DIS5K/DIS-TR/im/21#Tool#4#Flag#5774823110_d603ce3cc8_o.jpg
+DIS5K/DIS-TR/im/21#Tool#5#Hook#6867989814_dba18d673c_o.jpg
+DIS5K/DIS-TR/im/22#Weapon#4#Rifle#4451713125_cd91719189_o.jpg
+DIS5K/DIS-TR/im/3#Aquatic#2#Seadragon#4910944581_913139b238_o.jpg
+DIS5K/DIS-TR/im/4#Architecture#12#Scaffold#3661448960_8aff24cc4d_o.jpg
+DIS5K/DIS-TR/im/4#Architecture#13#Sculpture#6385318715_9a88d4eba7_o.jpg
+DIS5K/DIS-TR/im/4#Architecture#17#Well#5011603479_75cf42808a_o.jpg
+DIS5K/DIS-TR/im/5#Artifact#2#Cage#4892828841_7f1bc05682_o.jpg
+DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#15404211628_9e9ff2ce2e_o.jpg
+DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#3200169865_7c84cfcccf_o.jpg
+DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#5859295071_c217e7c22f_o.jpg
+DIS5K/DIS-TR/im/6#Automobile#10#SteeringWheel#17200338026_f1e2122d8e_o.jpg
+DIS5K/DIS-TR/im/6#Automobile#3#Car#3780893425_1a7d275e09_o.jpg
+DIS5K/DIS-TR/im/6#Automobile#5#Crane#15282506502_1b1132a7c3_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#16767791875_8e6df41752_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#3291433361_38747324c4_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#4195104238_12a754c61a_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#49645415132_61e5664ecf_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#1#Cable#IMG_20210521_232406.jpg
+DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#3298312021_92f431e3e9_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#47950134773_fbfff63f4e_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#11#VacuumCleaner#5448403677_6a29e21881_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#2#CeilingLamp#611568868_680ed5d39f_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#3#Fan#3391683115_990525a693_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#6#StreetLamp#150049122_0692266618_o.jpg
+DIS5K/DIS-TR/im/7#Electrical#9#TransmissionTower#31433908671_7e7e277dfe_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#1#Antenna#8727884873_e0622ee5c4_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#2#Camcorder#4172690390_7e5f280ace_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#3#Earphone#413984555_f290febdf5_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#5#Headset#30574225373_3717ed9fa4_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#6#Microphone#538006482_4aae4f5bd6_o.jpg
+DIS5K/DIS-TR/im/8#Electronics#9#MusicPlayer#1306012480_2ea80d2afd_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#1#GymEquipment#33071754135_8f3195cbd1_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#2305807849_be53d724ea_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#3862040422_5bbf903204_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#3#OutdoorFitnessEquipment#10814507005_3dacaa28b3_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#4#FerrisWheel#81640293_4b0ee62040_o.jpg
+DIS5K/DIS-TR/im/9#Entertainment#5#Swing#49867339188_08073f4b76_o.jpg
+DIS5K/DIS-VD/im/1#Accessories#1#Bag#6815402415_e01c1a41e6_o.jpg
+DIS5K/DIS-VD/im/1#Accessories#5#Jewelry#2744070193_1486582e8d_o.jpg
+DIS5K/DIS-VD/im/10#Frame#1#BasketballHoop#IMG_20210521_232650.jpg
+DIS5K/DIS-VD/im/10#Frame#5#Rack#6156611713_49ebf12b1e_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#11#Handrail#3276641240_1b84b5af85_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#13#Ladder#33423266_5391cf47e9_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#17#Table#3725111755_4fc101e7ab_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#2#Bench#35556410400_7235b58070_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#4#Chair#3301769985_e49de6739f_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#6#DentalChair#23811071619_2a95c3a688_o.jpg
+DIS5K/DIS-VD/im/11#Furniture#9#Easel#8322807354_df6d56542e_o.jpg
+DIS5K/DIS-VD/im/13#Insect#10#Mosquito#12391674863_0cdf430d3f_o.jpg
+DIS5K/DIS-VD/im/13#Insect#7#Dragonfly#14693028899_344ea118f2_o.jpg
+DIS5K/DIS-VD/im/14#Kitchenware#10#WineGlass#4450148455_8f460f541a_o.jpg
+DIS5K/DIS-VD/im/14#Kitchenware#3#Hydrovalve#IMG_20210520_203410.jpg
+DIS5K/DIS-VD/im/15#Machine#3#PlowHarrow#34521712846_df4babb024_o.jpg
+DIS5K/DIS-VD/im/16#Music Instrument#5#Trombone#6222242743_e7189405cd_o.jpg
+DIS5K/DIS-VD/im/17#Non-motor Vehicle#12#Wheel#25677578797_ea47e1d9e8_o.jpg
+DIS5K/DIS-VD/im/17#Non-motor Vehicle#2#Bicycle#5153474856_21560b081b_o.jpg
+DIS5K/DIS-VD/im/17#Non-motor Vehicle#7#Mower#16992510572_8a6ff27398_o.jpg
+DIS5K/DIS-VD/im/19#Ship#2#Canoe#40571458163_7faf8b73d9_o.jpg
+DIS5K/DIS-VD/im/2#Aircraft#1#Airplane#4270588164_66a619e834_o.jpg
+DIS5K/DIS-VD/im/2#Aircraft#4#Helicopter#86789665_650b94b2ee_o.jpg
+DIS5K/DIS-VD/im/20#Sports#14#Wakesurfing#5589577652_5061c168d2_o.jpg
+DIS5K/DIS-VD/im/21#Tool#10#Spade#37018312543_63b21b0784_o.jpg
+DIS5K/DIS-VD/im/21#Tool#14#Sword#24789047250_42df9bf422_o.jpg
+DIS5K/DIS-VD/im/21#Tool#18#Umbrella#IMG_20210513_140445.jpg
+DIS5K/DIS-VD/im/21#Tool#6#Key#43939732715_5a6e28b518_o.jpg
+DIS5K/DIS-VD/im/22#Weapon#1#Cannon#12758066705_90b54295e7_o.jpg
+DIS5K/DIS-VD/im/22#Weapon#4#Rifle#8019368790_fb6dc469a7_o.jpg
+DIS5K/DIS-VD/im/3#Aquatic#5#Shrimp#2582833427_7a99e7356e_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#12#Scaffold#1013402687_590750354e_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#13#Sculpture#17176841759_272a3ed6e3_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#14#Stair#15079108505_0d11281624_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#19#Windmill#2928111082_ceb3051c04_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#3#Crack#3551574032_17dd106d31_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#5#GasStation#4564307581_c3069bdc62_o.jpg
+DIS5K/DIS-VD/im/4#Architecture#8#ObservationTower#2704526950_d4f0ddc807_o.jpg
+DIS5K/DIS-VD/im/5#Artifact#3#Handcraft#10873642323_1bafce3aa5_o.jpg
+DIS5K/DIS-VD/im/6#Automobile#11#Tractor#8594504006_0c2c557d85_o.jpg
+DIS5K/DIS-VD/im/8#Electronics#3#Earphone#8106454803_1178d867cc_o.jpg

src/depth_pro/network/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2	+ """Depth Pro network blocks."""

src/depth_pro/network/decoder.py ADDED Viewed

	@@ -0,0 +1,206 @@

+"""Copyright (C) 2024 Apple Inc. All Rights Reserved.
+Dense Prediction Transformer Decoder architecture.
+Implements a variant of Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413
+"""
+from __future__ import annotations
+from typing import Iterable
+import torch
+from torch import nn
+class MultiresConvDecoder(nn.Module):
+    """Decoder for multi-resolution encodings."""
+    def __init__(
+        self,
+        dims_encoder: Iterable[int],
+        dim_decoder: int,
+    ):
+        """Initialize multiresolution convolutional decoder.
+        Args:
+        ----
+            dims_encoder: Expected dims at each level from the encoder.
+            dim_decoder: Dim of decoder features.
+        """
+        super().__init__()
+        self.dims_encoder = list(dims_encoder)
+        self.dim_decoder = dim_decoder
+        self.dim_out = dim_decoder
+        num_encoders = len(self.dims_encoder)
+        # At the highest resolution, i.e. level 0, we apply projection w/ 1x1 convolution
+        # when the dimensions mismatch. Otherwise we do not do anything, which is
+        # the default behavior of monodepth.
+        conv0 = (
+            nn.Conv2d(self.dims_encoder[0], dim_decoder, kernel_size=1, bias=False)
+            if self.dims_encoder[0] != dim_decoder
+            else nn.Identity()
+        )
+        convs = [conv0]
+        for i in range(1, num_encoders):
+            convs.append(
+                nn.Conv2d(
+                    self.dims_encoder[i],
+                    dim_decoder,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False,
+                )
+            )
+        self.convs = nn.ModuleList(convs)
+        fusions = []
+        for i in range(num_encoders):
+            fusions.append(
+                FeatureFusionBlock2d(
+                    num_features=dim_decoder,
+                    deconv=(i != 0),
+                    batch_norm=False,
+                )
+            )
+        self.fusions = nn.ModuleList(fusions)
+    def forward(self, encodings: torch.Tensor) -> torch.Tensor:
+        """Decode the multi-resolution encodings."""
+        num_levels = len(encodings)
+        num_encoders = len(self.dims_encoder)
+        if num_levels != num_encoders:
+            raise ValueError(
+                f"Got encoder output levels={num_levels}, expected levels={num_encoders+1}."
+            )
+        # Project features of different encoder dims to the same decoder dim.
+        # Fuse features from the lowest resolution (num_levels-1)
+        # to the highest (0).
+        features = self.convs[-1](encodings[-1])
+        lowres_features = features
+        features = self.fusions[-1](features)
+        for i in range(num_levels - 2, -1, -1):
+            features_i = self.convs[i](encodings[i])
+            features = self.fusions[i](features, features_i)
+        return features, lowres_features
+class ResidualBlock(nn.Module):
+    """Generic implementation of residual blocks.
+    This implements a generic residual block from
+        He et al. - Identity Mappings in Deep Residual Networks (2016),
+        https://arxiv.org/abs/1603.05027
+    which can be further customized via factory functions.
+    """
+    def __init__(self, residual: nn.Module, shortcut: nn.Module | None = None) -> None:
+        """Initialize ResidualBlock."""
+        super().__init__()
+        self.residual = residual
+        self.shortcut = shortcut
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply residual block."""
+        delta_x = self.residual(x)
+        if self.shortcut is not None:
+            x = self.shortcut(x)
+        return x + delta_x
+class FeatureFusionBlock2d(nn.Module):
+    """Feature fusion for DPT."""
+    def __init__(
+        self,
+        num_features: int,
+        deconv: bool = False,
+        batch_norm: bool = False,
+    ):
+        """Initialize feature fusion block.
+        Args:
+        ----
+            num_features: Input and output dimensions.
+            deconv: Whether to use deconv before the final output conv.
+            batch_norm: Whether to use batch normalization in resnet blocks.
+        """
+        super().__init__()
+        self.resnet1 = self._residual_block(num_features, batch_norm)
+        self.resnet2 = self._residual_block(num_features, batch_norm)
+        self.use_deconv = deconv
+        if deconv:
+            self.deconv = nn.ConvTranspose2d(
+                in_channels=num_features,
+                out_channels=num_features,
+                kernel_size=2,
+                stride=2,
+                padding=0,
+                bias=False,
+            )
+        self.out_conv = nn.Conv2d(
+            num_features,
+            num_features,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        self.skip_add = nn.quantized.FloatFunctional()
+    def forward(self, x0: torch.Tensor, x1: torch.Tensor | None = None) -> torch.Tensor:
+        """Process and fuse input features."""
+        x = x0
+        if x1 is not None:
+            res = self.resnet1(x1)
+            x = self.skip_add.add(x, res)
+        x = self.resnet2(x)
+        if self.use_deconv:
+            x = self.deconv(x)
+        x = self.out_conv(x)
+        return x
+    @staticmethod
+    def _residual_block(num_features: int, batch_norm: bool):
+        """Create a residual block."""
+        def _create_block(dim: int, batch_norm: bool) -> list[nn.Module]:
+            layers = [
+                nn.ReLU(False),
+                nn.Conv2d(
+                    num_features,
+                    num_features,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=not batch_norm,
+                ),
+            ]
+            if batch_norm:
+                layers.append(nn.BatchNorm2d(dim))
+            return layers
+        residual = nn.Sequential(
+            *_create_block(dim=num_features, batch_norm=batch_norm),
+            *_create_block(dim=num_features, batch_norm=batch_norm),
+        )
+        return ResidualBlock(residual)

src/depth_pro/network/encoder.py ADDED Viewed

	@@ -0,0 +1,332 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# DepthProEncoder combining patch and image encoders.
+from __future__ import annotations
+import math
+from typing import Iterable, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class DepthProEncoder(nn.Module):
+    """DepthPro Encoder.
+    An encoder aimed at creating multi-resolution encodings from Vision Transformers.
+    A patch encoder is run on sliding-window crops of a 3-level image pyramid and
+    an image encoder is run on the lowest pyramid level. `forward` returns five
+    feature maps built from those outputs and from two intermediate patch-encoder
+    block outputs that are captured with forward hooks.
+    """
+    def __init__(
+        self,
+        dims_encoder: Iterable[int],
+        patch_encoder: nn.Module,
+        image_encoder: nn.Module,
+        hook_block_ids: Iterable[int],
+        decoder_features: int,
+    ):
+        """Initialize DepthProEncoder.
+        The framework
+            1. creates an image pyramid,
+            2. generates overlapping patches with a sliding window at each pyramid level,
+            3. creates batched encodings via vision transformer backbones,
+            4. produces multi-resolution encodings.
+        Args:
+        ----
+            dims_encoder: Dimensions of the encoder at different layers. Entries 0 to 3
+                are read here, so at least four values are required.
+            patch_encoder: Backbone used for patches. It must expose `embed_dim`,
+                `patch_embed.img_size`, `patch_embed.patch_size` and `blocks`.
+            image_encoder: Backbone used for global image encoder. It must expose
+                `embed_dim`.
+            hook_block_ids: Hooks to obtain intermediate features for the patch encoder
+                model. The first two entries index into `patch_encoder.blocks`.
+            decoder_features: Number of output channels of the `upsample_latent0` branch.
+        """
+        super().__init__()
+        self.dims_encoder = list(dims_encoder)
+        self.patch_encoder = patch_encoder
+        self.image_encoder = image_encoder
+        self.hook_block_ids = list(hook_block_ids)
+        patch_encoder_embed_dim = patch_encoder.embed_dim
+        image_encoder_embed_dim = image_encoder.embed_dim
+        # Side length, in tokens, of the grid the patch encoder produces per crop.
+        self.out_size = int(
+            patch_encoder.patch_embed.img_size[0] // patch_encoder.patch_embed.patch_size[0]
+        )
+        def _create_project_upsample_block(
+            dim_in: int,
+            dim_out: int,
+            upsample_layers: int,
+            dim_int: Optional[int] = None,
+        ) -> nn.Module:
+            # Build a 1x1 projection followed by `upsample_layers` stride-2
+            # transposed convolutions; `dim_int` defaults to `dim_out`.
+            if dim_int is None:
+                dim_int = dim_out
+            # Projection.
+            blocks = [
+                nn.Conv2d(
+                    in_channels=dim_in,
+                    out_channels=dim_int,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                    bias=False,
+                )
+            ]
+            # Upsampling.
+            blocks += [
+                nn.ConvTranspose2d(
+                    in_channels=dim_int if i == 0 else dim_out,
+                    out_channels=dim_out,
+                    kernel_size=2,
+                    stride=2,
+                    padding=0,
+                    bias=False,
+                )
+                for i in range(upsample_layers)
+            ]
+            return nn.Sequential(*blocks)
+        # Branches for the two hooked intermediate outputs of the patch encoder.
+        self.upsample_latent0 = _create_project_upsample_block(
+            dim_in=patch_encoder_embed_dim,
+            dim_int=self.dims_encoder[0],
+            dim_out=decoder_features,
+            upsample_layers=3,
+        )
+        self.upsample_latent1 = _create_project_upsample_block(
+            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[0], upsample_layers=2
+        )
+        # Branches for the final patch-encoder output at the three pyramid levels.
+        self.upsample0 = _create_project_upsample_block(
+            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[1], upsample_layers=1
+        )
+        self.upsample1 = _create_project_upsample_block(
+            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[2], upsample_layers=1
+        )
+        self.upsample2 = _create_project_upsample_block(
+            dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[3], upsample_layers=1
+        )
+        # Upsampling of the image-encoder output and its 1x1 fusion with the
+        # `upsample2` output (hence twice `dims_encoder[3]` input channels).
+        self.upsample_lowres = nn.ConvTranspose2d(
+            in_channels=image_encoder_embed_dim,
+            out_channels=self.dims_encoder[3],
+            kernel_size=2,
+            stride=2,
+            padding=0,
+            bias=True,
+        )
+        self.fuse_lowres = nn.Conv2d(
+            in_channels=(self.dims_encoder[3] + self.dims_encoder[3]),
+            out_channels=self.dims_encoder[3],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        # Obtain intermediate outputs of the blocks.
+        # The hooks overwrite `backbone_highres_hook0/1` each time the hooked
+        # block runs; `forward` reads them right after calling the patch encoder.
+        self.patch_encoder.blocks[self.hook_block_ids[0]].register_forward_hook(
+            self._hook0
+        )
+        self.patch_encoder.blocks[self.hook_block_ids[1]].register_forward_hook(
+            self._hook1
+        )
+    def _hook0(self, model, input, output):
+        """Forward hook: keep the output of the first hooked patch-encoder block."""
+        self.backbone_highres_hook0 = output
+    def _hook1(self, model, input, output):
+        """Forward hook: keep the output of the second hooked patch-encoder block."""
+        self.backbone_highres_hook1 = output
+    @property
+    def img_size(self) -> int:
+        """Return the full input image size: four times the patch encoder image size."""
+        return self.patch_encoder.patch_embed.img_size[0] * 4
+    def _create_pyramid(
+        self, x: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Create a 3-level image pyramid: the input and its 0.5x and 0.25x resizes."""
+        # Original resolution: 1536 by default.
+        x0 = x
+        # Middle resolution: 768 by default.
+        x1 = F.interpolate(
+            x, size=None, scale_factor=0.5, mode="bilinear", align_corners=False
+        )
+        # Low resolution: 384 by default, corresponding to the backbone resolution.
+        x2 = F.interpolate(
+            x, size=None, scale_factor=0.25, mode="bilinear", align_corners=False
+        )
+        return x0, x1, x2
+    def split(self, x: torch.Tensor, overlap_ratio: float = 0.25) -> torch.Tensor:
+        """Split the input into small patches with sliding window.
+        Patches are concatenated along dim 0 in row-major order (outer loop over
+        rows, inner loop over columns).
+        """
+        # NOTE(review): the patch size is hard-coded rather than read from the
+        # patch encoder - confirm it matches `patch_embed.img_size`.
+        patch_size = 384
+        patch_stride = int(patch_size * (1 - overlap_ratio))
+        # Only the last dimension is used to count steps in both directions,
+        # so a square input is presumably expected - TODO confirm.
+        image_size = x.shape[-1]
+        steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1
+        x_patch_list = []
+        for j in range(steps):
+            j0 = j * patch_stride
+            j1 = j0 + patch_size
+            for i in range(steps):
+                i0 = i * patch_stride
+                i1 = i0 + patch_size
+                x_patch_list.append(x[..., j0:j1, i0:i1])
+        return torch.cat(x_patch_list, dim=0)
+    def merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor:
+        """Merge the patched input into an image with sliding window.
+        `x` holds `steps * steps` groups of `batch_size` patches in the row-major
+        order produced by `split`. `padding` rows/columns are cropped from every
+        patch side that touches a neighbouring patch before concatenation.
+        """
+        steps = int(math.sqrt(x.shape[0] // batch_size))
+        idx = 0
+        output_list = []
+        for j in range(steps):
+            output_row_list = []
+            for i in range(steps):
+                output = x[batch_size * idx : batch_size * (idx + 1)]
+                # Crop the top/left sides unless the patch is in the first row/column.
+                if j != 0:
+                    output = output[..., padding:, :]
+                if i != 0:
+                    output = output[..., :, padding:]
+                # Crop the bottom/right sides unless the patch is in the last row/column.
+                if j != steps - 1:
+                    output = output[..., :-padding, :]
+                if i != steps - 1:
+                    output = output[..., :, :-padding]
+                output_row_list.append(output)
+                idx += 1
+            output_row = torch.cat(output_row_list, dim=-1)
+            output_list.append(output_row)
+        output = torch.cat(output_list, dim=-2)
+        return output
+    def reshape_feature(
+        self, embeddings: torch.Tensor, width, height, cls_token_offset=1
+    ):
+        """Discard class token and reshape 1D feature map to a 2D grid.
+        The first `cls_token_offset` tokens are dropped; the remaining tokens must
+        number `height * width` for the reshape to succeed.
+        """
+        b, hw, c = embeddings.shape
+        # Remove class token.
+        if cls_token_offset > 0:
+            embeddings = embeddings[:, cls_token_offset:, :]
+        # Shape: (batch, height, width, dim) -> (batch, dim, height, width)
+        embeddings = embeddings.reshape(b, height, width, c).permute(0, 3, 1, 2)
+        return embeddings
+    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
+        """Encode input at multiple resolutions.
+        Args:
+        ----
+            x (torch.Tensor): Input image.
+        Returns:
+        -------
+            Multi resolution encoded features, as a list of five tensors: the two
+            hooked latent features, the features of the two upper pyramid levels,
+            and the fused low-resolution/global features.
+        """
+        batch_size = x.shape[0]
+        # Step 0: create a 3-level image pyramid.
+        x0, x1, x2 = self._create_pyramid(x)
+        # Step 1: split to create batched overlapped mini-images at the backbone (BeiT/ViT/Dino)
+        # resolution.
+        # 5x5 @ 384x384 at the highest resolution (1536x1536).
+        x0_patches = self.split(x0, overlap_ratio=0.25)
+        # 3x3 @ 384x384 at the middle resolution (768x768).
+        x1_patches = self.split(x1, overlap_ratio=0.5)
+        # 1x1 @ 384x384 at the lowest resolution (384x384).
+        x2_patches = x2
+        # Concatenate all the sliding window patches and form a batch of size (35=5x5+3x3+1x1).
+        x_pyramid_patches = torch.cat(
+            (x0_patches, x1_patches, x2_patches),
+            dim=0,
+        )
+        # Step 2: Run the backbone (BeiT) model and get the result of large batch size.
+        # This call also fires the forward hooks that fill `backbone_highres_hook0/1`.
+        x_pyramid_encodings = self.patch_encoder(x_pyramid_patches)
+        x_pyramid_encodings = self.reshape_feature(
+            x_pyramid_encodings, self.out_size, self.out_size
+        )
+        # Step 3: merging.
+        # Merge highres latent encoding.
+        x_latent0_encodings = self.reshape_feature(
+            self.backbone_highres_hook0,
+            self.out_size,
+            self.out_size,
+        )
+        # The x0 patches come first in the concatenated batch; `5 * 5` hard-codes
+        # the highest-resolution grid size noted above.
+        x_latent0_features = self.merge(
+            x_latent0_encodings[: batch_size * 5 * 5], batch_size=batch_size, padding=3
+        )
+        x_latent1_encodings = self.reshape_feature(
+            self.backbone_highres_hook1,
+            self.out_size,
+            self.out_size,
+        )
+        x_latent1_features = self.merge(
+            x_latent1_encodings[: batch_size * 5 * 5], batch_size=batch_size, padding=3
+        )
+        # Split the 35 batch size from pyramid encoding back into 5x5+3x3+1x1.
+        x0_encodings, x1_encodings, x2_encodings = torch.split(
+            x_pyramid_encodings,
+            [len(x0_patches), len(x1_patches), len(x2_patches)],
+            dim=0,
+        )
+        # 96x96 feature maps by merging 5x5 @ 24x24 patches with overlaps.
+        x0_features = self.merge(x0_encodings, batch_size=batch_size, padding=3)
+        # 48x48 feature maps by merging 3x3 @ 24x24 patches with overlaps.
+        x1_features = self.merge(x1_encodings, batch_size=batch_size, padding=6)
+        # 24x24 feature maps.
+        x2_features = x2_encodings
+        # Apply the image encoder model.
+        x_global_features = self.image_encoder(x2_patches)
+        x_global_features = self.reshape_feature(
+            x_global_features, self.out_size, self.out_size
+        )
+        # Upsample feature maps.
+        x_latent0_features = self.upsample_latent0(x_latent0_features)
+        x_latent1_features = self.upsample_latent1(x_latent1_features)
+        x0_features = self.upsample0(x0_features)
+        x1_features = self.upsample1(x1_features)
+        x2_features = self.upsample2(x2_features)
+        x_global_features = self.upsample_lowres(x_global_features)
+        # Fuse the lowest pyramid level with the global features along the channel dim.
+        x_global_features = self.fuse_lowres(
+            torch.cat((x2_features, x_global_features), dim=1)
+        )
+        return [
+            x_latent0_features,
+            x_latent1_features,
+            x0_features,
+            x1_features,
+            x_global_features,
+        ]

src/depth_pro/network/fov.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# Field of View network architecture.
+from typing import Optional
+import torch
+from torch import nn
+from torch.nn import functional as F
+class FOVNetwork(nn.Module):
+    """Field of View estimation network."""
+    def __init__(
+        self,
+        num_features: int,
+        fov_encoder: Optional[nn.Module] = None,
+    ):
+        """Initialize the Field of View estimation block.
+        Args:
+        ----
+            num_features: Number of features used.
+            fov_encoder: Optional encoder to bring additional network capacity.
+        """
+        super().__init__()
+        # Create FOV head.
+        fov_head0 = [
+            nn.Conv2d(
+                num_features, num_features // 2, kernel_size=3, stride=2, padding=1
+            ),  # 128 x 24 x 24
+            nn.ReLU(True),
+        ]
+        fov_head = [
+            nn.Conv2d(
+                num_features // 2, num_features // 4, kernel_size=3, stride=2, padding=1
+            ),  # 64 x 12 x 12
+            nn.ReLU(True),
+            nn.Conv2d(
+                num_features // 4, num_features // 8, kernel_size=3, stride=2, padding=1
+            ),  # 32 x 6 x 6
+            nn.ReLU(True),
+            nn.Conv2d(num_features // 8, 1, kernel_size=6, stride=1, padding=0),
+        ]
+        if fov_encoder is not None:
+            self.encoder = nn.Sequential(
+                fov_encoder, nn.Linear(fov_encoder.embed_dim, num_features // 2)
+            )
+            self.downsample = nn.Sequential(*fov_head0)
+        else:
+            fov_head = fov_head0 + fov_head
+        self.head = nn.Sequential(*fov_head)
+    def forward(self, x: torch.Tensor, lowres_feature: torch.Tensor) -> torch.Tensor:
+        """Forward the fov network.
+        Args:
+        ----
+            x (torch.Tensor): Input image.
+            lowres_feature (torch.Tensor): Low resolution feature.
+        Returns:
+        -------
+            The field of view tensor.
+        """
+        if hasattr(self, "encoder"):
+            x = F.interpolate(
+                x,
+                size=None,
+                scale_factor=0.25,
+                mode="bilinear",
+                align_corners=False,
+            )
+            x = self.encoder(x)[:, 1:].permute(0, 2, 1)
+            lowres_feature = self.downsample(lowres_feature)
+            x = x.reshape_as(lowres_feature) + lowres_feature
+        else:
+            x = lowres_feature
+        return self.head(x)

src/depth_pro/network/vit.py ADDED Viewed

	@@ -0,0 +1,123 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# timm is imported defensively: when it is missing, the error is only printed
+# here, and resize_vit fails later with a NameError once it needs the helper.
+try:
+    from timm.layers import resample_abs_pos_embed
+except ImportError as err:
+    print("ImportError: {0}".format(err))
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+def make_vit_b16_backbone(
+    model,
+    encoder_feature_dims,
+    encoder_feature_layer_ids,
+    vit_features,
+    start_index=1,
+    use_grad_checkpointing=False,
+) -> nn.Module:
+    """Make a ViTb16 backbone for the DPT model."""
+    if use_grad_checkpointing:
+        model.set_grad_checkpointing()
+    vit_model = nn.Module()
+    vit_model.hooks = encoder_feature_layer_ids
+    vit_model.model = model
+    vit_model.features = encoder_feature_dims
+    vit_model.vit_features = vit_features
+    vit_model.model.start_index = start_index
+    vit_model.model.patch_size = vit_model.model.patch_embed.patch_size
+    vit_model.model.is_vit = True
+    vit_model.model.forward = vit_model.model.forward_features
+    return vit_model
+def forward_features_eva_fixed(self, x):
+    """Encode features."""
+    x = self.patch_embed(x)
+    x, rot_pos_embed = self._pos_embed(x)
+    for blk in self.blocks:
+        if self.grad_checkpointing:
+            x = checkpoint(blk, x, rot_pos_embed)
+        else:
+            x = blk(x, rot_pos_embed)
+    x = self.norm(x)
+    return x
+def resize_vit(model: nn.Module, img_size) -> nn.Module:
+    """Resample the ViT module to the given size."""
+    patch_size = model.patch_embed.patch_size
+    model.patch_embed.img_size = img_size
+    grid_size = tuple([s // p for s, p in zip(img_size, patch_size)])
+    model.patch_embed.grid_size = grid_size
+    pos_embed = resample_abs_pos_embed(
+        model.pos_embed,
+        grid_size,  # img_size
+        num_prefix_tokens=(
+            0 if getattr(model, "no_embed_class", False) else model.num_prefix_tokens
+        ),
+    )
+    model.pos_embed = torch.nn.Parameter(pos_embed)
+    return model
+def resize_patch_embed(model: nn.Module, new_patch_size=(16, 16)) -> nn.Module:
+    """Resample the ViT patch size to the given one."""
+    # interpolate patch embedding
+    if hasattr(model, "patch_embed"):
+        old_patch_size = model.patch_embed.patch_size
+        if (
+            new_patch_size[0] != old_patch_size[0]
+            or new_patch_size[1] != old_patch_size[1]
+        ):
+            patch_embed_proj = model.patch_embed.proj.weight
+            patch_embed_proj_bias = model.patch_embed.proj.bias
+            use_bias = True if patch_embed_proj_bias is not None else False
+            _, _, h, w = patch_embed_proj.shape
+            new_patch_embed_proj = torch.nn.functional.interpolate(
+                patch_embed_proj,
+                size=[new_patch_size[0], new_patch_size[1]],
+                mode="bicubic",
+                align_corners=False,
+            )
+            new_patch_embed_proj = (
+                new_patch_embed_proj * (h / new_patch_size[0]) * (w / new_patch_size[1])
+            )
+            model.patch_embed.proj = nn.Conv2d(
+                in_channels=model.patch_embed.proj.in_channels,
+                out_channels=model.patch_embed.proj.out_channels,
+                kernel_size=new_patch_size,
+                stride=new_patch_size,
+                bias=use_bias,
+            )
+            if use_bias:
+                model.patch_embed.proj.bias = patch_embed_proj_bias
+            model.patch_embed.proj.weight = torch.nn.Parameter(new_patch_embed_proj)
+            model.patch_size = new_patch_size
+            model.patch_embed.patch_size = new_patch_size
+            model.patch_embed.img_size = (
+                int(
+                    model.patch_embed.img_size[0]
+                    * new_patch_size[0]
+                    / old_patch_size[0]
+                ),
+                int(
+                    model.patch_embed.img_size[1]
+                    * new_patch_size[1]
+                    / old_patch_size[1]
+                ),
+            )
+    return model

src/depth_pro/network/vit_factory.py ADDED Viewed

	@@ -0,0 +1,124 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+# Factory functions to build and load ViT models.
+from __future__ import annotations
+import logging
+import types
+from dataclasses import dataclass
+from typing import Dict, List, Literal, Optional
+import timm
+import torch
+import torch.nn as nn
+from .vit import (
+    forward_features_eva_fixed,
+    make_vit_b16_backbone,
+    resize_patch_embed,
+    resize_vit,
+)
+# Module-level logger.
+LOGGER = logging.getLogger(__name__)
+# Names of the supported ViT presets; they are the keys of VIT_CONFIG_DICT.
+ViTPreset = Literal[
+    "dinov2l16_384",
+]
+@dataclass
+class ViTConfig:
+    """Configuration for ViT."""
+    in_chans: int
+    embed_dim: int
+    img_size: int = 384
+    patch_size: int = 16
+    # In case we need to rescale the backbone when loading from timm.
+    timm_preset: Optional[str] = None
+    timm_img_size: int = 384
+    timm_patch_size: int = 16
+    # The following 2 parameters are only used by DPT.  See dpt_factory.py.
+    encoder_feature_layer_ids: List[int] = None
+    """The layers in the Beit/ViT used to constructs encoder features for DPT."""
+    encoder_feature_dims: List[int] = None
+    """The dimension of features of encoder layers from Beit/ViT features for DPT."""
+# Registry of the available ViT configurations, looked up by create_vit.
+VIT_CONFIG_DICT: Dict[ViTPreset, ViTConfig] = {
+    # The timm model is declared with 14-pixel patches at 518 pixels; as these
+    # differ from patch_size and img_size, create_vit resizes it to 16 and 384.
+    "dinov2l16_384": ViTConfig(
+        in_chans=3,
+        embed_dim=1024,
+        encoder_feature_layer_ids=[5, 11, 17, 23],
+        encoder_feature_dims=[256, 512, 1024, 1024],
+        img_size=384,
+        patch_size=16,
+        timm_preset="vit_large_patch14_dinov2",
+        timm_img_size=518,
+        timm_patch_size=14,
+    ),
+}
+def create_vit(
+    preset: ViTPreset,
+    use_pretrained: bool = False,
+    checkpoint_uri: str | None = None,
+    use_grad_checkpointing: bool = False,
+) -> nn.Module:
+    """Create and load a VIT backbone module.
+    Args:
+    ----
+        preset: The VIT preset to load the pre-defined config.
+        use_pretrained: Load pretrained weights if True, default is False.
+        checkpoint_uri: Checkpoint to load the wights from.
+        use_grad_checkpointing: Use grandient checkpointing.
+    Returns:
+    -------
+        A Torch ViT backbone module.
+    """
+    config = VIT_CONFIG_DICT[preset]
+    img_size = (config.img_size, config.img_size)
+    patch_size = (config.patch_size, config.patch_size)
+    if "eva02" in preset:
+        model = timm.create_model(config.timm_preset, pretrained=use_pretrained)
+        model.forward_features = types.MethodType(forward_features_eva_fixed, model)
+    else:
+        model = timm.create_model(
+            config.timm_preset, pretrained=use_pretrained, dynamic_img_size=True
+        )
+    model = make_vit_b16_backbone(
+        model,
+        encoder_feature_dims=config.encoder_feature_dims,
+        encoder_feature_layer_ids=config.encoder_feature_layer_ids,
+        vit_features=config.embed_dim,
+        use_grad_checkpointing=use_grad_checkpointing,
+    )
+    if config.patch_size != config.timm_patch_size:
+        model.model = resize_patch_embed(model.model, new_patch_size=patch_size)
+    if config.img_size != config.timm_img_size:
+        model.model = resize_vit(model.model, img_size=img_size)
+    if checkpoint_uri is not None:
+        state_dict = torch.load(checkpoint_uri, map_location="cpu")
+        missing_keys, unexpected_keys = model.load_state_dict(
+            state_dict=state_dict, strict=False
+        )
+        if len(unexpected_keys) != 0:
+            raise KeyError(f"Found unexpected keys when loading vit: {unexpected_keys}")
+        if len(missing_keys) != 0:
+            raise KeyError(f"Keys are missing when loading vit: {missing_keys}")
+    LOGGER.info(model)
+    return model.model

src/depth_pro/utils.py ADDED Viewed

	@@ -0,0 +1,112 @@

+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Tuple, Union
+import numpy as np
+import pillow_heif
+from PIL import ExifTags, Image, TiffTags
+from pillow_heif import register_heif_opener
+# Import-time side effect: register the pillow_heif opener with Pillow.
+register_heif_opener()
+# Module-level logger.
+LOGGER = logging.getLogger(__name__)
+def extract_exif(img_pil: Image) -> Dict[str, Any]:
+    """Return exif information as a dictionary.
+    Args:
+    ----
+        img_pil: A Pillow image.
+    Returns:
+    -------
+        A dictionary with extracted EXIF information.
+    """
+    # Get full exif description from get_ifd(0x8769):
+    # cf https://pillow.readthedocs.io/en/stable/releasenotes/8.2.0.html#image-getexif-exif-and-gps-ifd
+    img_exif = img_pil.getexif().get_ifd(0x8769)
+    exif_dict = {ExifTags.TAGS[k]: v for k, v in img_exif.items() if k in ExifTags.TAGS}
+    tiff_tags = img_pil.getexif()
+    tiff_dict = {
+        TiffTags.TAGS_V2[k].name: v
+        for k, v in tiff_tags.items()
+        if k in TiffTags.TAGS_V2
+    }
+    return {**exif_dict, **tiff_dict}
+def fpx_from_f35(width: float, height: float, f_mm: float = 50) -> float:
+    """Convert a focal length given in mm (35mm film equivalent) to pixels."""
+    return f_mm * np.sqrt(width**2.0 + height**2.0) / np.sqrt(36**2 + 24**2)
+def load_rgb(
+    path: Union[Path, str], auto_rotate: bool = True, remove_alpha: bool = True
+) -> Tuple[np.ndarray, List[bytes], float]:
+    """Load an RGB image.
+    Args:
+    ----
+        path: The url to the image to load.
+        auto_rotate: Rotate the image based on the EXIF data, default is True.
+        remove_alpha: Remove the alpha channel, default is True.
+    Returns:
+    -------
+        img: The image loaded as a numpy array.
+        icc_profile: The color profile of the image.
+        f_px: The optional focal length in pixels, extracting from the exif data.
+    """
+    LOGGER.debug(f"Loading image {path} ...")
+    path = Path(path)
+    if path.suffix.lower() in [".heic"]:
+        heif_file = pillow_heif.open_heif(path, convert_hdr_to_8bit=True)
+        img_pil = heif_file.to_pillow()
+    else:
+        img_pil = Image.open(path)
+    img_exif = extract_exif(img_pil)
+    icc_profile = img_pil.info.get("icc_profile", None)
+    # Rotate the image.
+    if auto_rotate:
+        exif_orientation = img_exif.get("Orientation", 1)
+        if exif_orientation == 3:
+            img_pil = img_pil.transpose(Image.ROTATE_180)
+        elif exif_orientation == 6:
+            img_pil = img_pil.transpose(Image.ROTATE_270)
+        elif exif_orientation == 8:
+            img_pil = img_pil.transpose(Image.ROTATE_90)
+        elif exif_orientation != 1:
+            LOGGER.warning(f"Ignoring image orientation {exif_orientation}.")
+    img = np.array(img_pil)
+    # Convert to RGB if single channel.
+    if img.ndim < 3 or img.shape[2] == 1:
+        img = np.dstack((img, img, img))
+    if remove_alpha:
+        img = img[:, :, :3]
+    LOGGER.debug(f"\tHxW: {img.shape[0]}x{img.shape[1]}")
+    # Extract the focal length from exif data.
+    f_35mm = img_exif.get(
+        "FocalLengthIn35mmFilm",
+        img_exif.get(
+            "FocalLenIn35mmFilm", img_exif.get("FocalLengthIn35mmFormat", None)
+        ),
+    )
+    if f_35mm is not None and f_35mm > 0:
+        LOGGER.debug(f"\tfocal length @ 35mm film: {f_35mm}mm")
+        f_px = fpx_from_f35(img.shape[1], img.shape[0], f_35mm)
+    else:
+        f_px = None
+    return img, icc_profile, f_px