Spaces:
Running on A10G

zxhezexin committed on
Commit
f2a2544
1 Parent(s): 87c0d1b

Update spaces

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +52 -0
  3. LICENSE +201 -0
  4. LICENSE_NVIDIA +99 -0
  5. LICENSE_WEIGHT +407 -0
  6. README.md +117 -0
  7. app.py +210 -0
  8. assets/mesh_snapshot/crop.building.ply00.png +0 -0
  9. assets/mesh_snapshot/crop.building.ply01.png +0 -0
  10. assets/mesh_snapshot/crop.owl.ply00.png +0 -0
  11. assets/mesh_snapshot/crop.owl.ply01.png +0 -0
  12. assets/mesh_snapshot/crop.rose.ply00.png +0 -0
  13. assets/mesh_snapshot/crop.rose.ply01.png +0 -0
  14. assets/rendered_video/teaser.gif +3 -0
  15. assets/sample_input/building.png +0 -0
  16. assets/sample_input/ceramic.png +0 -0
  17. assets/sample_input/fire.png +0 -0
  18. assets/sample_input/girl.png +0 -0
  19. assets/sample_input/hotdogs.png +0 -0
  20. assets/sample_input/hydrant.png +0 -0
  21. assets/sample_input/lamp.png +0 -0
  22. assets/sample_input/mailbox.png +0 -0
  23. assets/sample_input/owl.png +0 -0
  24. assets/sample_input/traffic.png +0 -0
  25. configs/infer-b.yaml +8 -0
  26. configs/infer-gradio.yaml +7 -0
  27. configs/infer-l.yaml +8 -0
  28. configs/infer-s.yaml +8 -0
  29. model_card.md +67 -0
  30. openlrm/__init__.py +15 -0
  31. openlrm/datasets/__init__.py +16 -0
  32. openlrm/datasets/base.py +68 -0
  33. openlrm/datasets/cam_utils.py +179 -0
  34. openlrm/launch.py +36 -0
  35. openlrm/losses/__init__.py +18 -0
  36. openlrm/losses/perceptual.py +70 -0
  37. openlrm/losses/pixelwise.py +58 -0
  38. openlrm/losses/tvloss.py +55 -0
  39. openlrm/models/__init__.py +21 -0
  40. openlrm/models/block.py +124 -0
  41. openlrm/models/embedder.py +37 -0
  42. openlrm/models/encoders/__init__.py +15 -0
  43. openlrm/models/encoders/dino_wrapper.py +68 -0
  44. openlrm/models/encoders/dinov2/__init__.py +15 -0
  45. openlrm/models/encoders/dinov2/hub/__init__.py +4 -0
  46. openlrm/models/encoders/dinov2/hub/backbones.py +166 -0
  47. openlrm/models/encoders/dinov2/hub/classifiers.py +268 -0
  48. openlrm/models/encoders/dinov2/hub/depth/__init__.py +7 -0
  49. openlrm/models/encoders/dinov2/hub/depth/decode_heads.py +747 -0
  50. openlrm/models/encoders/dinov2/hub/depth/encoder_decoder.py +351 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ assets/rendered_video/teaser.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore Python cache files
2
+ **/__pycache__
3
+
4
+ # Ignore compiled Python files
5
+ *.pyc
6
+
7
+ # Ignore editor-specific files
8
+ .vscode/
9
+ .idea/
10
+
11
+ # Ignore operating system files
12
+ .DS_Store
13
+ Thumbs.db
14
+
15
+ # Ignore log files
16
+ *.log
17
+
18
+ # Ignore temporary and cache files
19
+ *.tmp
20
+ *.cache
21
+
22
+ # Ignore build artifacts
23
+ /build/
24
+ /dist/
25
+
26
+ # Ignore virtual environment files
27
+ /venv/
28
+ /.venv/
29
+
30
+ # Ignore package manager files
31
+ /node_modules/
32
+ /yarn.lock
33
+ /package-lock.json
34
+
35
+ # Ignore database files
36
+ *.db
37
+ *.sqlite
38
+
39
+ # Ignore secret files
40
+ *.secret
41
+
42
+ # Ignore compiled binaries
43
+ *.exe
44
+ *.dll
45
+ *.so
46
+ *.dylib
47
+
48
+ # Ignore backup files
49
+ *.bak
50
+ *.swp
51
+ *.swo
52
+ *.~*
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
LICENSE_NVIDIA ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2021-2022, NVIDIA Corporation & affiliates. All rights
2
+ reserved.
3
+
4
+
5
+ NVIDIA Source Code License for EG3D
6
+
7
+
8
+ =======================================================================
9
+
10
+ 1. Definitions
11
+
12
+ "Licensor" means any person or entity that distributes its Work.
13
+
14
+ "Software" means the original work of authorship made available under
15
+ this License.
16
+
17
+ "Work" means the Software and any additions to or derivative works of
18
+ the Software that are made available under this License.
19
+
20
+ The terms "reproduce," "reproduction," "derivative works," and
21
+ "distribution" have the meaning as provided under U.S. copyright law;
22
+ provided, however, that for the purposes of this License, derivative
23
+ works shall not include works that remain separable from, or merely
24
+ link (or bind by name) to the interfaces of, the Work.
25
+
26
+ Works, including the Software, are "made available" under this License
27
+ by including in or with the Work either (a) a copyright notice
28
+ referencing the applicability of this License to the Work, or (b) a
29
+ copy of this License.
30
+
31
+ 2. License Grants
32
+
33
+ 2.1 Copyright Grant. Subject to the terms and conditions of this
34
+ License, each Licensor grants to you a perpetual, worldwide,
35
+ non-exclusive, royalty-free, copyright license to reproduce,
36
+ prepare derivative works of, publicly display, publicly perform,
37
+ sublicense and distribute its Work and any resulting derivative
38
+ works in any form.
39
+
40
+ 3. Limitations
41
+
42
+ 3.1 Redistribution. You may reproduce or distribute the Work only
43
+ if (a) you do so under this License, (b) you include a complete
44
+ copy of this License with your distribution, and (c) you retain
45
+ without modification any copyright, patent, trademark, or
46
+ attribution notices that are present in the Work.
47
+
48
+ 3.2 Derivative Works. You may specify that additional or different
49
+ terms apply to the use, reproduction, and distribution of your
50
+ derivative works of the Work ("Your Terms") only if (a) Your Terms
51
+ provide that the use limitation in Section 3.3 applies to your
52
+ derivative works, and (b) you identify the specific derivative
53
+ works that are subject to Your Terms. Notwithstanding Your Terms,
54
+ this License (including the redistribution requirements in Section
55
+ 3.1) will continue to apply to the Work itself.
56
+
57
+ 3.3 Use Limitation. The Work and any derivative works thereof only
58
+ may be used or intended for use non-commercially. The Work or
59
+ derivative works thereof may be used or intended for use by NVIDIA
60
+ or its affiliates commercially or non-commercially. As used
61
+ herein, "non-commercially" means for research or evaluation
62
+ purposes only and not for any direct or indirect monetary gain.
63
+
64
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim
65
+ against any Licensor (including any claim, cross-claim or
66
+ counterclaim in a lawsuit) to enforce any patents that you allege
67
+ are infringed by any Work, then your rights under this License from
68
+ such Licensor (including the grants in Sections 2.1) will terminate
69
+ immediately.
70
+
71
+ 3.5 Trademarks. This License does not grant any rights to use any
72
+ Licensor’s or its affiliates’ names, logos, or trademarks, except
73
+ as necessary to reproduce the notices described in this License.
74
+
75
+ 3.6 Termination. If you violate any term of this License, then your
76
+ rights under this License (including the grants in Sections 2.1)
77
+ will terminate immediately.
78
+
79
+ 4. Disclaimer of Warranty.
80
+
81
+ THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
82
+ KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
83
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
84
+ NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
85
+ THIS LICENSE.
86
+
87
+ 5. Limitation of Liability.
88
+
89
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
90
+ THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
91
+ SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
92
+ INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
93
+ OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
94
+ (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
95
+ LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
96
+ COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
97
+ THE POSSIBILITY OF SUCH DAMAGES.
98
+
99
+ =======================================================================
LICENSE_WEIGHT ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. Copyright and Similar Rights means copyright and/or similar rights
88
+ closely related to copyright including, without limitation,
89
+ performance, broadcast, sound recording, and Sui Generis Database
90
+ Rights, without regard to how the rights are labeled or
91
+ categorized. For purposes of this Public License, the rights
92
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
93
+ Rights.
94
+ d. Effective Technological Measures means those measures that, in the
95
+ absence of proper authority, may not be circumvented under laws
96
+ fulfilling obligations under Article 11 of the WIPO Copyright
97
+ Treaty adopted on December 20, 1996, and/or similar international
98
+ agreements.
99
+
100
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
101
+ any other exception or limitation to Copyright and Similar Rights
102
+ that applies to Your use of the Licensed Material.
103
+
104
+ f. Licensed Material means the artistic or literary work, database,
105
+ or other material to which the Licensor applied this Public
106
+ License.
107
+
108
+ g. Licensed Rights means the rights granted to You subject to the
109
+ terms and conditions of this Public License, which are limited to
110
+ all Copyright and Similar Rights that apply to Your use of the
111
+ Licensed Material and that the Licensor has authority to license.
112
+
113
+ h. Licensor means the individual(s) or entity(ies) granting rights
114
+ under this Public License.
115
+
116
+ i. NonCommercial means not primarily intended for or directed towards
117
+ commercial advantage or monetary compensation. For purposes of
118
+ this Public License, the exchange of the Licensed Material for
119
+ other material subject to Copyright and Similar Rights by digital
120
+ file-sharing or similar means is NonCommercial provided there is
121
+ no payment of monetary compensation in connection with the
122
+ exchange.
123
+
124
+ j. Share means to provide material to the public by any means or
125
+ process that requires permission under the Licensed Rights, such
126
+ as reproduction, public display, public performance, distribution,
127
+ dissemination, communication, or importation, and to make material
128
+ available to the public including in ways that members of the
129
+ public may access the material from a place and at a time
130
+ individually chosen by them.
131
+
132
+ k. Sui Generis Database Rights means rights other than copyright
133
+ resulting from Directive 96/9/EC of the European Parliament and of
134
+ the Council of 11 March 1996 on the legal protection of databases,
135
+ as amended and/or succeeded, as well as other essentially
136
+ equivalent rights anywhere in the world.
137
+
138
+ l. You means the individual or entity exercising the Licensed Rights
139
+ under this Public License. Your has a corresponding meaning.
140
+
141
+
142
+ Section 2 -- Scope.
143
+
144
+ a. License grant.
145
+
146
+ 1. Subject to the terms and conditions of this Public License,
147
+ the Licensor hereby grants You a worldwide, royalty-free,
148
+ non-sublicensable, non-exclusive, irrevocable license to
149
+ exercise the Licensed Rights in the Licensed Material to:
150
+
151
+ a. reproduce and Share the Licensed Material, in whole or
152
+ in part, for NonCommercial purposes only; and
153
+
154
+ b. produce, reproduce, and Share Adapted Material for
155
+ NonCommercial purposes only.
156
+
157
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
158
+ Exceptions and Limitations apply to Your use, this Public
159
+ License does not apply, and You do not need to comply with
160
+ its terms and conditions.
161
+
162
+ 3. Term. The term of this Public License is specified in Section
163
+ 6(a).
164
+
165
+ 4. Media and formats; technical modifications allowed. The
166
+ Licensor authorizes You to exercise the Licensed Rights in
167
+ all media and formats whether now known or hereafter created,
168
+ and to make technical modifications necessary to do so. The
169
+ Licensor waives and/or agrees not to assert any right or
170
+ authority to forbid You from making technical modifications
171
+ necessary to exercise the Licensed Rights, including
172
+ technical modifications necessary to circumvent Effective
173
+ Technological Measures. For purposes of this Public License,
174
+ simply making modifications authorized by this Section 2(a)
175
+ (4) never produces Adapted Material.
176
+
177
+ 5. Downstream recipients.
178
+
179
+ a. Offer from the Licensor -- Licensed Material. Every
180
+ recipient of the Licensed Material automatically
181
+ receives an offer from the Licensor to exercise the
182
+ Licensed Rights under the terms and conditions of this
183
+ Public License.
184
+
185
+ b. No downstream restrictions. You may not offer or impose
186
+ any additional or different terms or conditions on, or
187
+ apply any Effective Technological Measures to, the
188
+ Licensed Material if doing so restricts exercise of the
189
+ Licensed Rights by any recipient of the Licensed
190
+ Material.
191
+
192
+ 6. No endorsement. Nothing in this Public License constitutes or
193
+ may be construed as permission to assert or imply that You
194
+ are, or that Your use of the Licensed Material is, connected
195
+ with, or sponsored, endorsed, or granted official status by,
196
+ the Licensor or others designated to receive attribution as
197
+ provided in Section 3(a)(1)(A)(i).
198
+
199
+ b. Other rights.
200
+
201
+ 1. Moral rights, such as the right of integrity, are not
202
+ licensed under this Public License, nor are publicity,
203
+ privacy, and/or other similar personality rights; however, to
204
+ the extent possible, the Licensor waives and/or agrees not to
205
+ assert any such rights held by the Licensor to the limited
206
+ extent necessary to allow You to exercise the Licensed
207
+ Rights, but not otherwise.
208
+
209
+ 2. Patent and trademark rights are not licensed under this
210
+ Public License.
211
+
212
+ 3. To the extent possible, the Licensor waives any right to
213
+ collect royalties from You for the exercise of the Licensed
214
+ Rights, whether directly or through a collecting society
215
+ under any voluntary or waivable statutory or compulsory
216
+ licensing scheme. In all other cases the Licensor expressly
217
+ reserves any right to collect such royalties, including when
218
+ the Licensed Material is used other than for NonCommercial
219
+ purposes.
220
+
221
+
222
+ Section 3 -- License Conditions.
223
+
224
+ Your exercise of the Licensed Rights is expressly made subject to the
225
+ following conditions.
226
+
227
+ a. Attribution.
228
+
229
+ 1. If You Share the Licensed Material (including in modified
230
+ form), You must:
231
+
232
+ a. retain the following if it is supplied by the Licensor
233
+ with the Licensed Material:
234
+
235
+ i. identification of the creator(s) of the Licensed
236
+ Material and any others designated to receive
237
+ attribution, in any reasonable manner requested by
238
+ the Licensor (including by pseudonym if
239
+ designated);
240
+
241
+ ii. a copyright notice;
242
+
243
+ iii. a notice that refers to this Public License;
244
+
245
+ iv. a notice that refers to the disclaimer of
246
+ warranties;
247
+
248
+ v. a URI or hyperlink to the Licensed Material to the
249
+ extent reasonably practicable;
250
+
251
+ b. indicate if You modified the Licensed Material and
252
+ retain an indication of any previous modifications; and
253
+
254
+ c. indicate the Licensed Material is licensed under this
255
+ Public License, and include the text of, or the URI or
256
+ hyperlink to, this Public License.
257
+
258
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
259
+ reasonable manner based on the medium, means, and context in
260
+ which You Share the Licensed Material. For example, it may be
261
+ reasonable to satisfy the conditions by providing a URI or
262
+ hyperlink to a resource that includes the required
263
+ information.
264
+
265
+ 3. If requested by the Licensor, You must remove any of the
266
+ information required by Section 3(a)(1)(A) to the extent
267
+ reasonably practicable.
268
+
269
+ 4. If You Share Adapted Material You produce, the Adapter's
270
+ License You apply must not prevent recipients of the Adapted
271
+ Material from complying with this Public License.
272
+
273
+
274
+ Section 4 -- Sui Generis Database Rights.
275
+
276
+ Where the Licensed Rights include Sui Generis Database Rights that
277
+ apply to Your use of the Licensed Material:
278
+
279
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
280
+ to extract, reuse, reproduce, and Share all or a substantial
281
+ portion of the contents of the database for NonCommercial purposes
282
+ only;
283
+
284
+ b. if You include all or a substantial portion of the database
285
+ contents in a database in which You have Sui Generis Database
286
+ Rights, then the database in which You have Sui Generis Database
287
+ Rights (but not its individual contents) is Adapted Material; and
288
+
289
+ c. You must comply with the conditions in Section 3(a) if You Share
290
+ all or a substantial portion of the contents of the database.
291
+
292
+ For the avoidance of doubt, this Section 4 supplements and does not
293
+ replace Your obligations under this Public License where the Licensed
294
+ Rights include other Copyright and Similar Rights.
295
+
296
+
297
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
298
+
299
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
300
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
301
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
302
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
303
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
304
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
305
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
306
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
307
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
308
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
309
+
310
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
311
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
312
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
313
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
314
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
315
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
316
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
317
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
318
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
319
+
320
+ c. The disclaimer of warranties and limitation of liability provided
321
+ above shall be interpreted in a manner that, to the extent
322
+ possible, most closely approximates an absolute disclaimer and
323
+ waiver of all liability.
324
+
325
+
326
+ Section 6 -- Term and Termination.
327
+
328
+ a. This Public License applies for the term of the Copyright and
329
+ Similar Rights licensed here. However, if You fail to comply with
330
+ this Public License, then Your rights under this Public License
331
+ terminate automatically.
332
+
333
+ b. Where Your right to use the Licensed Material has terminated under
334
+ Section 6(a), it reinstates:
335
+
336
+ 1. automatically as of the date the violation is cured, provided
337
+ it is cured within 30 days of Your discovery of the
338
+ violation; or
339
+
340
+ 2. upon express reinstatement by the Licensor.
341
+
342
+ For the avoidance of doubt, this Section 6(b) does not affect any
343
+ right the Licensor may have to seek remedies for Your violations
344
+ of this Public License.
345
+
346
+ c. For the avoidance of doubt, the Licensor may also offer the
347
+ Licensed Material under separate terms or conditions or stop
348
+ distributing the Licensed Material at any time; however, doing so
349
+ will not terminate this Public License.
350
+
351
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
352
+ License.
353
+
354
+
355
+ Section 7 -- Other Terms and Conditions.
356
+
357
+ a. The Licensor shall not be bound by any additional or different
358
+ terms or conditions communicated by You unless expressly agreed.
359
+
360
+ b. Any arrangements, understandings, or agreements regarding the
361
+ Licensed Material not stated herein are separate from and
362
+ independent of the terms and conditions of this Public License.
363
+
364
+
365
+ Section 8 -- Interpretation.
366
+
367
+ a. For the avoidance of doubt, this Public License does not, and
368
+ shall not be interpreted to, reduce, limit, restrict, or impose
369
+ conditions on any use of the Licensed Material that could lawfully
370
+ be made without permission under this Public License.
371
+
372
+ b. To the extent possible, if any provision of this Public License is
373
+ deemed unenforceable, it shall be automatically reformed to the
374
+ minimum extent necessary to make it enforceable. If the provision
375
+ cannot be reformed, it shall be severed from this Public License
376
+ without affecting the enforceability of the remaining terms and
377
+ conditions.
378
+
379
+ c. No term or condition of this Public License will be waived and no
380
+ failure to comply consented to unless expressly agreed to by the
381
+ Licensor.
382
+
383
+ d. Nothing in this Public License constitutes or may be interpreted
384
+ as a limitation upon, or waiver of, any privileges and immunities
385
+ that apply to the Licensor or You, including from the legal
386
+ processes of any jurisdiction or authority.
387
+
388
+ =======================================================================
389
+
390
+ Creative Commons is not a party to its public
391
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
392
+ its public licenses to material it publishes and in those instances
393
+ will be considered the “Licensor.” The text of the Creative Commons
394
+ public licenses is dedicated to the public domain under the CC0 Public
395
+ Domain Dedication. Except for the limited purpose of indicating that
396
+ material is shared under a Creative Commons public license or as
397
+ otherwise permitted by the Creative Commons policies published at
398
+ creativecommons.org/policies, Creative Commons does not authorize the
399
+ use of the trademark "Creative Commons" or any other trademark or logo
400
+ of Creative Commons without its prior written consent including,
401
+ without limitation, in connection with any unauthorized modifications
402
+ to any of its public licenses or any other arrangements,
403
+ understandings, or agreements concerning use of licensed material. For
404
+ the avoidance of doubt, this paragraph does not form part of the
405
+ public licenses.
406
+
407
+ Creative Commons may be contacted at creativecommons.org.
README.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenLRM: Open-Source Large Reconstruction Models
2
+
3
+ [![Code License](https://img.shields.io/badge/Code%20License-Apache_2.0-yellow.svg)](LICENSE)
4
+ [![Weight License](https://img.shields.io/badge/Weight%20License-CC%20By%20NC%204.0-red)](LICENSE_WEIGHT)
5
+ [![LRM](https://img.shields.io/badge/LRM-Arxiv%20Link-green)](https://arxiv.org/abs/2311.04400)
6
+
7
+ [![HF Models](https://img.shields.io/badge/Models-Huggingface%20Models-brown)](https://huggingface.co/zxhezexin)
8
+ [![HF Demo](https://img.shields.io/badge/Demo-Huggingface%20Demo-blue)](https://huggingface.co/spaces/zxhezexin/OpenLRM)
9
+
10
+ <img src="assets/rendered_video/teaser.gif" width="75%" height="auto"/>
11
+
12
+ <div style="text-align: left">
13
+ <img src="assets/mesh_snapshot/crop.owl.ply00.png" width="12%" height="auto"/>
14
+ <img src="assets/mesh_snapshot/crop.owl.ply01.png" width="12%" height="auto"/>
15
+ <img src="assets/mesh_snapshot/crop.building.ply00.png" width="12%" height="auto"/>
16
+ <img src="assets/mesh_snapshot/crop.building.ply01.png" width="12%" height="auto"/>
17
+ <img src="assets/mesh_snapshot/crop.rose.ply00.png" width="12%" height="auto"/>
18
+ <img src="assets/mesh_snapshot/crop.rose.ply01.png" width="12%" height="auto"/>
19
+ </div>
20
+
21
+ ## News
22
+
23
+ - [2024.03.04] Version update v1.1. Released model weights trained on both Objaverse and MVImgNet. The codebase has been substantially refactored for better usability and extensibility. Please refer to [v1.1.0](https://github.com/3DTopia/OpenLRM/releases/tag/v1.1.0) for details.
24
+ - [2024.01.09] Updated all v1.0 models trained on Objaverse. Please refer to [HF Models](https://huggingface.co/zxhezexin) and overwrite previous model weights.
25
+ - [2023.12.21] [Hugging Face Demo](https://huggingface.co/spaces/zxhezexin/OpenLRM) is online. Have a try!
26
+ - [2023.12.20] Release weights of the base and large models trained on Objaverse.
27
+ - [2023.12.20] We release this project OpenLRM, which is an open-source implementation of the paper [LRM](https://arxiv.org/abs/2311.04400).
28
+
29
+ ## Setup
30
+
31
+ ### Installation
32
+ ```
33
+ git clone https://github.com/3DTopia/OpenLRM.git
34
+ cd OpenLRM
35
+ ```
36
+
37
+ ### Environment
38
+ - Install requirements for OpenLRM first.
39
+ ```
40
+ pip install -r requirements.txt
41
+ ```
42
+ - Please then follow the [xFormers installation guide](https://github.com/facebookresearch/xformers?tab=readme-ov-file#installing-xformers) to enable memory efficient attention inside [DINOv2 encoder](openlrm/models/encoders/dinov2/layers/attention.py).
43
+
44
+ ## Quick Start
45
+
46
+ ### Pretrained Models
47
+
48
+ - Model weights are released on [Hugging Face](https://huggingface.co/zxhezexin).
49
+ - Weights will be downloaded automatically when you run the inference script for the first time.
50
+ - Please be aware of the [license](LICENSE_WEIGHT) before using the weights.
51
+
52
+ | Model | Training Data | Layers | Feat. Dim | Trip. Dim. | In. Res. | Link |
53
+ | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
54
+ | openlrm-obj-small-1.1 | Objaverse | 12 | 512 | 32 | 224 | [HF](https://huggingface.co/zxhezexin/openlrm-obj-small-1.1) |
55
+ | openlrm-obj-base-1.1 | Objaverse | 12 | 768 | 48 | 336 | [HF](https://huggingface.co/zxhezexin/openlrm-obj-base-1.1) |
56
+ | openlrm-obj-large-1.1 | Objaverse | 16 | 1024 | 80 | 448 | [HF](https://huggingface.co/zxhezexin/openlrm-obj-large-1.1) |
57
+ | openlrm-mix-small-1.1 | Objaverse + MVImgNet | 12 | 512 | 32 | 224 | [HF](https://huggingface.co/zxhezexin/openlrm-mix-small-1.1) |
58
+ | openlrm-mix-base-1.1 | Objaverse + MVImgNet | 12 | 768 | 48 | 336 | [HF](https://huggingface.co/zxhezexin/openlrm-mix-base-1.1) |
59
+ | openlrm-mix-large-1.1 | Objaverse + MVImgNet | 16 | 1024 | 80 | 448 | [HF](https://huggingface.co/zxhezexin/openlrm-mix-large-1.1) |
60
+
61
+ Model cards with additional details can be found in [model_card.md](model_card.md).
62
+
63
+ ### Prepare Images
64
+ - We put some sample inputs under `assets/sample_input`, and you can quickly try them.
65
+ - Prepare RGBA images or RGB images with white background (with some background removal tools, e.g., [Rembg](https://github.com/danielgatis/rembg), [Clipdrop](https://clipdrop.co)).
66
+
67
+ ### Inference
68
+ - Run the inference script to get 3D assets.
69
+ - You may specify which form of output to generate by setting the flags `EXPORT_VIDEO=true` and `EXPORT_MESH=true`.
70
+ - Please set `INFER_CONFIG` according to the model you want to use, e.g., `infer-s.yaml` for small models, `infer-b.yaml` for base models, and `infer-l.yaml` for large models.
71
+ - An example usage is as follows:
72
+
73
+ ```
74
+ # Example usage
75
+ EXPORT_VIDEO=true
76
+ EXPORT_MESH=true
77
+ INFER_CONFIG="./configs/infer-b.yaml"
78
+ MODEL_NAME="zxhezexin/openlrm-mix-base-1.1"
79
+ IMAGE_INPUT="./assets/sample_input/owl.png"
80
+
81
+ python -m openlrm.launch infer.lrm --infer $INFER_CONFIG model_name=$MODEL_NAME image_input=$IMAGE_INPUT export_video=$EXPORT_VIDEO export_mesh=$EXPORT_MESH
82
+ ```
83
+
84
+ ## Training
85
+ To be released soon.
86
+
87
+ ## Acknowledgement
88
+
89
+ - We thank the authors of the [original paper](https://arxiv.org/abs/2311.04400) for their great work! Special thanks to Kai Zhang and Yicong Hong for assistance during the reproduction.
90
+ - This project is supported by Shanghai AI Lab, which provided the computing resources.
91
+ - This project is advised by Ziwei Liu and Jiaya Jia.
92
+
93
+ ## Citation
94
+
95
+ If you find this work useful for your research, please consider citing:
96
+ ```
97
+ @article{hong2023lrm,
98
+ title={Lrm: Large reconstruction model for single image to 3d},
99
+ author={Hong, Yicong and Zhang, Kai and Gu, Jiuxiang and Bi, Sai and Zhou, Yang and Liu, Difan and Liu, Feng and Sunkavalli, Kalyan and Bui, Trung and Tan, Hao},
100
+ journal={arXiv preprint arXiv:2311.04400},
101
+ year={2023}
102
+ }
103
+ ```
104
+
105
+ ```
106
+ @misc{openlrm,
107
+ title = {OpenLRM: Open-Source Large Reconstruction Models},
108
+ author = {Zexin He and Tengfei Wang},
109
+ year = {2023},
110
+ howpublished = {\url{https://github.com/3DTopia/OpenLRM}},
111
+ }
112
+ ```
113
+
114
+ ## License
115
+
116
+ - OpenLRM as a whole is licensed under the [Apache License, Version 2.0](LICENSE), while certain components are covered by [NVIDIA's proprietary license](LICENSE_NVIDIA). Users are responsible for complying with the respective licensing terms of each component.
117
+ - Model weights are licensed under the [Creative Commons Attribution-NonCommercial 4.0 International License](LICENSE_WEIGHT). They are provided for research purposes only, and CANNOT be used commercially.
app.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import os
17
+ from PIL import Image
18
+ import numpy as np
19
+ import gradio as gr
20
+
21
+
22
def assert_input_image(input_image):
    """Reject a request that carries no input image.

    Args:
        input_image: the value of the input image component; ``None`` means
            nothing was selected or uploaded.

    Raises:
        gr.Error: if ``input_image`` is ``None``.
    """
    if input_image is None:
        raise gr.Error("No image selected or uploaded!")
25
+
26
def prepare_working_dir():
    """Create a fresh temporary directory for one request.

    Returns:
        tempfile.TemporaryDirectory: the directory object itself, not its
        path. Callers read the path from its ``name`` attribute, and the
        directory lives only as long as this object does.
    """
    import tempfile
    return tempfile.TemporaryDirectory()
30
+
31
def init_preprocessor():
    """Create the shared ``Preprocessor`` and store it in the module global.

    Must be called before ``preprocess_fn``, which reads the global
    ``preprocessor``. The import is kept inside the function so that
    importing this module does not import ``openlrm.utils.preprocess``.
    """
    global preprocessor
    from openlrm.utils.preprocess import Preprocessor
    preprocessor = Preprocessor()
35
+
36
def preprocess_fn(image_in: np.ndarray, remove_bg: bool, recenter: bool, working_dir):
    """Save the uploaded image and run the shared preprocessor on it.

    Args:
        image_in: image array as delivered by the input image component.
        remove_bg: forwarded to the preprocessor as ``rmbg``.
        recenter: forwarded to the preprocessor as ``recenter``.
        working_dir: object whose ``name`` attribute is the directory that
            receives ``raw.png`` and ``rembg.png``.

    Returns:
        Path of the preprocessed image (``rembg.png`` inside the working dir).

    Raises:
        AssertionError: if the preprocessor reports failure.
    """
    image_raw = os.path.join(working_dir.name, "raw.png")
    with Image.fromarray(image_in) as img:
        img.save(image_raw)
    image_out = os.path.join(working_dir.name, "rembg.png")
    success = preprocessor.preprocess(image_path=image_raw, save_path=image_out, rmbg=remove_bg, recenter=recenter)
    if not success:
        # Explicit raise instead of `assert`, which is stripped under
        # `python -O`; the exception type is kept so callers see no change.
        raise AssertionError("Failed under preprocess_fn!")
    return image_out
44
+
45
+
46
def demo_openlrm(infer_impl):
    """Build the Gradio demo UI around ``infer_impl`` and launch it.

    Args:
        infer_impl: callable invoked with the keyword arguments
            ``image_path``, ``source_cam_dist``, ``export_video``,
            ``export_mesh``, ``dump_video_path`` and ``dump_mesh_path``.
            It is presumably expected to write the rendered video to
            ``dump_video_path`` -- confirm against the inferrer.
    """

    def core_fn(image: str, source_cam_dist: float, working_dir):
        # Runs inference for one preprocessed image; the demo exports the
        # video only (mesh export is switched off).
        dump_video_path = os.path.join(working_dir.name, "output.mp4")
        dump_mesh_path = os.path.join(working_dir.name, "output.ply")
        infer_impl(
            image_path=image,
            source_cam_dist=source_cam_dist,
            export_video=True,
            export_mesh=False,
            dump_video_path=dump_video_path,
            dump_mesh_path=dump_mesh_path,
        )
        return dump_video_path

    def example_fn(image: np.ndarray):
        # Callback for the example gallery: preprocess + infer with defaults.
        from types import SimpleNamespace
        from gradio.utils import get_cache_folder
        # preprocess_fn/core_fn read `working_dir.name` as a directory path.
        # Passing the cache folder object directly would make `.name` resolve
        # to only its last path component (assuming it is a pathlib.Path --
        # TODO confirm for the pinned gradio version), so wrap the full path.
        cache_dir = os.fspath(get_cache_folder())
        os.makedirs(cache_dir, exist_ok=True)
        working_dir = SimpleNamespace(name=cache_dir)
        image = preprocess_fn(
            image_in=image,
            remove_bg=True,
            recenter=True,
            working_dir=working_dir,
        )
        video = core_fn(
            image=image,
            source_cam_dist=2.0,
            working_dir=working_dir,
        )
        return image, video

    _TITLE = '''OpenLRM: Open-Source Large Reconstruction Models'''

    _DESCRIPTION = '''
        <div>
        <a style="display:inline-block" href='https://github.com/3DTopia/OpenLRM'><img src='https://img.shields.io/github/stars/3DTopia/OpenLRM?style=social'/></a>
        <a style="display:inline-block; margin-left: .5em" href="https://huggingface.co/zxhezexin"><img src='https://img.shields.io/badge/Model-Weights-blue'/></a>
        </div>
        OpenLRM is an open-source implementation of Large Reconstruction Models.

        <strong>Image-to-3D in 10 seconds!</strong>

        <strong>Disclaimer:</strong> This demo uses `openlrm-mix-base-1.1` model with 288x288 rendering resolution here for a quick demonstration.
    '''

    with gr.Blocks(analytics_enabled=False) as demo:

        # HEADERS
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown('# ' + _TITLE)
        with gr.Row():
            gr.Markdown(_DESCRIPTION)

        # DISPLAY
        with gr.Row():

            with gr.Column(variant='panel', scale=1):
                with gr.Tabs(elem_id="openlrm_input_image"):
                    with gr.TabItem('Input Image'):
                        with gr.Row():
                            input_image = gr.Image(label="Input Image", image_mode="RGBA", width="auto", sources="upload", type="numpy", elem_id="content_image")

            with gr.Column(variant='panel', scale=1):
                with gr.Tabs(elem_id="openlrm_processed_image"):
                    with gr.TabItem('Processed Image'):
                        with gr.Row():
                            processed_image = gr.Image(label="Processed Image", image_mode="RGBA", type="filepath", elem_id="processed_image", width="auto", interactive=False)

            with gr.Column(variant='panel', scale=1):
                with gr.Tabs(elem_id="openlrm_render_video"):
                    with gr.TabItem('Rendered Video'):
                        with gr.Row():
                            output_video = gr.Video(label="Rendered Video", format="mp4", width="auto", autoplay=True)

        # SETTING
        with gr.Row():
            with gr.Column(variant='panel', scale=1):
                with gr.Tabs(elem_id="openlrm_attrs"):
                    with gr.TabItem('Settings'):
                        with gr.Column(variant='panel'):
                            gr.Markdown(
                                """
                                <strong>Best Practice</strong>:
                                    Centered objects in reasonable sizes. Try adjusting source camera distances.
                                """
                            )
                            checkbox_rembg = gr.Checkbox(True, label='Remove background')
                            checkbox_recenter = gr.Checkbox(True, label='Recenter the object')
                            slider_cam_dist = gr.Slider(1.0, 3.5, value=2.0, step=0.1, label="Source Camera Distance")
                            submit = gr.Button('Generate', elem_id="openlrm_generate", variant='primary')

        # EXAMPLES
        with gr.Row():
            examples = [
                ['assets/sample_input/owl.png'],
                ['assets/sample_input/building.png'],
                ['assets/sample_input/mailbox.png'],
                ['assets/sample_input/fire.png'],
                ['assets/sample_input/girl.png'],
                ['assets/sample_input/lamp.png'],
                ['assets/sample_input/hydrant.png'],
                ['assets/sample_input/hotdogs.png'],
                ['assets/sample_input/traffic.png'],
                ['assets/sample_input/ceramic.png'],
            ]
            gr.Examples(
                examples=examples,
                inputs=[input_image],
                outputs=[processed_image, output_video],
                fn=example_fn,
                # Example outputs are cached everywhere except when the
                # SYSTEM environment variable equals 'spaces'.
                cache_examples=os.getenv('SYSTEM') != 'spaces',
                examples_per_page=20,
            )

        # Per-session holder for the temporary directory object.
        working_dir = gr.State()
        # Each step runs only if the previous one succeeded:
        # validate -> create working dir -> preprocess -> infer.
        submit.click(
            fn=assert_input_image,
            inputs=[input_image],
            queue=False,
        ).success(
            fn=prepare_working_dir,
            outputs=[working_dir],
            queue=False,
        ).success(
            fn=preprocess_fn,
            inputs=[input_image, checkbox_rembg, checkbox_recenter, working_dir],
            outputs=[processed_image],
        ).success(
            fn=core_fn,
            inputs=[processed_image, slider_cam_dist, working_dir],
            outputs=[output_video],
        )

    demo.queue()
    demo.launch()
183
+
184
+
185
def launch_gradio_app():
    """Configure the app environment, build the inferrer and start the demo.

    The runner class is looked up in ``REGISTRY_RUNNERS`` under the
    ``APP_TYPE`` value set below, and its ``infer_single`` method is handed
    to ``demo_openlrm`` as the inference callback.
    """

    # Set before importing openlrm.runners; presumably these values are read
    # by the runner during import/construction -- confirm against openlrm.
    os.environ.update({
        "APP_ENABLED": "1",
        "APP_MODEL_NAME": "zxhezexin/openlrm-mix-base-1.1",
        "APP_INFER": "./configs/infer-gradio.yaml",
        "APP_TYPE": "infer.lrm",
        "NUMBA_THREADING_LAYER": 'omp',
    })

    from openlrm.runners import REGISTRY_RUNNERS
    from openlrm.runners.infer.base_inferrer import Inferrer
    # The registry yields a class (it is instantiated below), hence type[...].
    InferrerClass: type[Inferrer] = REGISTRY_RUNNERS[os.getenv("APP_TYPE")]
    with InferrerClass() as inferrer:
        init_preprocessor()
        if os.getenv('SYSTEM') != 'spaces':
            # Outside the 'spaces' environment the demo is wrapped by no_proxy.
            from openlrm.utils.proxy import no_proxy
            demo = no_proxy(demo_openlrm)
        else:
            demo = demo_openlrm
        demo(infer_impl=inferrer.infer_single)


if __name__ == '__main__':

    launch_gradio_app()
assets/mesh_snapshot/crop.building.ply00.png ADDED
assets/mesh_snapshot/crop.building.ply01.png ADDED
assets/mesh_snapshot/crop.owl.ply00.png ADDED
assets/mesh_snapshot/crop.owl.ply01.png ADDED
assets/mesh_snapshot/crop.rose.ply00.png ADDED
assets/mesh_snapshot/crop.rose.ply01.png ADDED
assets/rendered_video/teaser.gif ADDED

Git LFS Details

  • SHA256: 29ad154c012d5f8a3165b1d0a9386759b65bb45a9c40aa705626a7c47508c17b
  • Pointer size: 132 Bytes
  • Size of remote file: 3.45 MB
assets/sample_input/building.png ADDED
assets/sample_input/ceramic.png ADDED
assets/sample_input/fire.png ADDED
assets/sample_input/girl.png ADDED
assets/sample_input/hotdogs.png ADDED
assets/sample_input/hydrant.png ADDED
assets/sample_input/lamp.png ADDED
assets/sample_input/mailbox.png ADDED
assets/sample_input/owl.png ADDED
assets/sample_input/traffic.png ADDED
configs/infer-b.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ source_size: 336
2
+ source_cam_dist: 2.0
3
+ render_size: 288
4
+ render_views: 160
5
+ render_fps: 40
6
+ frame_size: 4
7
+ mesh_size: 384
8
+ mesh_thres: 3.0
configs/infer-gradio.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ source_size: 336
2
+ render_size: 288
3
+ render_views: 100
4
+ render_fps: 25
5
+ frame_size: 2
6
+ mesh_size: 384
7
+ mesh_thres: 3.0
configs/infer-l.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ source_size: 448
2
+ source_cam_dist: 2.0
3
+ render_size: 384
4
+ render_views: 160
5
+ render_fps: 40
6
+ frame_size: 2
7
+ mesh_size: 384
8
+ mesh_thres: 3.0
configs/infer-s.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ source_size: 224
2
+ source_cam_dist: 2.0
3
+ render_size: 192
4
+ render_views: 160
5
+ render_fps: 40
6
+ frame_size: 4
7
+ mesh_size: 384
8
+ mesh_thres: 3.0
model_card.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Card for OpenLRM V1.1
2
+
3
+ ## Overview
4
+
5
+ - This model card is for the [OpenLRM](https://github.com/3DTopia/OpenLRM) project, which is an open-source implementation of the paper [LRM](https://arxiv.org/abs/2311.04400).
6
+ - Information contained in this model card corresponds to [Version 1.1](https://github.com/3DTopia/OpenLRM/releases).
7
+
8
+ ## Model Details
9
+
10
+ - Training data
11
+
12
+ | Model | Training Data |
13
+ | :---: | :---: |
14
+ | [openlrm-obj-small-1.1](https://huggingface.co/zxhezexin/openlrm-obj-small-1.1) | Objaverse |
15
+ | [openlrm-obj-base-1.1](https://huggingface.co/zxhezexin/openlrm-obj-base-1.1) | Objaverse |
16
+ | [openlrm-obj-large-1.1](https://huggingface.co/zxhezexin/openlrm-obj-large-1.1) | Objaverse |
17
+ | [openlrm-mix-small-1.1](https://huggingface.co/zxhezexin/openlrm-mix-small-1.1) | Objaverse + MVImgNet |
18
+ | [openlrm-mix-base-1.1](https://huggingface.co/zxhezexin/openlrm-mix-base-1.1) | Objaverse + MVImgNet |
19
+ | [openlrm-mix-large-1.1](https://huggingface.co/zxhezexin/openlrm-mix-large-1.1) | Objaverse + MVImgNet |
20
+
21
+ - Model architecture (version==1.1)
22
+
23
+ | Type | Layers | Feat. Dim | Attn. Heads | Triplane Dim. | Input Res. | Image Encoder | Size |
24
+ | :---: | :----: | :-------: | :---------: | :-----------: | :--------: | :---------------: | :---: |
25
+ | small | 12 | 512 | 8 | 32 | 224 | dinov2_vits14_reg | 446M |
26
+ | base | 12 | 768 | 12 | 48 | 336 | dinov2_vitb14_reg | 1.04G |
27
+ | large | 16 | 1024 | 16 | 80 | 448 | dinov2_vitb14_reg | 1.81G |
28
+
29
+ - Training settings
30
+
31
+ | Type | Rend. Res. | Rend. Patch | Ray Samples |
32
+ | :---: | :--------: | :---------: | :---------: |
33
+ | small | 192 | 64 | 96 |
34
+ | base | 288 | 96 | 96 |
35
+ | large | 384 | 128 | 128 |
36
+
37
+ ## Notable Differences from the Original Paper
38
+
39
+ - We do not use the deferred back-propagation technique in the original paper.
40
+ - We use random background colors during training.
41
+ - The image encoder is based on the [DINOv2](https://github.com/facebookresearch/dinov2) model with register tokens.
42
+ - The triplane decoder contains 4 layers in our implementation.
43
+
44
+ ## License
45
+
46
+ - The model weights are released under the [Creative Commons Attribution-NonCommercial 4.0 International License](LICENSE_WEIGHT).
47
+ - They are provided for research purposes only, and CANNOT be used commercially.
48
+
49
+ ## Disclaimer
50
+
51
+ This model is an open-source implementation and is NOT the official release of the original research paper. While it aims to reproduce the original results as faithfully as possible, there may be variations due to model implementation, training data, and other factors.
52
+
53
+ ### Ethical Considerations
54
+
55
+ - This model should be used responsibly and ethically, and should not be used for malicious purposes.
56
+ - Users should be aware of potential biases in the training data.
57
+ - The model should not be used under circumstances that could lead to harm or unfair treatment of individuals or groups.
58
+
59
+ ### Usage Considerations
60
+
61
+ - The model is provided "as is" without warranty of any kind.
62
+ - Users are responsible for ensuring that their use complies with all relevant laws and regulations.
63
+ - The developers and contributors of this model are not liable for any damages or losses arising from the use of this model.
64
+
65
+ ---
66
+
67
+ *This model card is subject to updates and modifications. Users are advised to check for the latest version regularly.*
openlrm/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # Empty
openlrm/datasets/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ # from .mixer import MixerDataset
openlrm/datasets/base.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from abc import ABC, abstractmethod
17
+ import json
18
+ import numpy as np
19
+ import torch
20
+ from PIL import Image
21
+ from megfile import smart_open, smart_path_join, smart_exists
22
+
23
+
24
class BaseDataset(torch.utils.data.Dataset, ABC):
    """Abstract dataset indexed by a list of uids loaded from a JSON file.

    Subclasses implement ``inner_get_item``; ``__getitem__`` wraps it to
    report which uid failed before re-raising.
    """

    def __init__(self, root_dirs: list[str], meta_path: str):
        """Store the candidate root directories and load the uid list.

        Args:
            root_dirs: directories searched by ``_locate_datadir``.
            meta_path: path of a JSON file holding the uids.
        """
        super().__init__()
        self.root_dirs = root_dirs
        self.uids = self._load_uids(meta_path)

    def __len__(self):
        return len(self.uids)

    @abstractmethod
    def inner_get_item(self, idx):
        """Load and return the sample at ``idx``; implemented by subclasses."""

    def __getitem__(self, idx):
        try:
            return self.inner_get_item(idx)
        except Exception:
            print(f"[DEBUG-DATASET] Error when loading {self.uids[idx]}")
            # Bare raise keeps the original exception and traceback intact.
            raise

    @staticmethod
    def _load_uids(meta_path: str):
        """Return the parsed content of the JSON file at ``meta_path``."""
        with open(meta_path, 'r') as f:
            return json.load(f)

    @staticmethod
    def _load_rgba_image(file_path, bg_color: float = 1.0):
        """Load an RGBA image and blend it over a uniform background.

        Assumes the file decodes to an (H, W, 4) 8-bit array; images without
        an alpha channel are not handled here.

        Args:
            file_path: path passed to ``smart_open``.
            bg_color: background value, on the same 0-1 scale as the output.

        Returns:
            Float tensor of shape (1, 3, H, W), scaled to 0-1.
        """
        # np.array() forces the full decode, so the handle can be closed
        # as soon as the array exists.
        with smart_open(file_path, 'rb') as f:
            rgba = np.array(Image.open(f))
        rgba = torch.from_numpy(rgba).float() / 255.0
        rgba = rgba.permute(2, 0, 1).unsqueeze(0)
        rgb, alpha = rgba[:, :3, :, :], rgba[:, 3:4, :, :]
        return rgb * alpha + bg_color * (1 - alpha)

    @staticmethod
    def _locate_datadir(root_dirs, uid, locator: str):
        """Return the first root dir under which ``uid/locator`` exists.

        Raises:
            FileNotFoundError: if no root directory contains the entry.
        """
        for root_dir in root_dirs:
            datadir = smart_path_join(root_dir, uid, locator)
            if smart_exists(datadir):
                return root_dir
        raise FileNotFoundError(f"Cannot find valid data directory for uid {uid}")
openlrm/datasets/cam_utils.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import math
17
+ import torch
18
+
19
+ """
20
+ R: (N, 3, 3)
21
+ T: (N, 3)
22
+ E: (N, 4, 4)
23
+ vector: (N, 3)
24
+ """
25
+
26
+
27
def compose_extrinsic_R_T(R: torch.Tensor, T: torch.Tensor):
    """
    Build the standard-form extrinsic matrix from a rotation and a translation.
    Batched I/O.
    """
    # Append T as a fourth column of R, then let compose_extrinsic_RT add the last row.
    translation_column = T.unsqueeze(-1)
    return compose_extrinsic_RT(torch.cat((R, translation_column), dim=-1))
34
+
35
+
36
def compose_extrinsic_RT(RT: torch.Tensor):
    """
    Build the standard-form extrinsic matrix from RT by appending a [0, 0, 0, 1] row.
    Batched I/O.
    """
    batch_size = RT.shape[0]
    # Constant homogeneous row, created with the dtype and device of RT.
    last_row = torch.tensor([[[0, 0, 0, 1]]], dtype=RT.dtype, device=RT.device)
    last_rows = last_row.repeat(batch_size, 1, 1)
    return torch.cat([RT, last_rows], dim=1)
45
+
46
+
47
def decompose_extrinsic_R_T(E: torch.Tensor):
    """
    Split the standard extrinsic matrix into its rotation and translation parts.
    Batched I/O.
    """
    RT = decompose_extrinsic_RT(E)
    rotation = RT[:, :, :3]      # first three columns
    translation = RT[:, :, 3]    # fourth column
    return rotation, translation
54
+
55
+
56
def decompose_extrinsic_RT(E: torch.Tensor):
    """
    Extract RT from the standard extrinsic matrix by keeping its first three rows.
    Batched I/O.
    """
    top_rows = E[:, 0:3]
    return top_rows
62
+
63
+
64
def get_normalized_camera_intrinsics(intrinsics: torch.Tensor):
    """
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    Return batched fx, fy, cx, cy, each divided by width (x terms) or height (y terms)
    """
    focal, center, size = intrinsics[:, 0], intrinsics[:, 1], intrinsics[:, 2]
    # Column 0 holds the x terms (scaled by width), column 1 the y terms (scaled by height).
    focal_normalized = focal / size
    center_normalized = center / size
    return (
        focal_normalized[:, 0], focal_normalized[:, 1],
        center_normalized[:, 0], center_normalized[:, 1],
    )
75
+
76
+
77
def build_camera_principle(RT: torch.Tensor, intrinsics: torch.Tensor):
    """
    Flatten RT and append the normalized fx, fy, cx, cy to each row.

    RT: (N, 3, 4)
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    """
    fx, fy, cx, cy = get_normalized_camera_intrinsics(intrinsics)
    flat_extrinsics = RT.reshape(-1, 12)
    intrinsic_columns = torch.stack([fx, fy, cx, cy], dim=-1)
    return torch.cat([flat_extrinsics, intrinsic_columns], dim=-1)
87
+
88
+
89
def build_camera_standard(RT: torch.Tensor, intrinsics: torch.Tensor):
    """
    Concatenate the flattened 4x4 extrinsic and 3x3 normalized intrinsic matrices.

    RT: (N, 3, 4)
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    return: (N, 25)
    """
    E = compose_extrinsic_RT(RT)
    fx, fy, cx, cy = get_normalized_camera_intrinsics(intrinsics)
    # Build the constant entries from fx so they share the dtype and device of the
    # intrinsics, instead of a hard-coded float32 row that would silently
    # change the result dtype for non-float32 inputs.
    zeros = torch.zeros_like(fx)
    ones = torch.ones_like(fx)
    I = torch.stack([
        torch.stack([fx, zeros, cx], dim=-1),
        torch.stack([zeros, fy, cy], dim=-1),
        torch.stack([zeros, zeros, ones], dim=-1),
    ], dim=1)
    return torch.cat([
        E.reshape(-1, 16),
        I.reshape(-1, 9),
    ], dim=-1)
105
+
106
+
107
def center_looking_at_camera_pose(
    camera_position: torch.Tensor, look_at: torch.Tensor = None, up_world: torch.Tensor = None,
    device: torch.device = torch.device('cpu'),
):
    """
    Build camera poses whose z axis points from ``look_at`` towards the camera.

    camera_position: (M, 3)
    look_at: (3)
    up_world: (3)
    device: device on which the default look_at / up_world vectors are created
    return: (M, 3, 4)
    """
    # by default, looking at the origin and world up is pos-z
    if look_at is None:
        look_at = torch.tensor([0, 0, 0], dtype=camera_position.dtype, device=device)
    if up_world is None:
        up_world = torch.tensor([0, 0, 1], dtype=camera_position.dtype, device=device)
    look_at = look_at.unsqueeze(0).repeat(camera_position.shape[0], 1)
    up_world = up_world.unsqueeze(0).repeat(camera_position.shape[0], 1)

    z_axis = camera_position - look_at
    z_axis = z_axis / z_axis.norm(dim=-1, keepdim=True)
    # dim=-1 must be explicit: without it torch.cross uses the first dimension
    # of size 3, which is the batch dimension whenever M == 3.
    x_axis = torch.cross(up_world, z_axis, dim=-1)
    x_axis = x_axis / x_axis.norm(dim=-1, keepdim=True)
    y_axis = torch.cross(z_axis, x_axis, dim=-1)
    y_axis = y_axis / y_axis.norm(dim=-1, keepdim=True)
    # Columns of each (3, 4) matrix are x axis, y axis, z axis, camera position.
    extrinsics = torch.stack([x_axis, y_axis, z_axis, camera_position], dim=-1)
    return extrinsics
133
+
134
+
135
def surrounding_views_linspace(n_views: int, radius: float = 2.0, height: float = 0.8, device: torch.device = torch.device('cpu')):
    """
    Place cameras on a horizontal circle and return their poses from
    center_looking_at_camera_pose.

    n_views: number of surrounding views
    radius: camera dist to center
    height: height of the camera
    return: (M, 3, 4)
    """
    assert n_views > 0
    assert radius > 0

    # Angles from -pi/2 to 3*pi/2; linspace includes both endpoints.
    azimuth = torch.linspace(-torch.pi / 2, 3 * torch.pi / 2, n_views, device=device)
    # Radius of the circle at the given height, so each camera is `radius` from the origin.
    ring_radius = math.sqrt(radius ** 2 - height ** 2)
    camera_positions = torch.stack([
        torch.cos(azimuth) * ring_radius,
        torch.sin(azimuth) * ring_radius,
        torch.full((n_views,), height, device=device),
    ], dim=1)
    return center_looking_at_camera_pose(camera_positions, device=device)
155
+
156
+
157
def create_intrinsics(
    f: float,
    c: float = None, cx: float = None, cy: float = None,
    w: float = 1., h: float = 1.,
    dtype: torch.dtype = torch.float32,
    device: torch.device = torch.device('cpu'),
):
    """
    Build a normalized intrinsics tensor [[fx, fy], [cx, cy], [1, 1]].

    The focal length f is shared by both axes; the principal point is given
    either as a single c or as a cx/cy pair. Values are divided by w (x terms)
    and h (y terms).

    return: (3, 2)
    """
    if c is None:
        assert cx is not None and cy is not None, "cx/cy must be provided when c is not provided"
    else:
        assert cx is None and cy is None, "c and cx/cy cannot be used together"
        cx = cy = c
    rows = [
        [f / w, f / h],
        [cx / w, cy / h],
        [1., 1.],
    ]
    return torch.tensor(rows, dtype=dtype, device=device)
openlrm/launch.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import argparse
17
+
18
+ from openlrm.runners import REGISTRY_RUNNERS
19
+
20
+
21
def main():
    """Read the runner name from the command line and run the matching registered runner."""
    cli = argparse.ArgumentParser(description='OpenLRM launcher')
    cli.add_argument('runner', type=str, help='Runner to launch')
    # parse_known_args tolerates extra arguments; this function does not use them.
    parsed, _ = cli.parse_known_args()
    runner_name = parsed.runner

    if runner_name not in REGISTRY_RUNNERS:
        raise ValueError('Runner {} not found'.format(runner_name))

    # The runner class is used as a context manager around run().
    with REGISTRY_RUNNERS[runner_name]() as runner:
        runner.run()


if __name__ == '__main__':
    main()
openlrm/losses/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from .pixelwise import *
17
+ from .perceptual import *
18
+ from .tvloss import *
openlrm/losses/perceptual.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+ __all__ = ['LPIPSLoss']
20
+
21
+
22
class LPIPSLoss(nn.Module):
    """
    LPIPS perceptual loss between two batches of multi-view images.
    """

    def __init__(self, device, prefech: bool = False):
        """
        Args:
            device: device the LPIPS networks are moved to.
            prefech: when True, build both LPIPS networks immediately.
        """
        super().__init__()
        self.device = device
        self.cached_models = {}
        if prefech:
            self.prefetch_models()

    def _get_model(self, model_name: str):
        """Return the LPIPS network for ``model_name``, building and caching it on first use."""
        if model_name in self.cached_models:
            return self.cached_models[model_name]

        import warnings
        # UserWarnings raised while importing/building lpips are silenced.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=UserWarning)
            import lpips
            network = lpips.LPIPS(net=model_name, eval_mode=True, verbose=False).to(self.device)
        network = torch.compile(network)
        self.cached_models[model_name] = network
        return network

    def prefetch_models(self):
        """Build and cache both the 'alex' and 'vgg' LPIPS networks."""
        for model_name in ('alex', 'vgg'):
            self._get_model(model_name)

    def forward(self, x, y, is_training: bool = True):
        """
        Assume images are 0-1 scaled and channel first.

        Args:
            x: [N, M, C, H, W]
            y: [N, M, C, H, W]
            is_training: use the VGG network when True, AlexNet otherwise.

        Returns:
            Mean-reduced LPIPS loss across batch.
        """
        loss_fn = self._get_model('vgg' if is_training else 'alex')
        N, M, C, H, W = x.shape
        # Fold the view dimension into the batch so the network sees 4D image tensors.
        flat_shape = (N * M, C, H, W)
        per_image = loss_fn(x.reshape(flat_shape), y.reshape(flat_shape), normalize=True).mean(dim=[1, 2, 3])
        per_sample = per_image.reshape(N, M).mean(dim=1)
        return per_sample.mean()
openlrm/losses/pixelwise.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+ __all__ = ['PixelLoss']
20
+
21
+
22
class PixelLoss(nn.Module):
    """
    Per-pixel loss (MSE or L1) between two batches of multi-view images.
    """

    def __init__(self, option: str = 'mse'):
        super().__init__()
        self.loss_fn = self._build_from_option(option)

    @staticmethod
    def _build_from_option(option: str, reduction: str = 'none'):
        """
        Return the torch loss module for ``option`` ('mse' or 'l1').

        Raises:
            NotImplementedError: for any other option.
        """
        loss_classes = {'mse': nn.MSELoss, 'l1': nn.L1Loss}
        if option not in loss_classes:
            raise NotImplementedError(f'Unknown pixel loss option: {option}')
        return loss_classes[option](reduction=reduction)

    @torch.compile
    def forward(self, x, y):
        """
        Assume images are channel first.

        Args:
            x: [N, M, C, H, W]
            y: [N, M, C, H, W]

        Returns:
            Mean-reduced pixel loss across batch.
        """
        N, M, C, H, W = x.shape
        flat_x = x.reshape(N * M, C, H, W)
        flat_y = y.reshape(N * M, C, H, W)
        # Average over channels and pixels per image, then over views, then over the batch.
        per_image = self.loss_fn(flat_x, flat_y).mean(dim=[1, 2, 3])
        per_sample = per_image.reshape(N, M).mean(dim=1)
        return per_sample.mean()
openlrm/losses/tvloss.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+ __all__ = ['TVLoss']
20
+
21
+
22
class TVLoss(nn.Module):
    """
    Total variation loss.
    """

    def __init__(self):
        super().__init__()

    def numel_excluding_first_dim(self, x):
        """Return the number of elements of ``x`` per entry of its first dimension."""
        return x.numel() // x.shape[0]

    @torch.compile
    def forward(self, x):
        """
        Assume batched and channel first with inner sizes.

        Args:
            x: [N, M, C, H, W]

        Returns:
            Mean-reduced TV loss with element-level scaling.
        """
        N, M, C, H, W = x.shape
        frames = x.reshape(N * M, C, H, W)
        # Differences between neighbouring rows (i) and neighbouring columns (j).
        diff_i = frames[..., 1:, :] - frames[..., :-1, :]
        diff_j = frames[..., :, 1:] - frames[..., :, :-1]
        # Each squared-difference sum is divided by its own per-image element count.
        tv_i = diff_i.pow(2).sum(dim=[1, 2, 3]) / self.numel_excluding_first_dim(diff_i)
        tv_j = diff_j.pow(2).sum(dim=[1, 2, 3]) / self.numel_excluding_first_dim(diff_j)
        per_image = tv_i + tv_j
        per_sample = per_image.reshape(N, M).mean(dim=1)
        return per_sample.mean()
openlrm/models/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from .modeling_lrm import ModelLRM
17
+
18
+
19
# Lookup table from a model type name to the class implementing it.
model_dict = {
    'lrm': ModelLRM,
}
openlrm/models/block.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch.nn as nn
17
+
18
+ from .modulate import ModLN
19
+
20
+
21
class BasicBlock(nn.Module):
    """
    Plain transformer block: LayerNorm + self-attention, then LayerNorm + MLP,
    each added back to its input as a residual.
    Designed for PF-LRM architecture.
    """
    def __init__(self, inner_dim: int, num_heads: int, eps: float,
                 attn_drop: float = 0., attn_bias: bool = False,
                 mlp_ratio: float = 4., mlp_drop: float = 0.):
        super().__init__()
        hidden_dim = int(inner_dim * mlp_ratio)
        self.norm1 = nn.LayerNorm(inner_dim, eps=eps)
        self.self_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm2 = nn.LayerNorm(inner_dim, eps=eps)
        self.mlp = nn.Sequential(
            nn.Linear(inner_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(mlp_drop),
            nn.Linear(hidden_dim, inner_dim),
            nn.Dropout(mlp_drop),
        )

    def forward(self, x):
        # x: [N, L, D]
        normed = self.norm1(x)
        attn_out = self.self_attn(normed, normed, normed, need_weights=False)[0]
        x = x + attn_out
        return x + self.mlp(self.norm2(x))
50
+
51
+
52
class ConditionBlock(nn.Module):
    """
    Transformer block conditioned through cross-attention: cross-attention to
    the condition, self-attention, then an MLP, each with LayerNorm and a residual.
    Designed for SparseLRM architecture.
    """
    def __init__(self, inner_dim: int, cond_dim: int, num_heads: int, eps: float,
                 attn_drop: float = 0., attn_bias: bool = False,
                 mlp_ratio: float = 4., mlp_drop: float = 0.):
        super().__init__()
        hidden_dim = int(inner_dim * mlp_ratio)
        self.norm1 = nn.LayerNorm(inner_dim, eps=eps)
        # Keys and values come from the condition, hence kdim/vdim = cond_dim.
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads, kdim=cond_dim, vdim=cond_dim,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm2 = nn.LayerNorm(inner_dim, eps=eps)
        self.self_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm3 = nn.LayerNorm(inner_dim, eps=eps)
        self.mlp = nn.Sequential(
            nn.Linear(inner_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(mlp_drop),
            nn.Linear(hidden_dim, inner_dim),
            nn.Dropout(mlp_drop),
        )

    def forward(self, x, cond):
        # x: [N, L, D]
        # cond: [N, L_cond, D_cond]
        cross_out = self.cross_attn(self.norm1(x), cond, cond, need_weights=False)[0]
        x = x + cross_out
        normed = self.norm2(x)
        self_out = self.self_attn(normed, normed, normed, need_weights=False)[0]
        x = x + self_out
        return x + self.mlp(self.norm3(x))
87
+
88
+
89
class ConditionModulationBlock(nn.Module):
    """
    Transformer block with a cross-attention condition and a modulation vector
    that is passed to each of the three ModLN normalization layers.
    Designed for raw LRM architecture.
    """
    def __init__(self, inner_dim: int, cond_dim: int, mod_dim: int, num_heads: int, eps: float,
                 attn_drop: float = 0., attn_bias: bool = False,
                 mlp_ratio: float = 4., mlp_drop: float = 0.):
        super().__init__()
        hidden_dim = int(inner_dim * mlp_ratio)
        self.norm1 = ModLN(inner_dim, mod_dim, eps)
        # Keys and values come from the condition, hence kdim/vdim = cond_dim.
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads, kdim=cond_dim, vdim=cond_dim,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm2 = ModLN(inner_dim, mod_dim, eps)
        self.self_attn = nn.MultiheadAttention(
            embed_dim=inner_dim, num_heads=num_heads,
            dropout=attn_drop, bias=attn_bias, batch_first=True)
        self.norm3 = ModLN(inner_dim, mod_dim, eps)
        self.mlp = nn.Sequential(
            nn.Linear(inner_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(mlp_drop),
            nn.Linear(hidden_dim, inner_dim),
            nn.Dropout(mlp_drop),
        )

    def forward(self, x, cond, mod):
        # x: [N, L, D]
        # cond: [N, L_cond, D_cond]
        # mod: [N, D_mod]
        cross_out = self.cross_attn(self.norm1(x, mod), cond, cond, need_weights=False)[0]
        x = x + cross_out
        normed = self.norm2(x, mod)
        self_out = self.self_attn(normed, normed, normed, need_weights=False)[0]
        x = x + self_out
        return x + self.mlp(self.norm3(x, mod))
openlrm/models/embedder.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+
20
class CameraEmbedder(nn.Module):
    """
    Map raw camera features to an embedding with a two-layer SiLU MLP.

    Reference:
    DiT: https://github.com/facebookresearch/DiT/blob/main/models.py#L27
    """
    def __init__(self, raw_dim: int, embed_dim: int):
        super().__init__()
        layers = [
            nn.Linear(raw_dim, embed_dim),
            nn.SiLU(),
            nn.Linear(embed_dim, embed_dim),
        ]
        self.mlp = nn.Sequential(*layers)

    @torch.compile
    def forward(self, x):
        embedded = self.mlp(x)
        return embedded
openlrm/models/encoders/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # Empty
openlrm/models/encoders/dino_wrapper.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from transformers import ViTImageProcessor, ViTModel
19
+ from accelerate.logging import get_logger
20
+
21
+
22
+ logger = get_logger(__name__)
23
+
24
+
25
class DinoWrapper(nn.Module):
    """
    Dino v1 wrapper using huggingface transformer implementation.
    """
    def __init__(self, model_name: str, freeze: bool = True):
        """
        Args:
            model_name: name passed to ``from_pretrained`` for both model and processor.
            freeze: when True, put the model in eval mode and disable its gradients.
        """
        super().__init__()
        self.model, self.processor = self._build_dino(model_name)
        if freeze:
            self._freeze()

    @torch.compile
    def forward_model(self, inputs):
        """Run the ViT on processor outputs with positional-encoding interpolation enabled."""
        return self.model(**inputs, interpolate_pos_encoding=True)

    def forward(self, image):
        # image: [N, C, H, W], on cpu
        # RGB image with [0,1] scale and properly sized
        processed = self.processor(images=image, return_tensors="pt", do_rescale=False, do_resize=False)
        inputs = processed.to(self.model.device)
        # This resampling of positional embedding uses bicubic interpolation
        return self.forward_model(inputs).last_hidden_state

    def _freeze(self):
        """Switch the model to eval mode and stop gradients for all of its parameters."""
        logger.warning("======== Freezing DinoWrapper ========")
        self.model.eval()
        for param in self.model.parameters():
            param.requires_grad = False

    @staticmethod
    def _build_dino(model_name: str, proxy_error_retries: int = 3, proxy_error_cooldown: int = 5):
        """
        Load the ViT model and its image processor, retrying on ProxyError.

        Each ProxyError is retried after ``proxy_error_cooldown`` seconds until
        ``proxy_error_retries`` retries are used up; the error is then re-raised.
        """
        import time
        import requests
        retries_left = proxy_error_retries
        while True:
            try:
                model = ViTModel.from_pretrained(model_name, add_pooling_layer=False)
                processor = ViTImageProcessor.from_pretrained(model_name)
                return model, processor
            except requests.exceptions.ProxyError as err:
                if retries_left <= 0:
                    raise err
                print(f"Huggingface ProxyError: Retrying ({retries_left}) in {proxy_error_cooldown} seconds...")
                time.sleep(proxy_error_cooldown)
                retries_left -= 1
openlrm/models/encoders/dinov2/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023-2024, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # Empty
openlrm/models/encoders/dinov2/hub/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
openlrm/models/encoders/dinov2/hub/backbones.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ from typing import Union
8
+
9
+ import torch
10
+
11
+ from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
12
+
13
+
14
class Weights(Enum):
    """Pretrained weight sets accepted by the backbone factory functions."""
    # The only member; it is the default `weights` value of every factory in this module.
    LVD142M = "LVD142M"
16
+
17
+
18
def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    """
    Build a DINOv2 vision transformer and optionally load its pretrained weights.

    The named parameters (except ``arch_name``, ``pretrained`` and ``weights``)
    and any extra ``kwargs`` are forwarded to the constructor looked up by
    ``arch_name`` in the vision_transformer module. A string ``weights`` must
    be the name of a ``Weights`` member, otherwise AssertionError is raised.
    """
    from ..models import vision_transformer as vits

    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
    # Extra kwargs come last so they override the named defaults.
    vit_kwargs = {
        "img_size": img_size,
        "patch_size": patch_size,
        "init_values": init_values,
        "ffn_layer": ffn_layer,
        "block_chunks": block_chunks,
        "num_register_tokens": num_register_tokens,
        "interpolate_antialias": interpolate_antialias,
        "interpolate_offset": interpolate_offset,
        **kwargs,
    }
    model = vits.__dict__[arch_name](**vit_kwargs)

    if not pretrained:
        return model

    model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
    url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
    state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
    # ********** Modified by Zexin He in 2023-2024 **********
    # Entries whose key contains 'mask_token' are dropped (DDP concern).
    state_dict = {key: value for key, value in state_dict.items() if 'mask_token' not in key}
    uses_modulation = vit_kwargs.get("modulation_dim") is not None
    if uses_modulation:
        # Checkpoint keys for norm1/norm2 are remapped to norm1.norm/norm2.norm;
        # loading is then non-strict.
        state_dict = {
            key.replace('norm1', 'norm1.norm').replace('norm2', 'norm2.norm'): value
            for key, value in state_dict.items()
        }
    model.load_state_dict(state_dict, strict=not uses_modulation)
    # ********************************************************

    return model
72
+
73
+
74
def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Build the DINOv2 ViT-S/14 backbone, by default with LVD-142M pretrained weights.
    """
    return _make_dinov2_model(pretrained=pretrained, weights=weights, arch_name="vit_small", **kwargs)
79
+
80
+
81
def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Build the DINOv2 ViT-B/14 backbone, by default with LVD-142M pretrained weights.
    """
    return _make_dinov2_model(pretrained=pretrained, weights=weights, arch_name="vit_base", **kwargs)
86
+
87
+
88
def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Build the DINOv2 ViT-L/14 backbone, by default with LVD-142M pretrained weights.
    """
    return _make_dinov2_model(pretrained=pretrained, weights=weights, arch_name="vit_large", **kwargs)
93
+
94
+
95
def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Build the DINOv2 ViT-g/14 backbone (fused SwiGLU feed-forward), by default
    with LVD-142M pretrained weights.
    """
    return _make_dinov2_model(
        pretrained=pretrained,
        weights=weights,
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        **kwargs,
    )
106
+
107
+
108
def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Build the DINOv2 ViT-S/14 backbone with 4 register tokens, by default with
    LVD-142M pretrained weights.
    """
    return _make_dinov2_model(
        pretrained=pretrained,
        weights=weights,
        arch_name="vit_small",
        num_register_tokens=4,
        interpolate_offset=0.0,
        interpolate_antialias=True,
        **kwargs,
    )
121
+
122
+
123
def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Build the DINOv2 ViT-B/14 backbone with 4 register tokens, by default with
    LVD-142M pretrained weights.
    """
    return _make_dinov2_model(
        pretrained=pretrained,
        weights=weights,
        arch_name="vit_base",
        num_register_tokens=4,
        interpolate_offset=0.0,
        interpolate_antialias=True,
        **kwargs,
    )
136
+
137
+
138
def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Build the DINOv2 ViT-L/14 backbone with 4 register tokens, by default with
    LVD-142M pretrained weights.
    """
    return _make_dinov2_model(
        pretrained=pretrained,
        weights=weights,
        arch_name="vit_large",
        num_register_tokens=4,
        interpolate_offset=0.0,
        interpolate_antialias=True,
        **kwargs,
    )
151
+
152
+
153
def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    Build the DINOv2 ViT-g/14 backbone (fused SwiGLU feed-forward) with 4
    register tokens, by default with LVD-142M pretrained weights.
    """
    return _make_dinov2_model(
        pretrained=pretrained,
        weights=weights,
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        num_register_tokens=4,
        interpolate_offset=0.0,
        interpolate_antialias=True,
        **kwargs,
    )
openlrm/models/encoders/dinov2/hub/classifiers.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ from typing import Union
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+
12
+ from .backbones import _make_dinov2_model
13
+ from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
14
+
15
+
16
class Weights(Enum):
    """Names of the weight sets accepted by the linear-classifier factories in this module."""

    # Only one weight set is declared; it is the default of every `weights` parameter below.
    IMAGENET1K = "IMAGENET1K"
18
+
19
+
20
def _make_dinov2_linear_classification_head(
    *,
    arch_name: str = "vit_large",
    patch_size: int = 14,
    embed_dim: int = 1024,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    num_register_tokens: int = 0,
    **kwargs,
):
    """Build the 1000-way linear head used on top of a DINOv2 backbone.

    Args:
        arch_name: Architecture name used to derive the checkpoint file name.
        patch_size: Patch size used to derive the checkpoint file name.
        embed_dim: Feature width of the backbone; the head input is
            ``(1 + layers) * embed_dim`` wide.
        layers: Number of backbone layers feeding the head; must be 1 or 4.
        pretrained: When true, download the head's state dict and load it strictly.
        weights: A ``Weights`` member or the name of one.
        num_register_tokens: Used to derive the checkpoint file name.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The ``nn.Linear`` head.

    Raises:
        AssertionError: If ``layers`` is not 1 or 4, or ``weights`` is a string
            that does not name a ``Weights`` member.
    """
    if layers != 1 and layers != 4:
        raise AssertionError(f"Unsupported number of layers: {layers}")

    # A string is resolved by member name; an unknown name is reported as an AssertionError.
    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    head = nn.Linear((1 + layers) * embed_dim, 1_000)
    if not pretrained:
        return head

    base_name = _make_dinov2_model_name(arch_name, patch_size)
    full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
    # The single-layer head's file name carries no layer count.
    suffix = str(layers) if layers == 4 else ""
    checkpoint_url = _DINOV2_BASE_URL + f"/{base_name}/{full_name}_linear{suffix}_head.pth"
    head.load_state_dict(
        torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu"),
        strict=True,
    )
    return head
50
+
51
+
52
+ class _LinearClassifierWrapper(nn.Module):
53
+ def __init__(self, *, backbone: nn.Module, linear_head: nn.Module, layers: int = 4):
54
+ super().__init__()
55
+ self.backbone = backbone
56
+ self.linear_head = linear_head
57
+ self.layers = layers
58
+
59
+ def forward(self, x):
60
+ if self.layers == 1:
61
+ x = self.backbone.forward_features(x)
62
+ cls_token = x["x_norm_clstoken"]
63
+ patch_tokens = x["x_norm_patchtokens"]
64
+ # fmt: off
65
+ linear_input = torch.cat([
66
+ cls_token,
67
+ patch_tokens.mean(dim=1),
68
+ ], dim=1)
69
+ # fmt: on
70
+ elif self.layers == 4:
71
+ x = self.backbone.get_intermediate_layers(x, n=4, return_class_token=True)
72
+ # fmt: off
73
+ linear_input = torch.cat([
74
+ x[0][1],
75
+ x[1][1],
76
+ x[2][1],
77
+ x[3][1],
78
+ x[3][0].mean(dim=1),
79
+ ], dim=1)
80
+ # fmt: on
81
+ else:
82
+ assert False, f"Unsupported number of layers: {self.layers}"
83
+ return self.linear_head(linear_input)
84
+
85
+
86
def _make_dinov2_linear_classifier(
    *,
    arch_name: str = "vit_large",
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    **kwargs,
):
    """Assemble a backbone and its linear head into one classifier module.

    Args:
        arch_name: Architecture name shared by the backbone and the head.
        layers: Number of backbone layers feeding the head.
        pretrained: Forwarded to both the backbone and the head factories.
        weights: Forwarded to the head factory only.
        num_register_tokens: Forwarded to both the backbone and the head factories.
        interpolate_antialias: Forwarded to the backbone factory.
        interpolate_offset: Forwarded to the backbone factory.
        **kwargs: Extra keyword arguments forwarded to the backbone factory.

    Returns:
        A ``_LinearClassifierWrapper`` around the backbone and the head.
    """
    feature_extractor = _make_dinov2_model(
        arch_name=arch_name,
        pretrained=pretrained,
        num_register_tokens=num_register_tokens,
        interpolate_antialias=interpolate_antialias,
        interpolate_offset=interpolate_offset,
        **kwargs,
    )

    # The head is sized and named from what the backbone reports about itself.
    head = _make_dinov2_linear_classification_head(
        arch_name=arch_name,
        embed_dim=feature_extractor.embed_dim,
        patch_size=feature_extractor.patch_size,
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=num_register_tokens,
    )

    return _LinearClassifierWrapper(backbone=feature_extractor, linear_head=head, layers=layers)
119
+
120
+
121
def dinov2_vits14_lc(
    *,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    **kwargs,
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.

    Args:
        layers: Number of backbone layers feeding the linear head.
        pretrained: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        weights: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        **kwargs: Extra keyword arguments forwarded to ``_make_dinov2_linear_classifier``.

    Returns:
        The classifier built by ``_make_dinov2_linear_classifier`` for ``"vit_small"``.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_small",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
138
+
139
+
140
def dinov2_vitb14_lc(
    *,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    **kwargs,
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.

    Args:
        layers: Number of backbone layers feeding the linear head.
        pretrained: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        weights: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        **kwargs: Extra keyword arguments forwarded to ``_make_dinov2_linear_classifier``.

    Returns:
        The classifier built by ``_make_dinov2_linear_classifier`` for ``"vit_base"``.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_base",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
157
+
158
+
159
def dinov2_vitl14_lc(
    *,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    **kwargs,
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.

    Args:
        layers: Number of backbone layers feeding the linear head.
        pretrained: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        weights: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        **kwargs: Extra keyword arguments forwarded to ``_make_dinov2_linear_classifier``.

    Returns:
        The classifier built by ``_make_dinov2_linear_classifier`` for ``"vit_large"``.
    """
    return _make_dinov2_linear_classifier(
        arch_name="vit_large",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
176
+
177
+
178
def dinov2_vitg14_lc(
    *,
    layers: int = 4,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.IMAGENET1K,
    **kwargs,
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.

    Args:
        layers: Number of backbone layers feeding the linear head.
        pretrained: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        weights: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        **kwargs: Extra keyword arguments forwarded to ``_make_dinov2_linear_classifier``.

    Returns:
        The classifier built by ``_make_dinov2_linear_classifier`` for ``"vit_giant2"``.
    """
    # The giant variant additionally requests the "swiglufused" feed-forward layer.
    return _make_dinov2_linear_classifier(
        arch_name="vit_giant2",
        layers=layers,
        ffn_layer="swiglufused",
        pretrained=pretrained,
        weights=weights,
        **kwargs,
    )
196
+
197
+
198
def dinov2_vits14_reg_lc(
    *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.

    Args:
        layers: Number of backbone layers feeding the linear head.
        pretrained: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        weights: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        **kwargs: Extra keyword arguments forwarded to ``_make_dinov2_linear_classifier``.

    Returns:
        The classifier built by ``_make_dinov2_linear_classifier`` for ``"vit_small"``.
    """
    # Register variant: four register tokens, antialiased interpolation, zero offset.
    return _make_dinov2_linear_classifier(
        arch_name="vit_small",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
214
+
215
+
216
def dinov2_vitb14_reg_lc(
    *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.

    Args:
        layers: Number of backbone layers feeding the linear head.
        pretrained: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        weights: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        **kwargs: Extra keyword arguments forwarded to ``_make_dinov2_linear_classifier``.

    Returns:
        The classifier built by ``_make_dinov2_linear_classifier`` for ``"vit_base"``.
    """
    # Register variant: four register tokens, antialiased interpolation, zero offset.
    return _make_dinov2_linear_classifier(
        arch_name="vit_base",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
232
+
233
+
234
def dinov2_vitl14_reg_lc(
    *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.

    Args:
        layers: Number of backbone layers feeding the linear head.
        pretrained: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        weights: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        **kwargs: Extra keyword arguments forwarded to ``_make_dinov2_linear_classifier``.

    Returns:
        The classifier built by ``_make_dinov2_linear_classifier`` for ``"vit_large"``.
    """
    # Register variant: four register tokens, antialiased interpolation, zero offset.
    return _make_dinov2_linear_classifier(
        arch_name="vit_large",
        layers=layers,
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
250
+
251
+
252
def dinov2_vitg14_reg_lc(
    *, layers: int = 4, pretrained: bool = True, weights: Union[Weights, str] = Weights.IMAGENET1K, **kwargs
):
    """
    Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone with registers (optionally) pretrained on the LVD-142M dataset and trained on ImageNet-1k.

    Args:
        layers: Number of backbone layers feeding the linear head.
        pretrained: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        weights: Forwarded unchanged to ``_make_dinov2_linear_classifier``.
        **kwargs: Extra keyword arguments forwarded to ``_make_dinov2_linear_classifier``.

    Returns:
        The classifier built by ``_make_dinov2_linear_classifier`` for ``"vit_giant2"``.
    """
    # Giant register variant: "swiglufused" feed-forward layer plus four register
    # tokens, antialiased interpolation and a zero interpolation offset.
    return _make_dinov2_linear_classifier(
        arch_name="vit_giant2",
        layers=layers,
        ffn_layer="swiglufused",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
openlrm/models/encoders/dinov2/hub/depth/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .decode_heads import BNHead, DPTHead
7
+ from .encoder_decoder import DepthEncoderDecoder
openlrm/models/encoders/dinov2/hub/depth/decode_heads.py ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import copy
7
+ from functools import partial
8
+ import math
9
+ import warnings
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+ from .ops import resize
15
+
16
+
17
+ # XXX: (Untested) replacement for mmcv.imdenormalize()
18
+ def _imdenormalize(img, mean, std, to_bgr=True):
19
+ import numpy as np
20
+
21
+ mean = mean.reshape(1, -1).astype(np.float64)
22
+ std = std.reshape(1, -1).astype(np.float64)
23
+ img = (img * std) + mean
24
+ if to_bgr:
25
+ img = img[::-1]
26
+ return img
27
+
28
+
29
class DepthBaseDecodeHead(nn.Module):
    """Base class for BaseDecodeHead.

    Args:
        in_channels (List): Input channels.
        channels (int): Channels after modules, before conv_depth.
        conv_layer (nn.Module): Conv layers. Default: None.
        act_layer (nn.Module): Activation layers. Default: nn.ReLU.
        loss_decode (dict): Config of decode loss. A single loss module or a
            sequence of them (``nn.ModuleList``, list or tuple).
            Default: ().
        sampler (dict|None): The config of depth map sampler. Accepted but not
            stored by this class.
            Default: None.
        align_corners (bool): align_corners argument of F.interpolate.
            Default: False.
        min_depth (float): Min depth in dataset setting.
            Default: 1e-3.
        max_depth (float): Max depth in dataset setting.
            Default: None.
        norm_layer (dict|None): Norm layers.
            Default: None.
        classify (bool): Whether predict depth in a cls.-reg. manner.
            Default: False.
        n_bins (int): The number of bins used in cls. step.
            Default: 256.
        bins_strategy (str): The discrete strategy used in cls. step.
            Default: 'UD'.
        norm_strategy (str): The norm strategy on cls. probability
            distribution. Default: 'linear'
        scale_up (bool): Whether predict depth in a scale-up manner.
            Default: False.
    """

    def __init__(
        self,
        in_channels,
        conv_layer=None,
        act_layer=nn.ReLU,
        channels=96,
        loss_decode=(),
        sampler=None,
        align_corners=False,
        min_depth=1e-3,
        max_depth=None,
        norm_layer=None,
        classify=False,
        n_bins=256,
        bins_strategy="UD",
        norm_strategy="linear",
        scale_up=False,
    ):
        super(DepthBaseDecodeHead, self).__init__()

        self.in_channels = in_channels
        self.channels = channels
        self.conv_layer = conv_layer
        # Historical misspelling of `conv_layer`; kept as an alias so code that
        # reads the old attribute name keeps working.
        self.conf_layer = conv_layer
        self.act_layer = act_layer
        self.loss_decode = loss_decode
        self.align_corners = align_corners
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.norm_layer = norm_layer
        self.classify = classify
        self.n_bins = n_bins
        self.scale_up = scale_up

        if self.classify:
            assert bins_strategy in ["UD", "SID"], "Support bins_strategy: UD, SID"
            assert norm_strategy in ["linear", "softmax", "sigmoid"], "Support norm_strategy: linear, softmax, sigmoid"

            self.bins_strategy = bins_strategy
            self.norm_strategy = norm_strategy
            self.softmax = nn.Softmax(dim=1)
            # One output channel per depth bin.
            self.conv_depth = nn.Conv2d(channels, n_bins, kernel_size=3, padding=1, stride=1)
        else:
            self.conv_depth = nn.Conv2d(channels, 1, kernel_size=3, padding=1, stride=1)

        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs, img_metas):
        """Placeholder of forward function."""
        pass

    def forward_train(self, img, inputs, img_metas, depth_gt):
        """Forward function for training.
        Args:
            inputs (list[Tensor]): List of multi-level img features.
            img_metas (list[dict]): List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `depth/datasets/pipelines/formatting.py:Collect`.
            depth_gt (Tensor): GT depth

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        depth_pred = self.forward(inputs, img_metas)
        losses = self.losses(depth_pred, depth_gt)

        # Only the first sample of the batch is logged.
        log_imgs = self.log_images(img[0], depth_pred[0], depth_gt[0], img_metas[0])
        losses.update(**log_imgs)

        return losses

    def forward_test(self, inputs, img_metas):
        """Forward function for testing.
        Args:
            inputs (list[Tensor]): List of multi-level img features.
            img_metas (list[dict]): List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `depth/datasets/pipelines/formatting.py:Collect`.

        Returns:
            Tensor: Output depth map.
        """
        return self.forward(inputs, img_metas)

    def depth_pred(self, feat):
        """Prediction each pixel.

        In classification mode the per-bin scores are normalised into weights
        and the output is the weighted sum of the bin depths; otherwise the
        single-channel conv output is mapped to a depth directly.
        """
        if self.classify:
            logit = self.conv_depth(feat)

            if self.bins_strategy == "UD":
                # Uniform discretisation: bins evenly spaced in depth.
                bins = torch.linspace(self.min_depth, self.max_depth, self.n_bins, device=feat.device)
            elif self.bins_strategy == "SID":
                # Spacing-increasing discretisation: bins evenly spaced in log
                # depth. torch.logspace takes base-10 exponents, so the depth
                # limits are converted first (requires min_depth > 0).
                bins = torch.logspace(
                    math.log10(self.min_depth), math.log10(self.max_depth), self.n_bins, device=feat.device
                )

            # following Adabins, default linear
            if self.norm_strategy == "linear":
                logit = torch.relu(logit)
                eps = 0.1
                logit = logit + eps
                logit = logit / logit.sum(dim=1, keepdim=True)
            elif self.norm_strategy == "softmax":
                logit = torch.softmax(logit, dim=1)
            elif self.norm_strategy == "sigmoid":
                logit = torch.sigmoid(logit)
                logit = logit / logit.sum(dim=1, keepdim=True)

            # Weighted sum over the bin axis, then restore a channel axis of size 1.
            output = torch.einsum("ikmn,k->imn", logit, bins).unsqueeze(dim=1)

        else:
            if self.scale_up:
                output = self.sigmoid(self.conv_depth(feat)) * self.max_depth
            else:
                output = self.relu(self.conv_depth(feat)) + self.min_depth
        return output

    def losses(self, depth_pred, depth_gt):
        """Compute depth loss.

        The prediction is resized to the ground-truth spatial size, then every
        configured loss is evaluated; losses sharing a ``loss_name`` are summed.
        """
        loss = dict()
        depth_pred = resize(
            input=depth_pred, size=depth_gt.shape[2:], mode="bilinear", align_corners=self.align_corners, warning=False
        )
        # Plain lists/tuples (including the empty default) are iterated like a
        # ModuleList; anything else is treated as a single loss module.
        if isinstance(self.loss_decode, (nn.ModuleList, list, tuple)):
            losses_decode = self.loss_decode
        else:
            losses_decode = [self.loss_decode]
        for loss_decode in losses_decode:
            if loss_decode.loss_name not in loss:
                loss[loss_decode.loss_name] = loss_decode(depth_pred, depth_gt)
            else:
                loss[loss_decode.loss_name] += loss_decode(depth_pred, depth_gt)
        return loss

    def log_images(self, img_path, depth_pred, depth_gt, img_meta):
        """Build the image/depth entries that ``forward_train`` adds to the loss dict.

        Despite its name, ``img_path`` is used as a tensor (it is detached,
        moved to CPU and permuted), not as a file path.
        """
        import numpy as np

        # Channel-first tensor -> channel-last float32 array for denormalisation.
        show_img = copy.deepcopy(img_path.detach().cpu().permute(1, 2, 0))
        show_img = show_img.numpy().astype(np.float32)
        show_img = _imdenormalize(
            show_img,
            img_meta["img_norm_cfg"]["mean"],
            img_meta["img_norm_cfg"]["std"],
            img_meta["img_norm_cfg"]["to_rgb"],
        )
        show_img = np.clip(show_img, 0, 255)
        show_img = show_img.astype(np.uint8)
        show_img = show_img[:, :, ::-1]
        # The two transposes together move the channel axis back to the front.
        show_img = show_img.transpose(0, 2, 1)
        show_img = show_img.transpose(1, 0, 2)

        # Both depth maps are scaled by their own maximum.
        depth_pred = depth_pred / torch.max(depth_pred)
        depth_gt = depth_gt / torch.max(depth_gt)

        depth_pred_color = copy.deepcopy(depth_pred.detach().cpu())
        depth_gt_color = copy.deepcopy(depth_gt.detach().cpu())

        return {"img_rgb": show_img, "img_depth_pred": depth_pred_color, "img_depth_gt": depth_gt_color}
221
+
222
+
223
class BNHead(DepthBaseDecodeHead):
    """Just a batchnorm."""

    def __init__(self, input_transform="resize_concat", in_index=(0, 1, 2, 3), upsample=1, **kwargs):
        super().__init__(**kwargs)
        self.input_transform = input_transform
        self.in_index = in_index
        self.upsample = upsample
        # self.bn = nn.SyncBatchNorm(self.in_channels)
        # Replace the base class's 3x3 prediction conv with a 1x1 one.
        depth_channels = self.n_bins if self.classify else 1
        self.conv_depth = nn.Conv2d(self.channels, depth_channels, kernel_size=1, padding=0, stride=1)

    def _transform_inputs(self, inputs):
        """Select (and optionally resize and concatenate) the decoder inputs.

        Args:
            inputs (list[Tensor]): List of multi-level img features.
        Returns:
            Tensor: The transformed inputs
        """
        mode = self.input_transform

        if "concat" not in mode:
            if mode == "multiple_select":
                return [inputs[i] for i in self.in_index]
            return inputs[self.in_index]

        picked = [inputs[i] for i in self.in_index]
        if "resize" in mode:
            # Every selected level is resized to the first level's spatial size
            # scaled by `upsample`.
            picked = [
                resize(
                    input=feat,
                    size=[dim * self.upsample for dim in picked[0].shape[2:]],
                    mode="bilinear",
                    align_corners=self.align_corners,
                )
                for feat in picked
            ]
        return torch.cat(picked, dim=1)

    def _forward_feature(self, inputs, img_metas=None, **kwargs):
        """Prepare the multi-level features that ``depth_pred`` consumes.

        Args:
            inputs (list[Tensor]): List of multi-level img features.
        Returns:
            feats (Tensor): A tensor of shape (batch_size, self.channels,
                H, W) which is feature map for last layer of decoder head.
        """
        # accept lists (for cls token)
        inputs = list(inputs)
        for idx, item in enumerate(inputs):
            has_cls_token = len(item) == 2
            feat = item[0]
            if len(feat.shape) == 2:
                # Give a 2-D feature two trailing singleton axes.
                feat = feat[:, :, None, None]
            if has_cls_token:
                # Broadcast the class token over the feature map and stack it
                # on the channel axis.
                cls_map = item[1][:, :, None, None].expand_as(feat)
                feat = torch.cat((feat, cls_map), 1)
            inputs[idx] = feat
        # feats = self.bn(x)
        return self._transform_inputs(inputs)

    def forward(self, inputs, img_metas=None, **kwargs):
        """Forward function."""
        return self.depth_pred(self._forward_feature(inputs, img_metas=img_metas, **kwargs))
297
+
298
+
299
class ConvModule(nn.Module):
    """A conv block that bundles conv/norm/activation layers.

    This block simplifies the usage of convolution layers, which are commonly
    used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Besides, we add some additional features in this module.
    1. Automatically set `bias` of the conv layer.
    2. Spectral norm is supported.
    3. "reflect" padding is supported through an explicit padding layer that
        runs before the convolution.

    Args:
        in_channels (int): Number of channels in the input feature map.
            Same as that in ``nn._ConvNd``.
        out_channels (int): Number of channels produced by the convolution.
            Same as that in ``nn._ConvNd``.
        kernel_size (int | tuple[int]): Size of the convolving kernel.
            Same as that in ``nn._ConvNd``.
        stride (int | tuple[int]): Stride of the convolution.
            Same as that in ``nn._ConvNd``.
        padding (int | tuple[int]): Zero-padding added to both sides of
            the input. Same as that in ``nn._ConvNd``.
        dilation (int | tuple[int]): Spacing between kernel elements.
            Same as that in ``nn._ConvNd``.
        groups (int): Number of blocked connections from input channels to
            output channels. Same as that in ``nn._ConvNd``.
        bias (bool | str): If specified as `auto`, it will be decided by the
            norm_layer. Bias will be set as True if `norm_layer` is None, otherwise
            False. Default: "auto".
        conv_layer (nn.Module): Convolution layer class. Default: nn.Conv2d.
        norm_layer (nn.Module): Normalization layer class (or factory); it is
            called as ``norm_layer(num_features=...)``. Default: None.
        act_layer (nn.Module): Activation layer class (or factory).
            Default: nn.ReLU.
        inplace (bool): Whether to use inplace mode for activation.
            Default: True.
        with_spectral_norm (bool): Whether use spectral norm in conv module.
            Default: False.
        padding_mode (str): 'zeros' and 'circular' leave padding to the conv
            layer's ``padding`` argument; 'reflect' uses an explicit
            ``nn.ReflectionPad2d``. Any other value raises ``AssertionError``.
            Default: 'zeros'.
        order (tuple[str]): The order of conv/norm/activation layers. It is a
            sequence of "conv", "norm" and "act". Common examples are
            ("conv", "norm", "act") and ("act", "conv", "norm").
            Default: ('conv', 'norm', 'act').
    """

    _abbr_ = "conv_block"

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        bias="auto",
        conv_layer=nn.Conv2d,
        norm_layer=None,
        act_layer=nn.ReLU,
        inplace=True,
        with_spectral_norm=False,
        padding_mode="zeros",
        order=("conv", "norm", "act"),
    ):
        super(ConvModule, self).__init__()
        official_padding_mode = ["zeros", "circular"]
        self.conv_layer = conv_layer
        self.norm_layer = norm_layer
        self.act_layer = act_layer
        self.inplace = inplace
        self.with_spectral_norm = with_spectral_norm
        self.with_explicit_padding = padding_mode not in official_padding_mode
        self.order = order
        assert isinstance(self.order, tuple) and len(self.order) == 3
        assert set(order) == set(["conv", "norm", "act"])

        self.with_norm = norm_layer is not None
        self.with_activation = act_layer is not None
        # if the conv layer is before a norm layer, bias is unnecessary.
        if bias == "auto":
            bias = not self.with_norm
        self.with_bias = bias

        if self.with_explicit_padding:
            # Only modes outside `official_padding_mode` reach this branch, so
            # "reflect" is the one that needs its own padding layer.
            if padding_mode == "reflect":
                padding_layer = nn.ReflectionPad2d
            else:
                raise AssertionError(f"Unsupported padding mode: {padding_mode}")
            self.pad = padding_layer(padding)

        # reset padding to 0 for conv module
        conv_padding = 0 if self.with_explicit_padding else padding
        # build convolution layer
        self.conv = self.conv_layer(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=conv_padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups

        if self.with_spectral_norm:
            self.conv = nn.utils.spectral_norm(self.conv)

        # build normalization layers
        if self.with_norm:
            from torch.nn.modules.batchnorm import _BatchNorm
            from torch.nn.modules.instancenorm import _InstanceNorm

            # norm layer is after conv layer
            if order.index("norm") > order.index("conv"):
                norm_channels = out_channels
            else:
                norm_channels = in_channels
            # add_module needs a module instance, so the layer is built here.
            norm = norm_layer(num_features=norm_channels)
            # The submodule must not be registered as "norm": that name is the
            # read-only property below, which resolves `norm_name`.
            if isinstance(norm, _BatchNorm):
                self.norm_name = "bn"
            elif isinstance(norm, _InstanceNorm):
                self.norm_name = "in"
            else:
                self.norm_name = "norm_module"
            self.add_module(self.norm_name, norm)
            if self.with_bias:
                if isinstance(norm, (_BatchNorm, _InstanceNorm)):
                    warnings.warn("Unnecessary conv bias before batch/instance norm")
        else:
            self.norm_name = None

        # build activation layer
        if self.with_activation:
            # nn.Tanh has no 'inplace' argument
            # (nn.Tanh, nn.PReLU, nn.Sigmoid, nn.HSigmoid, nn.Swish, nn.GELU)
            # `act_layer` is a class, so the check is a subclass test; any other
            # callable (e.g. a partial) still receives `inplace`.
            no_inplace = (nn.Tanh, nn.PReLU, nn.Sigmoid, nn.GELU)
            if not (isinstance(act_layer, type) and issubclass(act_layer, no_inplace)):
                act_layer = partial(act_layer, inplace=inplace)
            self.activate = act_layer()

        # Use msra init by default
        self.init_weights()

    @property
    def norm(self):
        """The registered normalization module, or None when there is none."""
        if self.norm_name:
            return getattr(self, self.norm_name)
        else:
            return None

    def init_weights(self):
        # 1. It is mainly for customized conv layers with their own
        #    initialization manners by calling their own ``init_weights()``,
        #    and we do not want ConvModule to override the initialization.
        # 2. For customized conv layers without their own initialization
        #    manners (that is, they don't have their own ``init_weights()``)
        #    and PyTorch's conv layers, they will be initialized by
        #    this method with default ``kaiming_init``.
        # Note: For PyTorch's conv layers, they will be overwritten by our
        #    initialization implementation using default ``kaiming_init``.
        if not hasattr(self.conv, "init_weights"):
            # Inspect the built activation instance; `self.act_layer` holds the
            # class/factory and would never be an instance of nn.LeakyReLU.
            if self.with_activation and isinstance(self.activate, nn.LeakyReLU):
                nonlinearity = "leaky_relu"
                a = self.activate.negative_slope
            else:
                nonlinearity = "relu"
                a = 0
            if hasattr(self.conv, "weight") and self.conv.weight is not None:
                nn.init.kaiming_normal_(self.conv.weight, a=a, mode="fan_out", nonlinearity=nonlinearity)
            if hasattr(self.conv, "bias") and self.conv.bias is not None:
                nn.init.constant_(self.conv.bias, 0)
        if self.with_norm:
            if hasattr(self.norm, "weight") and self.norm.weight is not None:
                nn.init.constant_(self.norm.weight, 1)
            if hasattr(self.norm, "bias") and self.norm.bias is not None:
                nn.init.constant_(self.norm.bias, 0)

    def forward(self, x, activate=True, norm=True):
        """Apply conv/norm/act in ``self.order``.

        Args:
            x: Input tensor.
            activate: Set to False to skip the activation for this call.
            norm: Set to False to skip the normalization for this call.
        """
        for layer in self.order:
            if layer == "conv":
                if self.with_explicit_padding:
                    x = self.pad(x)
                x = self.conv(x)
            elif layer == "norm" and norm and self.with_norm:
                x = self.norm(x)
            elif layer == "act" and activate and self.with_activation:
                x = self.activate(x)
        return x
497
+
498
+
499
class Interpolate(nn.Module):
    """Module wrapper around ``nn.functional.interpolate`` with fixed settings.

    Args:
        scale_factor: Passed as ``scale_factor`` on every call.
        mode: Passed as ``mode`` on every call.
        align_corners: Passed as ``align_corners`` on every call. Default: False.
    """

    def __init__(self, scale_factor, mode, align_corners=False):
        super().__init__()
        self.align_corners = align_corners
        self.mode = mode
        self.scale_factor = scale_factor
        self.interp = nn.functional.interpolate

    def forward(self, x):
        """Resample ``x`` with the settings stored at construction time."""
        return self.interp(
            x,
            scale_factor=self.scale_factor,
            mode=self.mode,
            align_corners=self.align_corners,
        )
510
+
511
+
512
class HeadDepth(nn.Module):
    """Convolutional head that reduces ``features`` channels to one output channel.

    Args:
        features (int): Number of input channels.
    """

    def __init__(self, features):
        super().__init__()
        half = features // 2
        # Layer order is kept as-is so the sequential indices (0..4) are unchanged.
        stages = [
            nn.Conv2d(features, half, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
            nn.Conv2d(half, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
        ]
        self.head = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the sequential head."""
        return self.head(x)
526
+
527
+
528
class ReassembleBlocks(nn.Module):
    """ViTPostProcessBlock, process cls_token in ViT backbone output and
    rearrange the feature vector to feature map.

    Args:
        in_channels (int): ViT feature channels. Default: 768.
        out_channels (List): output channels of each stage.
            Default: [96, 192, 384, 768].
        readout_type (str): Type of readout operation. Default: 'ignore'.
        patch_size (int): The patch size. Default: 16.
    """

    def __init__(self, in_channels=768, out_channels=[96, 192, 384, 768], readout_type="ignore", patch_size=16):
        super().__init__()

        assert readout_type in ["ignore", "add", "project"]
        self.readout_type = readout_type
        self.patch_size = patch_size

        # One 1x1 projection (no activation) per stage.
        self.projects = nn.ModuleList(
            ConvModule(
                in_channels=in_channels,
                out_channels=stage_channels,
                kernel_size=1,
                act_layer=None,
            )
            for stage_channels in out_channels
        )

        # Per-stage rescaling: x4 up, x2 up, unchanged, stride-2 down.
        up4 = nn.ConvTranspose2d(
            in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
        )
        up2 = nn.ConvTranspose2d(
            in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
        )
        keep = nn.Identity()
        down2 = nn.Conv2d(
            in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
        )
        self.resize_layers = nn.ModuleList([up4, up2, keep, down2])

        if self.readout_type == "project":
            # One readout MLP per projection; it takes the feature and class
            # token concatenated on the last axis.
            self.readout_projects = nn.ModuleList(
                nn.Sequential(nn.Linear(2 * in_channels, in_channels), nn.GELU())
                for _ in range(len(self.projects))
            )

    def forward(self, inputs):
        """Turn a list of ``(feature, cls_token)`` pairs into per-stage feature maps."""
        assert isinstance(inputs, list)
        stage_maps = []
        for stage, pair in enumerate(inputs):
            assert len(pair) == 2
            feat, cls_token = pair[0], pair[1]
            original_shape = feat.shape
            if self.readout_type == "project":
                # Flatten the spatial axes, fuse each token with the class
                # token, then restore the original layout.
                tokens = feat.flatten(2).permute((0, 2, 1))
                readout = cls_token.unsqueeze(1).expand_as(tokens)
                tokens = self.readout_projects[stage](torch.cat((tokens, readout), -1))
                feat = tokens.permute(0, 2, 1).reshape(original_shape)
            elif self.readout_type == "add":
                feat = (feat.flatten(2) + cls_token.unsqueeze(-1)).reshape(original_shape)
            # "ignore": the class token is dropped.
            stage_maps.append(self.resize_layers[stage](self.projects[stage](feat)))
        return stage_maps
598
+
599
+
600
class PreActResidualConvUnit(nn.Module):
    """Pre-activation residual unit built from two 3x3 ``ConvModule`` layers.

    Both layers are configured with ``order=("act", "conv", "norm")`` and no
    bias; the block output is the second layer's output plus the block input.

    Args:
        in_channels (int): number of channels in the input feature map.
        act_layer (nn.Module): activation layer.
        norm_layer (nn.Module): norm layer.
        stride (int): stride of the first block. Default: 1
        dilation (int): dilation rate for convs layers. Only applied to the
            first conv (also used as its padding). Default: 1.
    """

    def __init__(self, in_channels, act_layer, norm_layer, stride=1, dilation=1):
        super().__init__()

        # Settings common to both convolutions.
        common = dict(
            norm_layer=norm_layer,
            act_layer=act_layer,
            bias=False,
            order=("act", "conv", "norm"),
        )

        self.conv1 = ConvModule(
            in_channels,
            in_channels,
            3,
            stride=stride,
            padding=dilation,
            dilation=dilation,
            **common,
        )
        self.conv2 = ConvModule(in_channels, in_channels, 3, padding=1, **common)

    def forward(self, inputs):
        """Return ``conv2(conv1(inputs)) + inputs``."""
        # Copy taken before the convs run; presumably protects the skip path
        # from an in-place activation inside ConvModule -- TODO confirm.
        skip = inputs.clone()
        return self.conv2(self.conv1(inputs)) + skip
642
+
643
+
644
class FeatureFusionBlock(nn.Module):
    """Fuse the running decoder feature with an optional skip feature.

    With two inputs, the second one is resized to the first one's spatial size
    when their shapes differ, refined by ``res_conv_unit1`` and added to the
    first. The sum (or the single input) then goes through ``res_conv_unit2``,
    a x2 bilinear upsample and a 1x1 projection.

    Args:
        in_channels (int): Input channels.
        act_layer (nn.Module): activation layer for ResidualConvUnit.
        norm_layer (nn.Module): normalization layer.
        expand (bool): Whether expand the channels in post process block.
            When True the projection outputs ``in_channels // 2`` channels.
            Default: False.
        align_corners (bool): align_corner setting for the x2 bilinear
            upsample. Default: True.
    """

    def __init__(self, in_channels, act_layer, norm_layer, expand=False, align_corners=True):
        super().__init__()

        self.in_channels = in_channels
        self.expand = expand
        self.align_corners = align_corners
        self.out_channels = in_channels // 2 if self.expand else in_channels

        self.project = ConvModule(self.in_channels, self.out_channels, kernel_size=1, act_layer=None, bias=True)

        residual_kwargs = dict(in_channels=self.in_channels, act_layer=act_layer, norm_layer=norm_layer)
        self.res_conv_unit1 = PreActResidualConvUnit(**residual_kwargs)
        self.res_conv_unit2 = PreActResidualConvUnit(**residual_kwargs)

    def forward(self, *inputs):
        """Fuse one or two feature maps and return the upsampled projection."""
        fused = inputs[0]
        if len(inputs) == 2:
            skip = inputs[1]
            if fused.shape != skip.shape:
                # Shape alignment always uses align_corners=False, independent
                # of self.align_corners.
                skip = resize(skip, size=(fused.shape[2], fused.shape[3]), mode="bilinear", align_corners=False)
            fused = fused + self.res_conv_unit1(skip)
        fused = self.res_conv_unit2(fused)
        fused = resize(fused, scale_factor=2, mode="bilinear", align_corners=self.align_corners)
        return self.project(fused)
688
+
689
+
690
class DPTHead(DepthBaseDecodeHead):
    """Vision Transformers for Dense Prediction.

    This head is implemented of `DPT <https://arxiv.org/abs/2103.13413>`_.

    The head reassembles the ViT stage outputs into feature maps, brings each
    to ``self.channels`` channels with a 3x3 conv, fuses them from the last
    stage to the first, and passes the result to ``self.depth_pred``.
    ``self.channels``, ``self.act_layer``, ``self.norm_layer`` and
    ``self.depth_pred`` are not defined in this class (presumably provided by
    ``DepthBaseDecodeHead`` -- confirm against the base class).

    Args:
        embed_dims (int): The embed dimension of the ViT backbone.
            Default: 768.
        post_process_channels (List): Out channels of post process conv
            layers. Default: [96, 192, 384, 768].
        readout_type (str): Type of readout operation. Default: 'ignore'.
        patch_size (int): The patch size. Default: 16.
        expand_channels (bool): Whether expand the channels in post process
            block. Default: False.
            NOTE(review): ``ReassembleBlocks`` is still built with the
            unexpanded ``post_process_channels``, so with
            ``expand_channels=True`` the input width of ``self.convs`` looks
            inconsistent with what the reassemble stage emits -- verify
            before enabling.
    """

    def __init__(
        self,
        embed_dims=768,
        post_process_channels=[96, 192, 384, 768],
        readout_type="ignore",
        patch_size=16,
        expand_channels=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Self-assignment kept as in the original code; it has no effect on a
        # plain attribute.
        self.in_channels = self.in_channels
        self.expand_channels = expand_channels
        self.reassemble_blocks = ReassembleBlocks(embed_dims, post_process_channels, readout_type, patch_size)

        # Integer power: channel counts must stay ints to be usable as conv
        # sizes (math.pow would turn them into floats).
        self.post_process_channels = [
            channel * 2**i if expand_channels else channel for i, channel in enumerate(post_process_channels)
        ]

        self.convs = nn.ModuleList()
        for channel in self.post_process_channels:
            self.convs.append(ConvModule(channel, self.channels, kernel_size=3, padding=1, act_layer=None, bias=False))

        self.fusion_blocks = nn.ModuleList()
        for _ in range(len(self.convs)):
            self.fusion_blocks.append(FeatureFusionBlock(self.channels, self.act_layer, self.norm_layer))
        # The first fusion block is only ever called with a single input (see
        # forward), so its skip-branch residual unit is never used.
        self.fusion_blocks[0].res_conv_unit1 = None

        self.project = ConvModule(self.channels, self.channels, kernel_size=3, padding=1, norm_layer=self.norm_layer)

        self.num_fusion_blocks = len(self.fusion_blocks)
        self.num_reassemble_blocks = len(self.reassemble_blocks.resize_layers)
        self.num_post_process_channels = len(self.post_process_channels)
        assert self.num_fusion_blocks == self.num_reassemble_blocks
        assert self.num_reassemble_blocks == self.num_post_process_channels

        self.conv_depth = HeadDepth(self.channels)

    def forward(self, inputs, img_metas):
        """Predict depth from the backbone's multi-stage outputs.

        Args:
            inputs: sequence with one entry per reassemble block; each entry
                is forwarded to ``ReassembleBlocks`` unchanged.
            img_metas: unused in this method.

        Returns:
            The output of ``self.depth_pred`` applied to the fused features.
        """
        assert len(inputs) == self.num_reassemble_blocks
        features = self.reassemble_blocks(list(inputs))
        features = [self.convs[i](feature) for i, feature in enumerate(features)]

        # Fuse from the last stage towards the first one.
        out = self.fusion_blocks[0](features[-1])
        for i in range(1, len(self.fusion_blocks)):
            out = self.fusion_blocks[i](out, features[-(i + 1)])

        out = self.project(out)
        out = self.depth_pred(out)
        return out
openlrm/models/encoders/dinov2/hub/depth/encoder_decoder.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import OrderedDict
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+ from .ops import resize
13
+
14
+
15
def add_prefix(inputs, prefix):
    """Return a copy of ``inputs`` whose keys are prefixed with ``prefix``.

    Args:
        inputs (dict): The input dict with str keys.
        prefix (str): The prefix to add; it is joined to each key with a dot.

    Returns:
        dict: A new dict mapping ``"{prefix}.{name}"`` to the original value.
    """
    return {f"{prefix}.{name}": value for name, value in inputs.items()}
32
+
33
+
34
class DepthEncoderDecoder(nn.Module):
    """Encoder Decoder depther.

    EncoderDecoder typically consists of backbone and decode_head. The
    backbone extracts features from the image and the decode head turns them
    into a depth map (``forward_test``) or a dict of losses
    (``forward_train``).
    """

    def __init__(self, backbone, decode_head):
        super().__init__()

        self.backbone = backbone
        self.decode_head = decode_head
        # Reused for every bilinear resize of the predicted depth.
        self.align_corners = self.decode_head.align_corners

    def extract_feat(self, img):
        """Extract features from images."""
        return self.backbone(img)

    def encode_decode(self, img, img_metas, rescale=True, size=None):
        """Encode images with backbone and decode into a depth estimation map.

        The prediction is clamped to the decode head's
        ``[min_depth, max_depth]`` range. When ``rescale`` is true it is
        bilinearly resized to ``size``; if ``size`` is None the target is
        ``img_metas[0]["ori_shape"][:2]``, or the input's spatial size when
        ``img_metas`` is None. When ``rescale`` is false ``size`` is ignored.
        """
        x = self.extract_feat(img)
        out = self._decode_head_forward_test(x, img_metas)
        # crop the pred depth to the certain range.
        out = torch.clamp(out, min=self.decode_head.min_depth, max=self.decode_head.max_depth)
        if rescale:
            if size is None:
                if img_metas is not None:
                    size = img_metas[0]["ori_shape"][:2]
                else:
                    size = img.shape[2:]
            out = resize(input=out, size=size, mode="bilinear", align_corners=self.align_corners)
        return out

    def _decode_head_forward_train(self, img, x, img_metas, depth_gt, **kwargs):
        """Run forward function and calculate loss for decode head in
        training. Loss names are prefixed with ``"decode."``."""
        losses = dict()
        loss_decode = self.decode_head.forward_train(img, x, img_metas, depth_gt, **kwargs)
        losses.update(add_prefix(loss_decode, "decode"))
        return losses

    def _decode_head_forward_test(self, x, img_metas):
        """Run the decode head's ``forward_test`` and return its depth
        prediction."""
        depth_pred = self.decode_head.forward_test(x, img_metas)
        return depth_pred

    def forward_dummy(self, img):
        """Dummy forward function."""
        depth = self.encode_decode(img, None)

        return depth

    def forward_train(self, img, img_metas, depth_gt, **kwargs):
        """Forward function for training.

        Args:
            img (Tensor): Input images.
            img_metas (list[dict]): List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `depth/datasets/pipelines/formatting.py:Collect`.
            depth_gt (Tensor): Depth gt
                used if the architecture supports depth estimation task.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """

        x = self.extract_feat(img)

        losses = dict()

        # the last of x saves the info from neck
        loss_decode = self._decode_head_forward_train(img, x, img_metas, depth_gt, **kwargs)

        losses.update(loss_decode)

        return losses

    def whole_inference(self, img, img_meta, rescale, size=None):
        """Inference with full image."""
        return self.encode_decode(img, img_meta, rescale, size=size)

    def slide_inference(self, img, img_meta, rescale, stride, crop_size):
        """Inference by sliding-window with overlap.

        If h_crop > h_img or w_crop > w_img, the small patch will be used to
        decode without padding.

        Args:
            img (Tensor): Input batch; unpacked as (batch, _, height, width).
            img_meta: Image info forwarded to ``encode_decode``.
            rescale (bool): Forwarded to ``encode_decode`` for every crop.
                When true, each crop prediction is resized to the crop's own
                spatial size; when false, the decode head output is used as
                is and must already have the crop's spatial size.
            stride (tuple): ``(h_stride, w_stride)`` of the sliding window.
            crop_size (tuple): ``(h_crop, w_crop)`` of the sliding window.

        Returns:
            Tensor: Per-pixel average of the overlapping crop predictions,
            with one channel and the spatial size of ``img``.
        """

        h_stride, w_stride = stride
        h_crop, w_crop = crop_size
        batch_size, _, h_img, w_img = img.size()
        h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
        w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
        preds = img.new_zeros((batch_size, 1, h_img, w_img))
        count_mat = img.new_zeros((batch_size, 1, h_img, w_img))
        for h_idx in range(h_grids):
            for w_idx in range(w_grids):
                y1 = h_idx * h_stride
                x1 = w_idx * w_stride
                y2 = min(y1 + h_crop, h_img)
                x2 = min(x1 + w_crop, w_img)
                # Shift the window back inside the image at the far border.
                y1 = max(y2 - h_crop, 0)
                x1 = max(x2 - w_crop, 0)
                crop_img = img[:, :, y1:y2, x1:x2]
                # Pin the rescale target to the crop itself: the padding below
                # only lines up when the prediction has the crop's size, which
                # is not guaranteed when rescaling to the meta's ori_shape.
                depth_pred = self.encode_decode(crop_img, img_meta, rescale, size=crop_img.shape[2:])
                preds += F.pad(depth_pred, (int(x1), int(preds.shape[3] - x2), int(y1), int(preds.shape[2] - y2)))

                count_mat[:, :, y1:y2, x1:x2] += 1
        assert (count_mat == 0).sum() == 0
        if torch.onnx.is_in_onnx_export():
            # cast count_mat to constant while exporting to ONNX
            count_mat = torch.from_numpy(count_mat.cpu().detach().numpy()).to(device=img.device)
        preds = preds / count_mat
        return preds

    def inference(self, img, img_meta, rescale, size=None, mode="whole", stride=None, crop_size=None):
        """Inference with slide/whole style.

        Args:
            img (Tensor): The input image of shape (N, 3, H, W).
            img_meta (dict): Image info dict where each dict has: 'img_shape',
                'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `depth/datasets/pipelines/formatting.py:Collect`.
            rescale (bool): Whether rescale back to original shape.
            size: Optional explicit output size, used in ``"whole"`` mode
                only.
            mode (str): ``"whole"`` or ``"slide"``. Default: "whole".
            stride (tuple | None): ``(h_stride, w_stride)``; required when
                ``mode="slide"``.
            crop_size (tuple | None): ``(h_crop, w_crop)``; required when
                ``mode="slide"``.

        Returns:
            Tensor: The output depth map.

        Raises:
            ValueError: If ``mode="slide"`` and ``stride`` or ``crop_size``
                is missing.
        """

        assert mode in ["slide", "whole"]
        ori_shape = img_meta[0]["ori_shape"]
        assert all(_["ori_shape"] == ori_shape for _ in img_meta)
        if mode == "slide":
            if stride is None or crop_size is None:
                raise ValueError("mode='slide' requires both `stride` and `crop_size`")
            depth_pred = self.slide_inference(img, img_meta, rescale, stride, crop_size)
        else:
            depth_pred = self.whole_inference(img, img_meta, rescale, size=size)
        output = depth_pred
        # Undo a test-time flip so the prediction matches the unflipped image.
        flip = img_meta[0]["flip"]
        if flip:
            flip_direction = img_meta[0]["flip_direction"]
            assert flip_direction in ["horizontal", "vertical"]
            if flip_direction == "horizontal":
                output = output.flip(dims=(3,))
            elif flip_direction == "vertical":
                output = output.flip(dims=(2,))

        return output

    def simple_test(self, img, img_meta, rescale=True):
        """Simple test with single image.

        Returns a list of per-sample numpy arrays, or a tensor with an extra
        leading dimension while exporting to ONNX.
        """
        depth_pred = self.inference(img, img_meta, rescale)
        if torch.onnx.is_in_onnx_export():
            # our inference backend only support 4D output
            depth_pred = depth_pred.unsqueeze(0)
            return depth_pred
        depth_pred = depth_pred.cpu().numpy()
        # unravel batch dim
        depth_pred = list(depth_pred)
        return depth_pred

    def aug_test(self, imgs, img_metas, rescale=True):
        """Test with augmentations.

        Only rescale=True is supported. The predictions of all augmentations
        are resized to the first prediction's spatial size and averaged.
        """
        # aug_test rescale all imgs back to ori_shape for now
        assert rescale
        # to save memory, we get augmented depth logit inplace
        depth_pred = self.inference(imgs[0], img_metas[0], rescale)
        for i in range(1, len(imgs)):
            cur_depth_pred = self.inference(imgs[i], img_metas[i], rescale, size=depth_pred.shape[-2:])
            depth_pred += cur_depth_pred
        depth_pred /= len(imgs)
        depth_pred = depth_pred.cpu().numpy()
        # unravel batch dim
        depth_pred = list(depth_pred)
        return depth_pred

    def forward_test(self, imgs, img_metas, **kwargs):
        """Validate test inputs and dispatch to simple or augmented testing.

        Args:
            imgs (List[Tensor]): the outer list indicates test-time
                augmentations and inner Tensor should have a shape NxCxHxW,
                which contains all images in the batch.
            img_metas (List[List[dict]]): the outer list indicates test-time
                augs (multiscale, flip, etc.) and the inner list indicates
                images in a batch.

        Raises:
            TypeError: If ``imgs`` or ``img_metas`` is not a list.
            ValueError: If the two lists differ in length.
        """
        for var, name in [(imgs, "imgs"), (img_metas, "img_metas")]:
            if not isinstance(var, list):
                raise TypeError(f"{name} must be a list, but got " f"{type(var)}")
        num_augs = len(imgs)
        if num_augs != len(img_metas):
            raise ValueError(f"num of augmentations ({len(imgs)}) != " f"num of image meta ({len(img_metas)})")
        # all images in the same aug batch all of the same ori_shape and pad
        # shape
        for img_meta in img_metas:
            ori_shapes = [_["ori_shape"] for _ in img_meta]
            assert all(shape == ori_shapes[0] for shape in ori_shapes)
            img_shapes = [_["img_shape"] for _ in img_meta]
            assert all(shape == img_shapes[0] for shape in img_shapes)
            pad_shapes = [_["pad_shape"] for _ in img_meta]
            assert all(shape == pad_shapes[0] for shape in pad_shapes)

        if num_augs == 1:
            return self.simple_test(imgs[0], img_metas[0], **kwargs)
        else:
            return self.aug_test(imgs, img_metas, **kwargs)

    def forward(self, img, img_metas, return_loss=True, **kwargs):
        """Calls either :func:`forward_train` or :func:`forward_test` depending
        on whether ``return_loss`` is ``True``.

        Note this setting will change the expected inputs. When
        ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
        and List[dict]), and when ``return_loss=False``, img and img_meta
        should be double nested (i.e. List[Tensor], List[List[dict]]), with
        the outer list indicating test time augmentations.
        """
        if return_loss:
            return self.forward_train(img, img_metas, **kwargs)
        else:
            return self.forward_test(img, img_metas, **kwargs)

    def train_step(self, data_batch, optimizer, **kwargs):
        """The iteration step during training.

        This method defines an iteration step during training, except for the
        back propagation and optimizer updating, which are done in an optimizer
        hook. Note that in some complicated cases or models, the whole process
        including back propagation and optimizer updating is also defined in
        this method, such as GAN.

        Args:
            data_batch (dict): The output of dataloader; it is unpacked as
                keyword arguments of :func:`forward`.
            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
                runner is passed to ``train_step()``. This argument is unused
                and reserved.

        Returns:
            dict: It contains the keys ``loss``, ``log_vars``,
                ``num_samples`` and ``log_imgs``.
                ``loss`` is a tensor for back propagation, which can be a
                weighted sum of multiple losses.
                ``log_vars`` contains all the variables to be sent to the
                logger.
                ``num_samples`` indicates the batch size (when the model is
                DDP, it means the batch size on each GPU), which is used for
                averaging the logs.
                ``log_imgs`` holds the forward outputs whose key contains
                ``"img"``; they are excluded from the loss computation.
        """
        losses = self(**data_batch)

        # split losses and images
        real_losses = {}
        log_imgs = {}
        for k, v in losses.items():
            if "img" in k:
                log_imgs[k] = v
            else:
                real_losses[k] = v

        loss, log_vars = self._parse_losses(real_losses)

        outputs = dict(loss=loss, log_vars=log_vars, num_samples=len(data_batch["img_metas"]), log_imgs=log_imgs)

        return outputs

    def val_step(self, data_batch, **kwargs):
        """The iteration step during validation.

        This method shares the same signature as :func:`train_step`, but used
        during val epochs. Note that the evaluation after training epochs is
        not implemented with this method, but an evaluation hook.
        """
        output = self(**data_batch, **kwargs)
        return output

    @staticmethod
    def _parse_losses(losses):
        """Parse the raw outputs (losses) of the network.

        Args:
            losses (dict): Raw output of the network, which usually contain
                losses and other necessary information. Every value must be a
                tensor or a list of tensors.

        Returns:
            tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor
                which is the sum of all entries whose key contains "loss",
                log_vars contains all the variables to be sent to the logger
                as Python scalars (averaged over processes when distributed
                training is initialized).

        Raises:
            TypeError: If a value is neither a tensor nor a list of tensors.
        """
        import torch.distributed as dist

        log_vars = OrderedDict()
        for loss_name, loss_value in losses.items():
            if isinstance(loss_value, torch.Tensor):
                log_vars[loss_name] = loss_value.mean()
            elif isinstance(loss_value, list):
                log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
            else:
                raise TypeError(f"{loss_name} is not a tensor or list of tensors")

        loss = sum(_value for _key, _value in log_vars.items() if "loss" in _key)

        log_vars["loss"] = loss
        for loss_name, loss_value in log_vars.items():
            # reduce loss when distributed training
            if dist.is_available() and dist.is_initialized():
                loss_value = loss_value.data.clone()
                dist.all_reduce(loss_value.div_(dist.get_world_size()))
            log_vars[loss_name] = loss_value.item()

        return loss, log_vars