soonyau committed
Commit 42b0b31 (1 parent: 5fed475)

first commit

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. LICENSE +427 -0
  2. README.md +38 -0
  3. annotator/openpose/__init__.py +73 -0
  4. annotator/openpose/body.py +219 -0
  5. annotator/openpose/hand.py +86 -0
  6. annotator/openpose/model.py +219 -0
  7. annotator/openpose/util.py +163 -0
  8. annotator/segm/__init__.py +162 -0
  9. annotator/segm/modules/__init__.py +5 -0
  10. annotator/segm/modules/bn.py +132 -0
  11. annotator/segm/modules/deeplab.py +84 -0
  12. annotator/segm/modules/dense.py +42 -0
  13. annotator/segm/modules/functions.py +244 -0
  14. annotator/segm/modules/misc.py +21 -0
  15. annotator/segm/modules/residual.py +182 -0
  16. annotator/segm/modules/src/checks.h +15 -0
  17. annotator/segm/modules/src/inplace_abn.cpp +95 -0
  18. annotator/segm/modules/src/inplace_abn.h +88 -0
  19. annotator/segm/modules/src/inplace_abn_cpu.cpp +119 -0
  20. annotator/segm/modules/src/inplace_abn_cuda.cu +333 -0
  21. annotator/segm/modules/src/inplace_abn_cuda_half.cu +275 -0
  22. annotator/segm/modules/src/utils/checks.h +15 -0
  23. annotator/segm/modules/src/utils/common.h +49 -0
  24. annotator/segm/modules/src/utils/cuda.cuh +71 -0
  25. annotator/segm/networks/AugmentCE2P.py +337 -0
  26. annotator/segm/networks/__init__.py +13 -0
  27. annotator/segm/networks/backbone/mobilenetv2.py +156 -0
  28. annotator/segm/networks/backbone/resnet.py +205 -0
  29. annotator/segm/networks/backbone/resnext.py +149 -0
  30. annotator/segm/networks/context_encoding/aspp.py +64 -0
  31. annotator/segm/networks/context_encoding/ocnet.py +226 -0
  32. annotator/segm/networks/context_encoding/psp.py +48 -0
  33. annotator/segm/transforms.py +167 -0
  34. annotator/util.py +49 -0
  35. app.py +475 -0
  36. app_files/default_images/mask.png +0 -0
  37. app_files/default_images/pose.png +0 -0
  38. app_files/default_images/ref.png +0 -0
  39. app_files/samples/pose/MEN/full_1.png +0 -0
  40. app_files/samples/pose/MEN/full_2.png +0 -0
  41. app_files/samples/pose/MEN/half_back.png +0 -0
  42. app_files/samples/pose/MEN/half_front.png +0 -0
  43. app_files/samples/pose/MEN/half_left.png +0 -0
  44. app_files/samples/pose/WOMEN/pose_0.png +0 -0
  45. app_files/samples/pose/WOMEN/pose_1.png +0 -0
  46. app_files/samples/pose/WOMEN/pose_2.png +0 -0
  47. app_files/samples/pose/WOMEN/pose_3.png +0 -0
  48. app_files/samples/pose/WOMEN/pose_4.png +0 -0
  49. app_files/samples/pose/WOMEN/pose_5.png +0 -0
  50. app_files/samples/pose/WOMEN/pose_6.png +0 -0
LICENSE ADDED
@@ -0,0 +1,427 @@
1
+ CC BY-SA 4.0 DEED
2
+ Attribution-ShareAlike 4.0 International
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-ShareAlike 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-ShareAlike 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. Share means to provide material to the public by any means or
126
+ process that requires permission under the Licensed Rights, such
127
+ as reproduction, public display, public performance, distribution,
128
+ dissemination, communication, or importation, and to make material
129
+ available to the public including in ways that members of the
130
+ public may access the material from a place and at a time
131
+ individually chosen by them.
132
+
133
+ l. Sui Generis Database Rights means rights other than copyright
134
+ resulting from Directive 96/9/EC of the European Parliament and of
135
+ the Council of 11 March 1996 on the legal protection of databases,
136
+ as amended and/or succeeded, as well as other essentially
137
+ equivalent rights anywhere in the world.
138
+
139
+ m. You means the individual or entity exercising the Licensed Rights
140
+ under this Public License. Your has a corresponding meaning.
141
+
142
+
143
+ Section 2 -- Scope.
144
+
145
+ a. License grant.
146
+
147
+ 1. Subject to the terms and conditions of this Public License,
148
+ the Licensor hereby grants You a worldwide, royalty-free,
149
+ non-sublicensable, non-exclusive, irrevocable license to
150
+ exercise the Licensed Rights in the Licensed Material to:
151
+
152
+ a. reproduce and Share the Licensed Material, in whole or
153
+ in part; and
154
+
155
+ b. produce, reproduce, and Share Adapted Material.
156
+
157
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
158
+ Exceptions and Limitations apply to Your use, this Public
159
+ License does not apply, and You do not need to comply with
160
+ its terms and conditions.
161
+
162
+ 3. Term. The term of this Public License is specified in Section
163
+ 6(a).
164
+
165
+ 4. Media and formats; technical modifications allowed. The
166
+ Licensor authorizes You to exercise the Licensed Rights in
167
+ all media and formats whether now known or hereafter created,
168
+ and to make technical modifications necessary to do so. The
169
+ Licensor waives and/or agrees not to assert any right or
170
+ authority to forbid You from making technical modifications
171
+ necessary to exercise the Licensed Rights, including
172
+ technical modifications necessary to circumvent Effective
173
+ Technological Measures. For purposes of this Public License,
174
+ simply making modifications authorized by this Section 2(a)
175
+ (4) never produces Adapted Material.
176
+
177
+ 5. Downstream recipients.
178
+
179
+ a. Offer from the Licensor -- Licensed Material. Every
180
+ recipient of the Licensed Material automatically
181
+ receives an offer from the Licensor to exercise the
182
+ Licensed Rights under the terms and conditions of this
183
+ Public License.
184
+
185
+ b. Additional offer from the Licensor -- Adapted Material.
186
+ Every recipient of Adapted Material from You
187
+ automatically receives an offer from the Licensor to
188
+ exercise the Licensed Rights in the Adapted Material
189
+ under the conditions of the Adapter's License You apply.
190
+
191
+ c. No downstream restrictions. You may not offer or impose
192
+ any additional or different terms or conditions on, or
193
+ apply any Effective Technological Measures to, the
194
+ Licensed Material if doing so restricts exercise of the
195
+ Licensed Rights by any recipient of the Licensed
196
+ Material.
197
+
198
+ 6. No endorsement. Nothing in this Public License constitutes or
199
+ may be construed as permission to assert or imply that You
200
+ are, or that Your use of the Licensed Material is, connected
201
+ with, or sponsored, endorsed, or granted official status by,
202
+ the Licensor or others designated to receive attribution as
203
+ provided in Section 3(a)(1)(A)(i).
204
+
205
+ b. Other rights.
206
+
207
+ 1. Moral rights, such as the right of integrity, are not
208
+ licensed under this Public License, nor are publicity,
209
+ privacy, and/or other similar personality rights; however, to
210
+ the extent possible, the Licensor waives and/or agrees not to
211
+ assert any such rights held by the Licensor to the limited
212
+ extent necessary to allow You to exercise the Licensed
213
+ Rights, but not otherwise.
214
+
215
+ 2. Patent and trademark rights are not licensed under this
216
+ Public License.
217
+
218
+ 3. To the extent possible, the Licensor waives any right to
219
+ collect royalties from You for the exercise of the Licensed
220
+ Rights, whether directly or through a collecting society
221
+ under any voluntary or waivable statutory or compulsory
222
+ licensing scheme. In all other cases the Licensor expressly
223
+ reserves any right to collect such royalties.
224
+
225
+
226
+ Section 3 -- License Conditions.
227
+
228
+ Your exercise of the Licensed Rights is expressly made subject to the
229
+ following conditions.
230
+
231
+ a. Attribution.
232
+
233
+ 1. If You Share the Licensed Material (including in modified
234
+ form), You must:
235
+
236
+ a. retain the following if it is supplied by the Licensor
237
+ with the Licensed Material:
238
+
239
+ i. identification of the creator(s) of the Licensed
240
+ Material and any others designated to receive
241
+ attribution, in any reasonable manner requested by
242
+ the Licensor (including by pseudonym if
243
+ designated);
244
+
245
+ ii. a copyright notice;
246
+
247
+ iii. a notice that refers to this Public License;
248
+
249
+ iv. a notice that refers to the disclaimer of
250
+ warranties;
251
+
252
+ v. a URI or hyperlink to the Licensed Material to the
253
+ extent reasonably practicable;
254
+
255
+ b. indicate if You modified the Licensed Material and
256
+ retain an indication of any previous modifications; and
257
+
258
+ c. indicate the Licensed Material is licensed under this
259
+ Public License, and include the text of, or the URI or
260
+ hyperlink to, this Public License.
261
+
262
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
263
+ reasonable manner based on the medium, means, and context in
264
+ which You Share the Licensed Material. For example, it may be
265
+ reasonable to satisfy the conditions by providing a URI or
266
+ hyperlink to a resource that includes the required
267
+ information.
268
+
269
+ 3. If requested by the Licensor, You must remove any of the
270
+ information required by Section 3(a)(1)(A) to the extent
271
+ reasonably practicable.
272
+
273
+ b. ShareAlike.
274
+
275
+ In addition to the conditions in Section 3(a), if You Share
276
+ Adapted Material You produce, the following conditions also apply.
277
+
278
+ 1. The Adapter's License You apply must be a Creative Commons
279
+ license with the same License Elements, this version or
280
+ later, or a BY-SA Compatible License.
281
+
282
+ 2. You must include the text of, or the URI or hyperlink to, the
283
+ Adapter's License You apply. You may satisfy this condition
284
+ in any reasonable manner based on the medium, means, and
285
+ context in which You Share Adapted Material.
286
+
287
+ 3. You may not offer or impose any additional or different terms
288
+ or conditions on, or apply any Effective Technological
289
+ Measures to, Adapted Material that restrict exercise of the
290
+ rights granted under the Adapter's License You apply.
291
+
292
+
293
+ Section 4 -- Sui Generis Database Rights.
294
+
295
+ Where the Licensed Rights include Sui Generis Database Rights that
296
+ apply to Your use of the Licensed Material:
297
+
298
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
299
+ to extract, reuse, reproduce, and Share all or a substantial
300
+ portion of the contents of the database;
301
+
302
+ b. if You include all or a substantial portion of the database
303
+ contents in a database in which You have Sui Generis Database
304
+ Rights, then the database in which You have Sui Generis Database
305
+ Rights (but not its individual contents) is Adapted Material,
306
+ including for purposes of Section 3(b); and
307
+
308
+ c. You must comply with the conditions in Section 3(a) if You Share
309
+ all or a substantial portion of the contents of the database.
310
+
311
+ For the avoidance of doubt, this Section 4 supplements and does not
312
+ replace Your obligations under this Public License where the Licensed
313
+ Rights include other Copyright and Similar Rights.
314
+
315
+
316
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
317
+
318
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
319
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
320
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
321
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
322
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
323
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
324
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
325
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
326
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
327
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
328
+
329
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
330
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
331
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
332
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
333
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
334
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
335
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
336
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
337
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
338
+
339
+ c. The disclaimer of warranties and limitation of liability provided
340
+ above shall be interpreted in a manner that, to the extent
341
+ possible, most closely approximates an absolute disclaimer and
342
+ waiver of all liability.
343
+
344
+
345
+ Section 6 -- Term and Termination.
346
+
347
+ a. This Public License applies for the term of the Copyright and
348
+ Similar Rights licensed here. However, if You fail to comply with
349
+ this Public License, then Your rights under this Public License
350
+ terminate automatically.
351
+
352
+ b. Where Your right to use the Licensed Material has terminated under
353
+ Section 6(a), it reinstates:
354
+
355
+ 1. automatically as of the date the violation is cured, provided
356
+ it is cured within 30 days of Your discovery of the
357
+ violation; or
358
+
359
+ 2. upon express reinstatement by the Licensor.
360
+
361
+ For the avoidance of doubt, this Section 6(b) does not affect any
362
+ right the Licensor may have to seek remedies for Your violations
363
+ of this Public License.
364
+
365
+ c. For the avoidance of doubt, the Licensor may also offer the
366
+ Licensed Material under separate terms or conditions or stop
367
+ distributing the Licensed Material at any time; however, doing so
368
+ will not terminate this Public License.
369
+
370
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
371
+ License.
372
+
373
+
374
+ Section 7 -- Other Terms and Conditions.
375
+
376
+ a. The Licensor shall not be bound by any additional or different
377
+ terms or conditions communicated by You unless expressly agreed.
378
+
379
+ b. Any arrangements, understandings, or agreements regarding the
380
+ Licensed Material not stated herein are separate from and
381
+ independent of the terms and conditions of this Public License.
382
+
383
+
384
+ Section 8 -- Interpretation.
385
+
386
+ a. For the avoidance of doubt, this Public License does not, and
387
+ shall not be interpreted to, reduce, limit, restrict, or impose
388
+ conditions on any use of the Licensed Material that could lawfully
389
+ be made without permission under this Public License.
390
+
391
+ b. To the extent possible, if any provision of this Public License is
392
+ deemed unenforceable, it shall be automatically reformed to the
393
+ minimum extent necessary to make it enforceable. If the provision
394
+ cannot be reformed, it shall be severed from this Public License
395
+ without affecting the enforceability of the remaining terms and
396
+ conditions.
397
+
398
+ c. No term or condition of this Public License will be waived and no
399
+ failure to comply consented to unless expressly agreed to by the
400
+ Licensor.
401
+
402
+ d. Nothing in this Public License constitutes or may be interpreted
403
+ as a limitation upon, or waiver of, any privileges and immunities
404
+ that apply to the Licensor or You, including from the legal
405
+ processes of any jurisdiction or authority.
406
+
407
+
408
+ =======================================================================
409
+
410
+ Creative Commons is not a party to its public
411
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
412
+ its public licenses to material it publishes and in those instances
413
+ will be considered the “Licensor.” The text of the Creative Commons
414
+ public licenses is dedicated to the public domain under the CC0 Public
415
+ Domain Dedication. Except for the limited purpose of indicating that
416
+ material is shared under a Creative Commons public license or as
417
+ otherwise permitted by the Creative Commons policies published at
418
+ creativecommons.org/policies, Creative Commons does not authorize the
419
+ use of the trademark "Creative Commons" or any other trademark or logo
420
+ of Creative Commons without its prior written consent including,
421
+ without limitation, in connection with any unauthorized modifications
422
+ to any of its public licenses or any other arrangements,
423
+ understandings, or agreements concerning use of licensed material. For
424
+ the avoidance of doubt, this paragraph does not form part of the
425
+ public licenses.
426
+
427
+ Creative Commons may be contacted at creativecommons.org.
README.md ADDED
@@ -0,0 +1,38 @@
1
+ ## *ViscoNet*: Bridging and Harmonizing Visual and Textual Conditioning for ControlNet
2
+ [Soon Yau Cheong](https://scholar.google.com/citations?user=dRot7GUAAAAJ&hl=en)
3
+ [Armin Mustafa](https://scholar.google.com/citations?user=0xOHqkMAAAAJ&hl=en)
4
+ [Andrew Gilbert](https://scholar.google.com/citations?user=NNhnVwoAAAAJ&hl=en)
5
+
6
+
7
+ <a href='https://soon-yau.github.io/visconet/'><img src='https://img.shields.io/badge/Project-Page-Green'></a>
8
+ <a href='https://arxiv.org/abs/2312.03154'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
9
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/3_6Zq3hk86Q)
10
+
11
+ https://github.com/soon-yau/visconet/assets/19167278/ae58b7ab-fa76-4253-8a10-46656f234b20
12
+
13
+ ### Requirements
14
+ A suitable [conda](https://conda.io/) environment named `control` can be created
15
+ and activated with:
16
+ ```
17
+ conda env create -f environment.yaml
18
+ conda activate control
19
+ ```
20
+ ### Files
21
+ All model and data files are available [here](https://huggingface.co/soonyau/visconet/tree/main),
22
+ including eval.zip, which contains all the images used in the human evaluation.
23
+
24
+ ### Gradio App
25
+ [![App](./assets/app.png)](https://youtu.be/3_6Zq3hk86Q)
26
+ 1. Download *visconet_v1.pth* and *exp-schp-201908301523-atr.pth* into the `./models` directory.
27
+ 2. (Optional) Download fashion.zip and unzip it to the home directory.
28
+ 3. Run `python gradio_visconet.py`.
29
+
30
+ ### Citation
31
+ ```
32
+ @article{cheong2023visconet,
33
+ author = {Cheong, Soon Yau and Mustafa, Armin and Gilbert, Andrew},
34
+ title = {ViscoNet: Bridging and Harmonizing Visual and Textual Conditioning for ControlNet},
35
+ journal = {Arxiv Preprint 2312.03154},
36
+ month = {December},
37
+ year = {2023}}
38
+ ```
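The checkpoint download in step 1 of the Gradio App section can also be scripted. A minimal sketch (not part of this commit), assuming the files live in the `soonyau/visconet` repository linked under Files and that `huggingface_hub` (already imported by `annotator/segm/__init__.py`) is installed:

```
from huggingface_hub import hf_hub_download

# Fetch the two checkpoints named in step 1 into ./models (repo id assumed from the Files link).
for fname in ["visconet_v1.pth", "exp-schp-201908301523-atr.pth"]:
    hf_hub_download(repo_id="soonyau/visconet", filename=fname, local_dir="./models")
```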
annotator/openpose/__init__.py ADDED
@@ -0,0 +1,73 @@
1
+ # Openpose
2
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
3
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
4
+ # 3rd Edited by ControlNet
5
+
6
+ import os
7
+ os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
8
+
9
+ import torch
10
+ import numpy as np
11
+ from . import util
12
+ from .body import Body
13
+ from .hand import Hand
14
+ from annotator.util import annotator_ckpts_path
15
+
16
+
17
+ body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth"
18
+ hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth"
19
+
20
+
21
+ class OpenposeDetector:
22
+ def __init__(self):
23
+ body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth")
24
+ hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth")
25
+
26
+ if not os.path.exists(hand_modelpath):
27
+ from basicsr.utils.download_util import load_file_from_url
28
+ load_file_from_url(body_model_path, model_dir=annotator_ckpts_path)
29
+ load_file_from_url(hand_model_path, model_dir=annotator_ckpts_path)
30
+
31
+ self.body_estimation = Body(body_modelpath)
32
+ self.hand_estimation = Hand(hand_modelpath)
33
+
34
+ def __call__(self, oriImg, hand=False):
35
+ oriImg = oriImg[:, :, ::-1].copy()
36
+ with torch.no_grad():
37
+ candidate, subset = self.body_estimation(oriImg)
38
+ canvas = np.zeros_like(oriImg)
39
+ canvas = util.draw_bodypose(canvas, candidate, subset)
40
+ if hand:
41
+ hands_list = util.handDetect(candidate, subset, oriImg)
42
+ all_hand_peaks = []
43
+ for x, y, w, is_left in hands_list:
44
+ peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :])
45
+ peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
46
+ peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
47
+ all_hand_peaks.append(peaks)
48
+ canvas = util.draw_handpose(canvas, all_hand_peaks)
49
+ return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
50
+
51
+
52
+ class VisconetDetector(OpenposeDetector):
53
+ def __init__(self):
54
+ super().__init__()
55
+
56
+ def __call__(self, oriImg):
57
+ oriImg = oriImg[:, :, ::-1].copy()
58
+ with torch.no_grad():
59
+ candidate, subset = self.body_estimation(oriImg)
60
+ canvas = util.draw_bodypose(np.zeros_like(oriImg), candidate, subset, stickwidth=1, circlewidth=2)
61
+ # detect hand
62
+ hands_list = util.handDetect(candidate, subset, oriImg)
63
+
64
+ all_hand_peaks = []
65
+ for x, y, w, is_left in hands_list:
66
+
67
+ peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :])
68
+ peaks[:, 0] = np.where(peaks[:, 0]==0, peaks[:, 0], peaks[:, 0]+x)
69
+ peaks[:, 1] = np.where(peaks[:, 1]==0, peaks[:, 1], peaks[:, 1]+y)
70
+ all_hand_peaks.append(peaks)
71
+
72
+ canvas = util.draw_handpose(canvas, all_hand_peaks,stickwidth=1)
73
+ return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
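A minimal usage sketch for the detectors above (not part of the commit); it assumes an RGB input image `person.jpg` and that the body/hand checkpoints can be downloaded into `annotator/ckpts` on first use:

```
import cv2
from annotator.openpose import OpenposeDetector

detector = OpenposeDetector()                     # fetches the checkpoints if missing
img = cv2.cvtColor(cv2.imread("person.jpg"), cv2.COLOR_BGR2RGB)
canvas, pose = detector(img, hand=True)           # __call__ flips channels back to BGR internally
# canvas: black image with the skeleton drawn on it
# pose:   {'candidate': [[x, y, score, id], ...], 'subset': [...]}
cv2.imwrite("pose.png", canvas)
```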
annotator/openpose/body.py ADDED
@@ -0,0 +1,219 @@
1
+ import cv2
2
+ import numpy as np
3
+ import math
4
+ import time
5
+ from scipy.ndimage.filters import gaussian_filter
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib
8
+ import torch
9
+ from torchvision import transforms
10
+
11
+ from . import util
12
+ from .model import bodypose_model
13
+
14
+ class Body(object):
15
+ def __init__(self, model_path):
16
+ self.model = bodypose_model()
17
+ if torch.cuda.is_available():
18
+ self.model = self.model.cuda()
19
+ print('cuda')
20
+ model_dict = util.transfer(self.model, torch.load(model_path))
21
+ self.model.load_state_dict(model_dict)
22
+ self.model.eval()
23
+
24
+ def __call__(self, oriImg):
25
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
26
+ scale_search = [0.5]
27
+ boxsize = 368
28
+ stride = 8
29
+ padValue = 128
30
+ thre1 = 0.1
31
+ thre2 = 0.05
32
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
33
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
34
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
35
+
36
+ for m in range(len(multiplier)):
37
+ scale = multiplier[m]
38
+ imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
39
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
40
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
41
+ im = np.ascontiguousarray(im)
42
+
43
+ data = torch.from_numpy(im).float()
44
+ if torch.cuda.is_available():
45
+ data = data.cuda()
46
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
47
+ with torch.no_grad():
48
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
49
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
50
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
51
+
52
+ # extract outputs, resize, and remove padding
53
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
54
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
55
+ heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
56
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
57
+ heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
58
+
59
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
60
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
61
+ paf = cv2.resize(paf, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
62
+ paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
63
+ paf = cv2.resize(paf, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
64
+
65
+ heatmap_avg += heatmap_avg + heatmap / len(multiplier)
66
+ paf_avg += + paf / len(multiplier)
67
+
68
+ all_peaks = []
69
+ peak_counter = 0
70
+
71
+ for part in range(18):
72
+ map_ori = heatmap_avg[:, :, part]
73
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
74
+
75
+ map_left = np.zeros(one_heatmap.shape)
76
+ map_left[1:, :] = one_heatmap[:-1, :]
77
+ map_right = np.zeros(one_heatmap.shape)
78
+ map_right[:-1, :] = one_heatmap[1:, :]
79
+ map_up = np.zeros(one_heatmap.shape)
80
+ map_up[:, 1:] = one_heatmap[:, :-1]
81
+ map_down = np.zeros(one_heatmap.shape)
82
+ map_down[:, :-1] = one_heatmap[:, 1:]
83
+
84
+ peaks_binary = np.logical_and.reduce(
85
+ (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
86
+ peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
87
+ peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
88
+ peak_id = range(peak_counter, peak_counter + len(peaks))
89
+ peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
90
+
91
+ all_peaks.append(peaks_with_score_and_id)
92
+ peak_counter += len(peaks)
93
+
94
+ # find connection in the specified sequence, center 29 is in the position 15
95
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
96
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
97
+ [1, 16], [16, 18], [3, 17], [6, 18]]
98
+ # the middle joints heatmap correspondence
99
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
100
+ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
101
+ [55, 56], [37, 38], [45, 46]]
102
+
103
+ connection_all = []
104
+ special_k = []
105
+ mid_num = 10
106
+
107
+ for k in range(len(mapIdx)):
108
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
109
+ candA = all_peaks[limbSeq[k][0] - 1]
110
+ candB = all_peaks[limbSeq[k][1] - 1]
111
+ nA = len(candA)
112
+ nB = len(candB)
113
+ indexA, indexB = limbSeq[k]
114
+ if (nA != 0 and nB != 0):
115
+ connection_candidate = []
116
+ for i in range(nA):
117
+ for j in range(nB):
118
+ vec = np.subtract(candB[j][:2], candA[i][:2])
119
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
120
+ norm = max(0.001, norm)
121
+ vec = np.divide(vec, norm)
122
+
123
+ startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
124
+ np.linspace(candA[i][1], candB[j][1], num=mid_num)))
125
+
126
+ vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
127
+ for I in range(len(startend))])
128
+ vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
129
+ for I in range(len(startend))])
130
+
131
+ score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
132
+ score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
133
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
134
+ criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
135
+ criterion2 = score_with_dist_prior > 0
136
+ if criterion1 and criterion2:
137
+ connection_candidate.append(
138
+ [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
139
+
140
+ connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
141
+ connection = np.zeros((0, 5))
142
+ for c in range(len(connection_candidate)):
143
+ i, j, s = connection_candidate[c][0:3]
144
+ if (i not in connection[:, 3] and j not in connection[:, 4]):
145
+ connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
146
+ if (len(connection) >= min(nA, nB)):
147
+ break
148
+
149
+ connection_all.append(connection)
150
+ else:
151
+ special_k.append(k)
152
+ connection_all.append([])
153
+
154
+ # last number in each row is the total parts number of that person
155
+ # the second last number in each row is the score of the overall configuration
156
+ subset = -1 * np.ones((0, 20))
157
+ candidate = np.array([item for sublist in all_peaks for item in sublist])
158
+
159
+ for k in range(len(mapIdx)):
160
+ if k not in special_k:
161
+ partAs = connection_all[k][:, 0]
162
+ partBs = connection_all[k][:, 1]
163
+ indexA, indexB = np.array(limbSeq[k]) - 1
164
+
165
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
166
+ found = 0
167
+ subset_idx = [-1, -1]
168
+ for j in range(len(subset)): # 1:size(subset,1):
169
+ if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
170
+ subset_idx[found] = j
171
+ found += 1
172
+
173
+ if found == 1:
174
+ j = subset_idx[0]
175
+ if subset[j][indexB] != partBs[i]:
176
+ subset[j][indexB] = partBs[i]
177
+ subset[j][-1] += 1
178
+ subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
179
+ elif found == 2: # if found 2 and disjoint, merge them
180
+ j1, j2 = subset_idx
181
+ membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
182
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
183
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
184
+ subset[j1][-2:] += subset[j2][-2:]
185
+ subset[j1][-2] += connection_all[k][i][2]
186
+ subset = np.delete(subset, j2, 0)
187
+ else: # as like found == 1
188
+ subset[j1][indexB] = partBs[i]
189
+ subset[j1][-1] += 1
190
+ subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
191
+
192
+ # if find no partA in the subset, create a new subset
193
+ elif not found and k < 17:
194
+ row = -1 * np.ones(20)
195
+ row[indexA] = partAs[i]
196
+ row[indexB] = partBs[i]
197
+ row[-1] = 2
198
+ row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
199
+ subset = np.vstack([subset, row])
200
+ # delete rows of subset that have too few detected parts
201
+ deleteIdx = []
202
+ for i in range(len(subset)):
203
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
204
+ deleteIdx.append(i)
205
+ subset = np.delete(subset, deleteIdx, axis=0)
206
+
207
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
208
+ # candidate: x, y, score, id
209
+ return candidate, subset
210
+
211
+ if __name__ == "__main__":
212
+ body_estimation = Body('../model/body_pose_model.pth')
213
+
214
+ test_image = '../images/ski.jpg'
215
+ oriImg = cv2.imread(test_image) # B,G,R order
216
+ candidate, subset = body_estimation(oriImg)
217
+ canvas = util.draw_bodypose(oriImg, candidate, subset)
218
+ plt.imshow(canvas[:, :, [2, 1, 0]])
219
+ plt.show()
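As the comments at the end of `__call__` note, `subset` is an n x 20 array whose first 18 columns index into `candidate` (rows of x, y, score, id), with -1 marking missing joints. A small sketch, not part of the commit, that unpacks the pair into per-person keypoints:

```
def person_keypoints(candidate, subset):
    """Return one dict per detected person, mapping part index (0-17) to (x, y, score)."""
    people = []
    for person in subset:
        parts = {}
        for part in range(18):          # columns 0-17 hold row indices into candidate
            idx = int(person[part])
            if idx == -1:               # joint not detected for this person
                continue
            x, y, score = candidate[idx][:3]
            parts[part] = (float(x), float(y), float(score))
        people.append(parts)
    return people
```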
annotator/openpose/hand.py ADDED
@@ -0,0 +1,86 @@
1
+ import cv2
2
+ import json
3
+ import numpy as np
4
+ import math
5
+ import time
6
+ from scipy.ndimage.filters import gaussian_filter
7
+ import matplotlib.pyplot as plt
8
+ import matplotlib
9
+ import torch
10
+ from skimage.measure import label
11
+
12
+ from .model import handpose_model
13
+ from . import util
14
+
15
+ class Hand(object):
16
+ def __init__(self, model_path):
17
+ self.model = handpose_model()
18
+ if torch.cuda.is_available():
19
+ self.model = self.model.cuda()
20
+ print('cuda')
21
+ model_dict = util.transfer(self.model, torch.load(model_path))
22
+ self.model.load_state_dict(model_dict)
23
+ self.model.eval()
24
+
25
+ def __call__(self, oriImg):
26
+ scale_search = [0.5, 1.0, 1.5, 2.0]
27
+ # scale_search = [0.5]
28
+ boxsize = 368
29
+ stride = 8
30
+ padValue = 128
31
+ thre = 0.05
32
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
33
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22))
34
+ # paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
35
+
36
+ for m in range(len(multiplier)):
37
+ scale = multiplier[m]
38
+ imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
39
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
40
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
41
+ im = np.ascontiguousarray(im)
42
+
43
+ data = torch.from_numpy(im).float()
44
+ if torch.cuda.is_available():
45
+ data = data.cuda()
46
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
47
+ with torch.no_grad():
48
+ output = self.model(data).cpu().numpy()
49
+ # output = self.model(data).numpy()
50
+
51
+ # extract outputs, resize, and remove padding
52
+ heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps
53
+ heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
54
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
55
+ heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
56
+
57
+ heatmap_avg += heatmap / len(multiplier)
58
+
59
+ all_peaks = []
60
+ for part in range(21):
61
+ map_ori = heatmap_avg[:, :, part]
62
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
63
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
64
+ # all peaks are below the threshold
65
+ if np.sum(binary) == 0:
66
+ all_peaks.append([0, 0])
67
+ continue
68
+ label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
69
+ max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
70
+ label_img[label_img != max_index] = 0
71
+ map_ori[label_img == 0] = 0
72
+
73
+ y, x = util.npmax(map_ori)
74
+ all_peaks.append([x, y])
75
+ return np.array(all_peaks)
76
+
77
+ if __name__ == "__main__":
78
+ hand_estimation = Hand('../model/hand_pose_model.pth')
79
+
80
+ # test_image = '../images/hand.jpg'
81
+ test_image = '../images/hand.jpg'
82
+ oriImg = cv2.imread(test_image) # B,G,R order
83
+ peaks = hand_estimation(oriImg)
84
+ canvas = util.draw_handpose(oriImg, peaks, True)
85
+ cv2.imshow('', canvas)
86
+ cv2.waitKey(0)
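A short sketch (not part of the commit) of the output convention of `Hand.__call__`: a (21, 2) array of (x, y) peaks in the coordinates of the crop, where [0, 0] marks joints whose heatmap never cleared the threshold. The paths follow the `__main__` block above:

```
import cv2
from annotator.openpose.hand import Hand

hand_estimation = Hand('../model/hand_pose_model.pth')
crop = cv2.imread('../images/hand.jpg')        # square B,G,R patch around one hand
peaks = hand_estimation(crop)                  # shape (21, 2): one (x, y) per finger joint
detected = peaks[(peaks != 0).any(axis=1)]     # drop the [0, 0] placeholders
print(len(detected), "of 21 joints detected")
```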
annotator/openpose/model.py ADDED
@@ -0,0 +1,219 @@
1
+ import torch
2
+ from collections import OrderedDict
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ def make_layers(block, no_relu_layers):
8
+ layers = []
9
+ for layer_name, v in block.items():
10
+ if 'pool' in layer_name:
11
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
12
+ padding=v[2])
13
+ layers.append((layer_name, layer))
14
+ else:
15
+ conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
16
+ kernel_size=v[2], stride=v[3],
17
+ padding=v[4])
18
+ layers.append((layer_name, conv2d))
19
+ if layer_name not in no_relu_layers:
20
+ layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
21
+
22
+ return nn.Sequential(OrderedDict(layers))
23
+
24
+ class bodypose_model(nn.Module):
25
+ def __init__(self):
26
+ super(bodypose_model, self).__init__()
27
+
28
+ # these layers have no relu layer
29
+ no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
30
+ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
31
+ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
32
+ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1']
33
+ blocks = {}
34
+ block0 = OrderedDict([
35
+ ('conv1_1', [3, 64, 3, 1, 1]),
36
+ ('conv1_2', [64, 64, 3, 1, 1]),
37
+ ('pool1_stage1', [2, 2, 0]),
38
+ ('conv2_1', [64, 128, 3, 1, 1]),
39
+ ('conv2_2', [128, 128, 3, 1, 1]),
40
+ ('pool2_stage1', [2, 2, 0]),
41
+ ('conv3_1', [128, 256, 3, 1, 1]),
42
+ ('conv3_2', [256, 256, 3, 1, 1]),
43
+ ('conv3_3', [256, 256, 3, 1, 1]),
44
+ ('conv3_4', [256, 256, 3, 1, 1]),
45
+ ('pool3_stage1', [2, 2, 0]),
46
+ ('conv4_1', [256, 512, 3, 1, 1]),
47
+ ('conv4_2', [512, 512, 3, 1, 1]),
48
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
49
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])
50
+ ])
51
+
52
+
53
+ # Stage 1
54
+ block1_1 = OrderedDict([
55
+ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
56
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
57
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
58
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
59
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
60
+ ])
61
+
62
+ block1_2 = OrderedDict([
63
+ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
64
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
65
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
66
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
67
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
68
+ ])
69
+ blocks['block1_1'] = block1_1
70
+ blocks['block1_2'] = block1_2
71
+
72
+ self.model0 = make_layers(block0, no_relu_layers)
73
+
74
+ # Stages 2 - 6
75
+ for i in range(2, 7):
76
+ blocks['block%d_1' % i] = OrderedDict([
77
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
78
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
79
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
80
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
81
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
82
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
83
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
84
+ ])
85
+
86
+ blocks['block%d_2' % i] = OrderedDict([
87
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
88
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
89
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
90
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
91
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
92
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
93
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
94
+ ])
95
+
96
+ for k in blocks.keys():
97
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
98
+
99
+ self.model1_1 = blocks['block1_1']
100
+ self.model2_1 = blocks['block2_1']
101
+ self.model3_1 = blocks['block3_1']
102
+ self.model4_1 = blocks['block4_1']
103
+ self.model5_1 = blocks['block5_1']
104
+ self.model6_1 = blocks['block6_1']
105
+
106
+ self.model1_2 = blocks['block1_2']
107
+ self.model2_2 = blocks['block2_2']
108
+ self.model3_2 = blocks['block3_2']
109
+ self.model4_2 = blocks['block4_2']
110
+ self.model5_2 = blocks['block5_2']
111
+ self.model6_2 = blocks['block6_2']
112
+
113
+
114
+ def forward(self, x):
115
+
116
+ out1 = self.model0(x)
117
+
118
+ out1_1 = self.model1_1(out1)
119
+ out1_2 = self.model1_2(out1)
120
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
121
+
122
+ out2_1 = self.model2_1(out2)
123
+ out2_2 = self.model2_2(out2)
124
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
125
+
126
+ out3_1 = self.model3_1(out3)
127
+ out3_2 = self.model3_2(out3)
128
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
129
+
130
+ out4_1 = self.model4_1(out4)
131
+ out4_2 = self.model4_2(out4)
132
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
133
+
134
+ out5_1 = self.model5_1(out5)
135
+ out5_2 = self.model5_2(out5)
136
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
137
+
138
+ out6_1 = self.model6_1(out6)
139
+ out6_2 = self.model6_2(out6)
140
+
141
+ return out6_1, out6_2
142
+
143
+ class handpose_model(nn.Module):
144
+ def __init__(self):
145
+ super(handpose_model, self).__init__()
146
+
147
+ # these layers have no relu layer
148
+ no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
149
+ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
150
+ # stage 1
151
+ block1_0 = OrderedDict([
152
+ ('conv1_1', [3, 64, 3, 1, 1]),
153
+ ('conv1_2', [64, 64, 3, 1, 1]),
154
+ ('pool1_stage1', [2, 2, 0]),
155
+ ('conv2_1', [64, 128, 3, 1, 1]),
156
+ ('conv2_2', [128, 128, 3, 1, 1]),
157
+ ('pool2_stage1', [2, 2, 0]),
158
+ ('conv3_1', [128, 256, 3, 1, 1]),
159
+ ('conv3_2', [256, 256, 3, 1, 1]),
160
+ ('conv3_3', [256, 256, 3, 1, 1]),
161
+ ('conv3_4', [256, 256, 3, 1, 1]),
162
+ ('pool3_stage1', [2, 2, 0]),
163
+ ('conv4_1', [256, 512, 3, 1, 1]),
164
+ ('conv4_2', [512, 512, 3, 1, 1]),
165
+ ('conv4_3', [512, 512, 3, 1, 1]),
166
+ ('conv4_4', [512, 512, 3, 1, 1]),
167
+ ('conv5_1', [512, 512, 3, 1, 1]),
168
+ ('conv5_2', [512, 512, 3, 1, 1]),
169
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])
170
+ ])
171
+
172
+ block1_1 = OrderedDict([
173
+ ('conv6_1_CPM', [128, 512, 1, 1, 0]),
174
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])
175
+ ])
176
+
177
+ blocks = {}
178
+ blocks['block1_0'] = block1_0
179
+ blocks['block1_1'] = block1_1
180
+
181
+ # stage 2-6
182
+ for i in range(2, 7):
183
+ blocks['block%d' % i] = OrderedDict([
184
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
185
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
186
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
187
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
188
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
189
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
190
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
191
+ ])
192
+
193
+ for k in blocks.keys():
194
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
195
+
196
+ self.model1_0 = blocks['block1_0']
197
+ self.model1_1 = blocks['block1_1']
198
+ self.model2 = blocks['block2']
199
+ self.model3 = blocks['block3']
200
+ self.model4 = blocks['block4']
201
+ self.model5 = blocks['block5']
202
+ self.model6 = blocks['block6']
203
+
204
+ def forward(self, x):
205
+ out1_0 = self.model1_0(x)
206
+ out1_1 = self.model1_1(out1_0)
207
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
208
+ out_stage2 = self.model2(concat_stage2)
209
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
210
+ out_stage3 = self.model3(concat_stage3)
211
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
212
+ out_stage4 = self.model4(concat_stage4)
213
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
214
+ out_stage5 = self.model5(concat_stage5)
215
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
216
+ out_stage6 = self.model6(concat_stage6)
217
+ return out_stage6
218
+
219
+
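A shape-check sketch (not part of the commit) for the two branches of `bodypose_model`: with a 368x368 input and three stride-2 poolings, the L1 branch emits 38 part-affinity-field channels and the L2 branch 19 keypoint heatmaps at 1/8 resolution, which is exactly how `Body.__call__` consumes them:

```
import torch
from annotator.openpose.model import bodypose_model

body = bodypose_model().eval()          # random weights are enough for a shape check
x = torch.zeros(1, 3, 368, 368)
with torch.no_grad():
    pafs, heatmaps = body(x)
print(pafs.shape)                       # torch.Size([1, 38, 46, 46])  L1 branch (PAFs)
print(heatmaps.shape)                   # torch.Size([1, 19, 46, 46])  L2 branch (heatmaps)
```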
annotator/openpose/util.py ADDED
@@ -0,0 +1,163 @@
1
+ import math
2
+ import numpy as np
3
+ import matplotlib
4
+ import cv2
5
+
6
+
7
+ def padRightDownCorner(img, stride, padValue):
8
+ h = img.shape[0]
9
+ w = img.shape[1]
10
+
11
+ pad = 4 * [None]
12
+ pad[0] = 0 # up
13
+ pad[1] = 0 # left
14
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
15
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
16
+
17
+ img_padded = img
18
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
19
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
20
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
21
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
22
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
23
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
24
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
25
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
26
+
27
+ return img_padded, pad
28
+
29
+ # transfer the caffe weights to pytorch, matching layer names
30
+ def transfer(model, model_weights):
31
+ transfered_model_weights = {}
32
+ for weights_name in model.state_dict().keys():
33
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
34
+ return transfered_model_weights
35
+
36
+ # draw the body keypoints and limbs
37
+ def draw_bodypose(canvas, candidate, subset, stickwidth=4, circlewidth=4):
38
+
39
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
40
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
41
+ [1, 16], [16, 18], [3, 17], [6, 18]]
42
+
43
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
44
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
45
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
46
+ for i in range(18):
47
+ for n in range(len(subset)):
48
+ index = int(subset[n][i])
49
+ if index == -1:
50
+ continue
51
+ x, y = candidate[index][0:2]
52
+ cv2.circle(canvas, (int(x), int(y)), circlewidth, colors[i], thickness=-1)
53
+ for i in range(17):
54
+ for n in range(len(subset)):
55
+ index = subset[n][np.array(limbSeq[i]) - 1]
56
+ if -1 in index:
57
+ continue
58
+ cur_canvas = canvas.copy()
59
+ Y = candidate[index.astype(int), 0]
60
+ X = candidate[index.astype(int), 1]
61
+ mX = np.mean(X)
62
+ mY = np.mean(Y)
63
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
64
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
65
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
66
+ cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
67
+ canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
68
+ # plt.imsave("preview.jpg", canvas[:, :, [2, 1, 0]])
69
+ # plt.imshow(canvas[:, :, [2, 1, 0]])
70
+ return canvas
71
+
72
+ # image drawn by opencv is not good.
73
+ def draw_handpose(canvas, all_hand_peaks, show_number=False, stickwidth=2):
74
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
75
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
76
+
77
+ for peaks in all_hand_peaks:
78
+ for ie, e in enumerate(edges):
79
+ if np.sum(np.all(peaks[e], axis=1)==0)==0:
80
+ x1, y1 = peaks[e[0]]
81
+ x2, y2 = peaks[e[1]]
82
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie/float(len(edges)), 1.0, 1.0])*255, thickness=stickwidth)
83
+
84
+ for i, keypoint in enumerate(peaks):
85
+ x, y = keypoint
86
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
87
+ if show_number:
88
+ cv2.putText(canvas, str(i), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), lineType=cv2.LINE_AA)
89
+ return canvas
90
+
91
+ # detect hand according to body pose keypoints
92
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
93
+ def handDetect(candidate, subset, oriImg):
94
+ # right hand: wrist 4, elbow 3, shoulder 2
95
+ # left hand: wrist 7, elbow 6, shoulder 5
96
+ ratioWristElbow = 0.33
97
+ detect_result = []
98
+ image_height, image_width = oriImg.shape[0:2]
99
+ for person in subset.astype(int):
100
+ # if any of three not detected
101
+ has_left = np.sum(person[[5, 6, 7]] == -1) == 0
102
+ has_right = np.sum(person[[2, 3, 4]] == -1) == 0
103
+ if not (has_left or has_right):
104
+ continue
105
+ hands = []
106
+ #left hand
107
+ if has_left:
108
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
109
+ x1, y1 = candidate[left_shoulder_index][:2]
110
+ x2, y2 = candidate[left_elbow_index][:2]
111
+ x3, y3 = candidate[left_wrist_index][:2]
112
+ hands.append([x1, y1, x2, y2, x3, y3, True])
113
+ # right hand
114
+ if has_right:
115
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
116
+ x1, y1 = candidate[right_shoulder_index][:2]
117
+ x2, y2 = candidate[right_elbow_index][:2]
118
+ x3, y3 = candidate[right_wrist_index][:2]
119
+ hands.append([x1, y1, x2, y2, x3, y3, False])
120
+
121
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
122
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
123
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
124
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
125
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
126
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
127
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
128
+ x = x3 + ratioWristElbow * (x3 - x2)
129
+ y = y3 + ratioWristElbow * (y3 - y2)
130
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
131
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
132
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
133
+ # x-y refers to the center --> offset to topLeft point
134
+ # handRectangle.x -= handRectangle.width / 2.f;
135
+ # handRectangle.y -= handRectangle.height / 2.f;
136
+ x -= width / 2
137
+ y -= width / 2 # width = height
138
+ # overflow the image
139
+ if x < 0: x = 0
140
+ if y < 0: y = 0
141
+ width1 = width
142
+ width2 = width
143
+ if x + width > image_width: width1 = image_width - x
144
+ if y + width > image_height: width2 = image_height - y
145
+ width = min(width1, width2)
146
+ # keep the hand box only if its width is at least 20 pixels
147
+ if width >= 20:
148
+ detect_result.append([int(x), int(y), int(width), is_left])
149
+
150
+ '''
151
+ return value: [[x, y, w, True if left hand else False]].
152
+ width = height since the network requires square input.
153
+ x, y are the coordinates of the top-left corner
154
+ '''
155
+ return detect_result
156
+
157
+ # get the (row, column) index of the maximum value of a 2D array
158
+ def npmax(array):
159
+ arrayindex = array.argmax(1)
160
+ arrayvalue = array.max(1)
161
+ i = arrayvalue.argmax()
162
+ j = arrayindex[i]
163
+ return i, j
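A minimal usage sketch of the hand detector above. Assumptions: `candidate` and `subset` come from the body-pose estimator and `oriImg` is the original BGR image; the square crops and the left-hand flip are illustrative only, not necessarily how the rest of the repo consumes these boxes.

import cv2

boxes = handDetect(candidate, subset, oriImg)
hand_crops = []
for x, y, w, is_left in boxes:
    crop = oriImg[y:y + w, x:x + w, :]   # square crop: width == height, per the docstring above
    if is_left:
        crop = cv2.flip(crop, 1)         # illustrative: mirror left hands so one right-hand model can be reused
    hand_crops.append(crop)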
annotator/segm/__init__.py ADDED
@@ -0,0 +1,162 @@
1
+ # Self-Correction-Human-Parsing
2
+ # Original https://github.com/GoGoDuck912/Self-Correction-Human-Parsing
3
+
4
+ import os
5
+ import torch
6
+ import numpy as np
7
+ from PIL import Image
8
+ import cv2
9
+
10
+ import torchvision.transforms as T
11
+
12
+ from .transforms import transform_logits, get_affine_transform
13
+ from . import networks
14
+ from annotator.util import annotator_ckpts_path
15
+ from huggingface_hub import snapshot_download
16
+
17
+ dataset_settings = {
18
+ 'lip': {
19
+ 'input_size': [473, 473],
20
+ 'num_classes': 20,
21
+ 'label': ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat',
22
+ 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm',
23
+ 'Left-leg', 'Right-leg', 'Left-shoe', 'Right-shoe']
24
+ },
25
+ 'atr': {
26
+ 'input_size': [512, 512],
27
+ 'num_classes': 18,
28
+ 'label': ['Background', 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', 'Skirt', 'Pants', 'Dress', 'Belt',
29
+ 'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf']
30
+ },
31
+ 'pascal': {
32
+ 'input_size': [512, 512],
33
+ 'num_classes': 7,
34
+ 'label': ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs'],
35
+ }
36
+ }
37
+
38
+
39
+ def get_palette(num_cls):
40
+ """ Returns the color map for visualizing the segmentation mask.
41
+ Args:
42
+ num_cls: Number of classes
43
+ Returns:
44
+ The color map
45
+ """
46
+ n = num_cls
47
+ palette = [0] * (n * 3)
48
+ for j in range(0, n):
49
+ lab = j
50
+ palette[j * 3 + 0] = 0
51
+ palette[j * 3 + 1] = 0
52
+ palette[j * 3 + 2] = 0
53
+ i = 0
54
+ while lab:
55
+ palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
56
+ palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
57
+ palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
58
+ i += 1
59
+ lab >>= 3
60
+ return palette
61
+
62
+ class Segmentator(torch.nn.Module):
63
+ def __init__(self, dataset='lip'):
64
+ super().__init__()
65
+
66
+ num_classes = dataset_settings[dataset]['num_classes']
67
+ input_size = dataset_settings[dataset]['input_size']
68
+ label = dataset_settings[dataset]['label']
69
+
70
+ if dataset == 'atr':
71
+ model_path='exp-schp-201908301523-atr.pth'
72
+ elif dataset == 'lip':
73
+ model_path='exp-schp-201908261155-lip.pth'
74
+
75
+ model_path = os.path.join(annotator_ckpts_path, model_path)
76
+
77
+ snapshot_download(repo_id="soonyau/visconet", allow_patterns=os.path.basename(model_path), local_dir=annotator_ckpts_path)
78
+
79
+ self.model = networks.init_model('resnet101', num_classes=num_classes, pretrained=None)
80
+ state_dict = torch.load(model_path)['state_dict']
81
+ from collections import OrderedDict
82
+ new_state_dict = OrderedDict()
83
+ for k, v in state_dict.items():
84
+ name = k[7:] # remove `module.`
85
+ new_state_dict[name] = v
86
+ self.model.load_state_dict(new_state_dict)
87
+ self.model.eval()
88
+
89
+ self.palette = get_palette(num_classes)
90
+
91
+ self.transform = T.Compose([
92
+ T.ToTensor(),
93
+ T.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])
94
+ ])
95
+ self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
96
+ self.input_size = np.asarray(input_size)
97
+
98
+ def _box2cs(self, box):
99
+ x, y, w, h = box[:4]
100
+ return self._xywh2cs(x, y, w, h)
101
+
102
+ def _xywh2cs(self, x, y, w, h):
103
+ center = np.zeros((2), dtype=np.float32)
104
+ center[0] = x + w * 0.5
105
+ center[1] = y + h * 0.5
106
+ if w > self.aspect_ratio * h:
107
+ h = w * 1.0 / self.aspect_ratio
108
+ elif w < self.aspect_ratio * h:
109
+ w = h * self.aspect_ratio
110
+ scale = np.array([w, h], dtype=np.float32)
111
+ return center, scale
112
+
113
+ def preprocess(self, image: np.ndarray):
114
+ # convert RGB to BGR (OpenCV channel order)
115
+ image = image[:,:,::-1]
116
+ h, w, _ = image.shape
117
+
118
+ # Get person center and scale
119
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
120
+ r = 0
121
+ trans = get_affine_transform(person_center, s, r, self.input_size)
122
+ input = cv2.warpAffine(
123
+ image,
124
+ trans,
125
+ (int(self.input_size[1]), int(self.input_size[0])),
126
+ flags=cv2.INTER_LINEAR,
127
+ borderMode=cv2.BORDER_CONSTANT,
128
+ borderValue=(0, 0, 0))
129
+
130
+ input = self.transform(input)
131
+ meta = {
132
+ 'center': person_center,
133
+ 'height': h,
134
+ 'width': w,
135
+ 'scale': s,
136
+ 'rotation': r
137
+ }
138
+
139
+ return input, meta
140
+
141
+ @torch.no_grad()
142
+ def __call__(self, input_image):
143
+ image, meta = self.preprocess(input_image)
144
+ c = meta['center']
145
+ s = meta['scale']
146
+ w = meta['width']
147
+ h = meta['height']
148
+ input_size = list(self.input_size)
149
+ device = next(self.parameters()).device
150
+ output = self.model(image.unsqueeze(0).to(device))
151
+ upsample = torch.nn.Upsample(size=input_size, mode='bilinear', align_corners=True)
152
+ upsample_output = upsample(output[0][-1][0].unsqueeze(0))
153
+ upsample_output = upsample_output.squeeze()
154
+ upsample_output = upsample_output.permute(1, 2, 0) # CHW -> HWC
155
+ logits_result = transform_logits(upsample_output.data.cpu().numpy(), c, s, w, h, input_size=input_size)
156
+ parsing_result = np.argmax(logits_result, axis=2)
157
+ output_img = Image.fromarray(np.asarray(parsing_result, dtype=np.uint8))
158
+ #return output_img
159
+ output_img.putpalette(self.palette)
160
+ return output_img
161
+ #return np.array(output_img)
162
+
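A minimal usage sketch of the Segmentator wrapper above. Assumptions: the InPlace-ABN extension pulled in by `networks` builds on import (it is JIT-compiled and needs a CUDA toolchain), the checkpoint download in the constructor succeeds, a GPU is available, and `person.jpg` is a hypothetical RGB photo of a person.

import numpy as np
from PIL import Image
from annotator.segm import Segmentator, dataset_settings

segm = Segmentator(dataset='atr').cuda()                    # 'atr' matches the checkpoint referenced above
image = np.array(Image.open('person.jpg').convert('RGB'))   # H x W x 3, RGB
parsing = segm(image)                                       # palettised PIL image, one class index per pixel
labels = dataset_settings['atr']['label']                   # class names corresponding to the indices
parsing.save('parsing.png')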
annotator/segm/modules/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .bn import ABN, InPlaceABN, InPlaceABNSync
2
+ from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
3
+ from .misc import GlobalAvgPool2d, SingleGPU
4
+ from .residual import IdentityResidualBlock
5
+ from .dense import DenseModule
annotator/segm/modules/bn.py ADDED
@@ -0,0 +1,132 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ try:
6
+ from queue import Queue
7
+ except ImportError:
8
+ from Queue import Queue
9
+
10
+ from .functions import *
11
+
12
+
13
+ class ABN(nn.Module):
14
+ """Activated Batch Normalization
15
+
16
+ This gathers a `BatchNorm2d` and an activation function in a single module
17
+ """
18
+
19
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
20
+ """Creates an Activated Batch Normalization module
21
+
22
+ Parameters
23
+ ----------
24
+ num_features : int
25
+ Number of feature channels in the input and output.
26
+ eps : float
27
+ Small constant to prevent numerical issues.
28
+ momentum : float
29
+ Momentum factor applied when computing running statistics.
30
+ affine : bool
31
+ If `True` apply learned scale and shift transformation after normalization.
32
+ activation : str
33
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
34
+ slope : float
35
+ Negative slope for the `leaky_relu` activation.
36
+ """
37
+ super(ABN, self).__init__()
38
+ self.num_features = num_features
39
+ self.affine = affine
40
+ self.eps = eps
41
+ self.momentum = momentum
42
+ self.activation = activation
43
+ self.slope = slope
44
+ if self.affine:
45
+ self.weight = nn.Parameter(torch.ones(num_features))
46
+ self.bias = nn.Parameter(torch.zeros(num_features))
47
+ else:
48
+ self.register_parameter('weight', None)
49
+ self.register_parameter('bias', None)
50
+ self.register_buffer('running_mean', torch.zeros(num_features))
51
+ self.register_buffer('running_var', torch.ones(num_features))
52
+ self.reset_parameters()
53
+
54
+ def reset_parameters(self):
55
+ nn.init.constant_(self.running_mean, 0)
56
+ nn.init.constant_(self.running_var, 1)
57
+ if self.affine:
58
+ nn.init.constant_(self.weight, 1)
59
+ nn.init.constant_(self.bias, 0)
60
+
61
+ def forward(self, x):
62
+ x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
63
+ self.training, self.momentum, self.eps)
64
+
65
+ if self.activation == ACT_RELU:
66
+ return functional.relu(x, inplace=True)
67
+ elif self.activation == ACT_LEAKY_RELU:
68
+ return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
69
+ elif self.activation == ACT_ELU:
70
+ return functional.elu(x, inplace=True)
71
+ else:
72
+ return x
73
+
74
+ def __repr__(self):
75
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
76
+ ' affine={affine}, activation={activation}'
77
+ if self.activation == "leaky_relu":
78
+ rep += ', slope={slope})'
79
+ else:
80
+ rep += ')'
81
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
82
+
83
+
84
+ class InPlaceABN(ABN):
85
+ """InPlace Activated Batch Normalization"""
86
+
87
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
88
+ """Creates an InPlace Activated Batch Normalization module
89
+
90
+ Parameters
91
+ ----------
92
+ num_features : int
93
+ Number of feature channels in the input and output.
94
+ eps : float
95
+ Small constant to prevent numerical issues.
96
+ momentum : float
97
+ Momentum factor applied when computing running statistics.
98
+ affine : bool
99
+ If `True` apply learned scale and shift transformation after normalization.
100
+ activation : str
101
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
102
+ slope : float
103
+ Negative slope for the `leaky_relu` activation.
104
+ """
105
+ super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)
106
+
107
+ def forward(self, x):
108
+ x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
109
+ self.training, self.momentum, self.eps, self.activation, self.slope)
110
+ return x
111
+
112
+
113
+ class InPlaceABNSync(ABN):
114
+ """InPlace Activated Batch Normalization with cross-GPU synchronization
115
+ This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
116
+ """
117
+
118
+ def forward(self, x):
119
+ x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
120
+ self.training, self.momentum, self.eps, self.activation, self.slope)
121
+ return x
122
+
123
+ def __repr__(self):
124
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
125
+ ' affine={affine}, activation={activation}'
126
+ if self.activation == "leaky_relu":
127
+ rep += ', slope={slope})'
128
+ else:
129
+ rep += ')'
130
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
131
+
132
+
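For reference, a small sketch of what ABN computes. Importing `annotator.segm.modules.bn` also imports `.functions`, which JIT-compiles the C++/CUDA extension further below, so this assumes that build succeeds in your environment.

import torch
from annotator.segm.modules.bn import ABN

abn = ABN(num_features=16, activation="leaky_relu", slope=0.01)
x = torch.randn(2, 16, 8, 8)
y = abn(x)   # per forward() above: functional.batch_norm followed by leaky_relu(negative_slope=0.01)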
annotator/segm/modules/deeplab.py ADDED
@@ -0,0 +1,84 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ from models._util import try_index
6
+ from .bn import ABN
7
+
8
+
9
+ class DeeplabV3(nn.Module):
10
+ def __init__(self,
11
+ in_channels,
12
+ out_channels,
13
+ hidden_channels=256,
14
+ dilations=(12, 24, 36),
15
+ norm_act=ABN,
16
+ pooling_size=None):
17
+ super(DeeplabV3, self).__init__()
18
+ self.pooling_size = pooling_size
19
+
20
+ self.map_convs = nn.ModuleList([
21
+ nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
22
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
23
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
24
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
25
+ ])
26
+ self.map_bn = norm_act(hidden_channels * 4)
27
+
28
+ self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
29
+ self.global_pooling_bn = norm_act(hidden_channels)
30
+
31
+ self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
32
+ self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
33
+ self.red_bn = norm_act(out_channels)
34
+
35
+ self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
36
+
37
+ def reset_parameters(self, activation, slope):
38
+ gain = nn.init.calculate_gain(activation, slope)
39
+ for m in self.modules():
40
+ if isinstance(m, nn.Conv2d):
41
+ nn.init.xavier_normal_(m.weight.data, gain)
42
+ if hasattr(m, "bias") and m.bias is not None:
43
+ nn.init.constant_(m.bias, 0)
44
+ elif isinstance(m, ABN):
45
+ if hasattr(m, "weight") and m.weight is not None:
46
+ nn.init.constant_(m.weight, 1)
47
+ if hasattr(m, "bias") and m.bias is not None:
48
+ nn.init.constant_(m.bias, 0)
49
+
50
+ def forward(self, x):
51
+ # Map convolutions
52
+ out = torch.cat([m(x) for m in self.map_convs], dim=1)
53
+ out = self.map_bn(out)
54
+ out = self.red_conv(out)
55
+
56
+ # Global pooling
57
+ pool = self._global_pooling(x)
58
+ pool = self.global_pooling_conv(pool)
59
+ pool = self.global_pooling_bn(pool)
60
+ pool = self.pool_red_conv(pool)
61
+ if self.training or self.pooling_size is None:
62
+ pool = pool.repeat(1, 1, x.size(2), x.size(3))
63
+
64
+ out += pool
65
+ out = self.red_bn(out)
66
+ return out
67
+
68
+ def _global_pooling(self, x):
69
+ if self.training or self.pooling_size is None:
70
+ pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
71
+ pool = pool.view(x.size(0), x.size(1), 1, 1)
72
+ else:
73
+ pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
74
+ min(try_index(self.pooling_size, 1), x.shape[3]))
75
+ padding = (
76
+ (pooling_size[1] - 1) // 2,
77
+ (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
78
+ (pooling_size[0] - 1) // 2,
79
+ (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
80
+ )
81
+
82
+ pool = functional.avg_pool2d(x, pooling_size, stride=1)
83
+ pool = functional.pad(pool, pad=padding, mode="replicate")
84
+ return pool
annotator/segm/modules/dense.py ADDED
@@ -0,0 +1,42 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .bn import ABN
7
+
8
+
9
+ class DenseModule(nn.Module):
10
+ def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
11
+ super(DenseModule, self).__init__()
12
+ self.in_channels = in_channels
13
+ self.growth = growth
14
+ self.layers = layers
15
+
16
+ self.convs1 = nn.ModuleList()
17
+ self.convs3 = nn.ModuleList()
18
+ for i in range(self.layers):
19
+ self.convs1.append(nn.Sequential(OrderedDict([
20
+ ("bn", norm_act(in_channels)),
21
+ ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
22
+ ])))
23
+ self.convs3.append(nn.Sequential(OrderedDict([
24
+ ("bn", norm_act(self.growth * bottleneck_factor)),
25
+ ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
26
+ dilation=dilation))
27
+ ])))
28
+ in_channels += self.growth
29
+
30
+ @property
31
+ def out_channels(self):
32
+ return self.in_channels + self.growth * self.layers
33
+
34
+ def forward(self, x):
35
+ inputs = [x]
36
+ for i in range(self.layers):
37
+ x = torch.cat(inputs, dim=1)
38
+ x = self.convs1[i](x)
39
+ x = self.convs3[i](x)
40
+ inputs += [x]
41
+
42
+ return torch.cat(inputs, dim=1)
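A quick sketch of the channel bookkeeping in DenseModule (same JIT-build assumption as above, since importing the package compiles the InPlace-ABN extension).

import torch
from annotator.segm.modules.dense import DenseModule

block = DenseModule(in_channels=64, growth=32, layers=4)
out = block(torch.randn(1, 64, 16, 16))
# every layer concatenates `growth` new channels onto its input
assert out.shape[1] == block.out_channels == 64 + 32 * 4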
annotator/segm/modules/functions.py ADDED
@@ -0,0 +1,244 @@
1
+ from os import path
2
+ import torch
3
+ import torch.distributed as dist
4
+ import torch.autograd as autograd
5
+ import torch.cuda.comm as comm
6
+ from torch.autograd.function import once_differentiable
7
+ from torch.utils.cpp_extension import load
8
+
9
+ _src_path = path.join(path.dirname(path.abspath(__file__)), "src")
10
+ _backend = load(name="inplace_abn",
11
+ extra_cflags=["-O3"],
12
+ sources=[path.join(_src_path, f) for f in [
13
+ "inplace_abn.cpp",
14
+ "inplace_abn_cpu.cpp",
15
+ "inplace_abn_cuda.cu",
16
+ "inplace_abn_cuda_half.cu"
17
+ ]],
18
+ extra_cuda_cflags=["--expt-extended-lambda"])
19
+
20
+ # Activation names
21
+ ACT_RELU = "relu"
22
+ ACT_LEAKY_RELU = "leaky_relu"
23
+ ACT_ELU = "elu"
24
+ ACT_NONE = "none"
25
+
26
+
27
+ def _check(fn, *args, **kwargs):
28
+ success = fn(*args, **kwargs)
29
+ if not success:
30
+ raise RuntimeError("CUDA Error encountered in {}".format(fn))
31
+
32
+
33
+ def _broadcast_shape(x):
34
+ out_size = []
35
+ for i, s in enumerate(x.size()):
36
+ if i != 1:
37
+ out_size.append(1)
38
+ else:
39
+ out_size.append(s)
40
+ return out_size
41
+
42
+
43
+ def _reduce(x):
44
+ if len(x.size()) == 2:
45
+ return x.sum(dim=0)
46
+ else:
47
+ n, c = x.size()[0:2]
48
+ return x.contiguous().view((n, c, -1)).sum(2).sum(0)
49
+
50
+
51
+ def _count_samples(x):
52
+ count = 1
53
+ for i, s in enumerate(x.size()):
54
+ if i != 1:
55
+ count *= s
56
+ return count
57
+
58
+
59
+ def _act_forward(ctx, x):
60
+ if ctx.activation == ACT_LEAKY_RELU:
61
+ _backend.leaky_relu_forward(x, ctx.slope)
62
+ elif ctx.activation == ACT_ELU:
63
+ _backend.elu_forward(x)
64
+ elif ctx.activation == ACT_NONE:
65
+ pass
66
+
67
+
68
+ def _act_backward(ctx, x, dx):
69
+ if ctx.activation == ACT_LEAKY_RELU:
70
+ _backend.leaky_relu_backward(x, dx, ctx.slope)
71
+ elif ctx.activation == ACT_ELU:
72
+ _backend.elu_backward(x, dx)
73
+ elif ctx.activation == ACT_NONE:
74
+ pass
75
+
76
+
77
+ class InPlaceABN(autograd.Function):
78
+ @staticmethod
79
+ def forward(ctx, x, weight, bias, running_mean, running_var,
80
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
81
+ # Save context
82
+ ctx.training = training
83
+ ctx.momentum = momentum
84
+ ctx.eps = eps
85
+ ctx.activation = activation
86
+ ctx.slope = slope
87
+ ctx.affine = weight is not None and bias is not None
88
+
89
+ # Prepare inputs
90
+ count = _count_samples(x)
91
+ x = x.contiguous()
92
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
93
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
94
+
95
+ if ctx.training:
96
+ mean, var = _backend.mean_var(x)
97
+
98
+ # Update running stats
99
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
100
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
101
+
102
+ # Mark in-place modified tensors
103
+ ctx.mark_dirty(x, running_mean, running_var)
104
+ else:
105
+ mean, var = running_mean.contiguous(), running_var.contiguous()
106
+ ctx.mark_dirty(x)
107
+
108
+ # BN forward + activation
109
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
110
+ _act_forward(ctx, x)
111
+
112
+ # Output
113
+ ctx.var = var
114
+ ctx.save_for_backward(x, var, weight, bias)
115
+ ctx.mark_non_differentiable(running_mean, running_var)
116
+ return x, running_mean, running_var
117
+
118
+ @staticmethod
119
+ @once_differentiable
120
+ def backward(ctx, dz, _drunning_mean, _drunning_var):
121
+ z, var, weight, bias = ctx.saved_tensors
122
+ dz = dz.contiguous()
123
+
124
+ # Undo activation
125
+ _act_backward(ctx, z, dz)
126
+
127
+ if ctx.training:
128
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
129
+ else:
130
+ # TODO: implement simplified CUDA backward for inference mode
131
+ edz = dz.new_zeros(dz.size(1))
132
+ eydz = dz.new_zeros(dz.size(1))
133
+
134
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
135
+ # dweight = eydz * weight.sign() if ctx.affine else None
136
+ dweight = eydz if ctx.affine else None
137
+ if dweight is not None:
138
+ dweight[weight < 0] *= -1
139
+ dbias = edz if ctx.affine else None
140
+
141
+ return dx, dweight, dbias, None, None, None, None, None, None, None
142
+
143
+
144
+ class InPlaceABNSync(autograd.Function):
145
+ @classmethod
146
+ def forward(cls, ctx, x, weight, bias, running_mean, running_var,
147
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
148
+ # Save context
149
+ ctx.training = training
150
+ ctx.momentum = momentum
151
+ ctx.eps = eps
152
+ ctx.activation = activation
153
+ ctx.slope = slope
154
+ ctx.affine = weight is not None and bias is not None
155
+
156
+ # Prepare inputs
157
+ ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
158
+
159
+ # count = _count_samples(x)
160
+ batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)
161
+
162
+ x = x.contiguous()
163
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
164
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
165
+
166
+ if ctx.training:
167
+ mean, var = _backend.mean_var(x)
168
+ if ctx.world_size > 1:
169
+ # get global batch size
170
+ if equal_batches:
171
+ batch_size *= ctx.world_size
172
+ else:
173
+ dist.all_reduce(batch_size, dist.ReduceOp.SUM)
174
+
175
+ ctx.factor = x.shape[0] / float(batch_size.item())
176
+
177
+ mean_all = mean.clone() * ctx.factor
178
+ dist.all_reduce(mean_all, dist.ReduceOp.SUM)
179
+
180
+ var_all = (var + (mean - mean_all) ** 2) * ctx.factor
181
+ dist.all_reduce(var_all, dist.ReduceOp.SUM)
182
+
183
+ mean = mean_all
184
+ var = var_all
185
+
186
+ # Update running stats
187
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
188
+ count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
189
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
190
+
191
+ # Mark in-place modified tensors
192
+ ctx.mark_dirty(x, running_mean, running_var)
193
+ else:
194
+ mean, var = running_mean.contiguous(), running_var.contiguous()
195
+ ctx.mark_dirty(x)
196
+
197
+ # BN forward + activation
198
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
199
+ _act_forward(ctx, x)
200
+
201
+ # Output
202
+ ctx.var = var
203
+ ctx.save_for_backward(x, var, weight, bias)
204
+ ctx.mark_non_differentiable(running_mean, running_var)
205
+ return x, running_mean, running_var
206
+
207
+ @staticmethod
208
+ @once_differentiable
209
+ def backward(ctx, dz, _drunning_mean, _drunning_var):
210
+ z, var, weight, bias = ctx.saved_tensors
211
+ dz = dz.contiguous()
212
+
213
+ # Undo activation
214
+ _act_backward(ctx, z, dz)
215
+
216
+ if ctx.training:
217
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
218
+ edz_local = edz.clone()
219
+ eydz_local = eydz.clone()
220
+
221
+ if ctx.world_size > 1:
222
+ edz *= ctx.factor
223
+ dist.all_reduce(edz, dist.ReduceOp.SUM)
224
+
225
+ eydz *= ctx.factor
226
+ dist.all_reduce(eydz, dist.ReduceOp.SUM)
227
+ else:
228
+ edz_local = edz = dz.new_zeros(dz.size(1))
229
+ eydz_local = eydz = dz.new_zeros(dz.size(1))
230
+
231
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
232
+ # dweight = eydz_local * weight.sign() if ctx.affine else None
233
+ dweight = eydz_local if ctx.affine else None
234
+ if dweight is not None:
235
+ dweight[weight < 0] *= -1
236
+ dbias = edz_local if ctx.affine else None
237
+
238
+ return dx, dweight, dbias, None, None, None, None, None, None, None
239
+
240
+
241
+ inplace_abn = InPlaceABN.apply
242
+ inplace_abn_sync = InPlaceABNSync.apply
243
+
244
+ __all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
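Calling the autograd function directly looks roughly like the sketch below; it assumes the JIT build of the `inplace_abn` extension succeeded and a CUDA device is available. The arguments mirror InPlaceABN.forward above.

import torch
from annotator.segm.modules.functions import inplace_abn, ACT_LEAKY_RELU

x = torch.randn(2, 16, 8, 8, device="cuda")
weight = torch.rand(16, device="cuda", requires_grad=True)
bias = torch.zeros(16, device="cuda", requires_grad=True)
running_mean = torch.zeros(16, device="cuda")
running_var = torch.ones(16, device="cuda")

# note: x is overwritten in place with the normalised, activated output
y, running_mean, running_var = inplace_abn(x, weight, bias, running_mean, running_var,
                                           True, 0.1, 1e-5, ACT_LEAKY_RELU, 0.01)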
annotator/segm/modules/misc.py ADDED
@@ -0,0 +1,21 @@
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.distributed as dist
4
+
5
+ class GlobalAvgPool2d(nn.Module):
6
+ def __init__(self):
7
+ """Global average pooling over the input's spatial dimensions"""
8
+ super(GlobalAvgPool2d, self).__init__()
9
+
10
+ def forward(self, inputs):
11
+ in_size = inputs.size()
12
+ return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
13
+
14
+ class SingleGPU(nn.Module):
15
+ def __init__(self, module):
16
+ super(SingleGPU, self).__init__()
17
+ self.module=module
18
+
19
+ def forward(self, input):
20
+ return self.module(input.cuda(non_blocking=True))
21
+
annotator/segm/modules/residual.py ADDED
@@ -0,0 +1,182 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch.nn as nn
4
+
5
+ from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
6
+ import torch.nn.functional as functional
7
+
8
+
9
+ class ResidualBlock(nn.Module):
10
+ """Configurable residual block
11
+
12
+ Parameters
13
+ ----------
14
+ in_channels : int
15
+ Number of input channels.
16
+ channels : list of int
17
+ Number of channels in the internal feature maps. Can either have two or three elements: if two, construct
18
+ a residual block with two `3 x 3` convolutions; if three, construct a bottleneck block with `1 x 1`, then
19
+ `3 x 3` then `1 x 1` convolutions.
20
+ stride : int
21
+ Stride of the first `3 x 3` convolution
22
+ dilation : int
23
+ Dilation to apply to the `3 x 3` convolutions.
24
+ groups : int
25
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
26
+ bottleneck blocks.
27
+ norm_act : callable
28
+ Function to create normalization / activation Module.
29
+ dropout: callable
30
+ Function to create Dropout Module.
31
+ """
32
+
33
+ def __init__(self,
34
+ in_channels,
35
+ channels,
36
+ stride=1,
37
+ dilation=1,
38
+ groups=1,
39
+ norm_act=ABN,
40
+ dropout=None):
41
+ super(ResidualBlock, self).__init__()
42
+
43
+ # Check parameters for inconsistencies
44
+ if len(channels) != 2 and len(channels) != 3:
45
+ raise ValueError("channels must contain either two or three values")
46
+ if len(channels) == 2 and groups != 1:
47
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
48
+
49
+ is_bottleneck = len(channels) == 3
50
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
51
+
52
+ if not is_bottleneck:
53
+ bn2 = norm_act(channels[1])
54
+ bn2.activation = ACT_NONE
55
+ layers = [
56
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
57
+ dilation=dilation)),
58
+ ("bn1", norm_act(channels[0])),
59
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
60
+ dilation=dilation)),
61
+ ("bn2", bn2)
62
+ ]
63
+ if dropout is not None:
64
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
65
+ else:
66
+ bn3 = norm_act(channels[2])
67
+ bn3.activation = ACT_NONE
68
+ layers = [
69
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
70
+ ("bn1", norm_act(channels[0])),
71
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
72
+ groups=groups, dilation=dilation)),
73
+ ("bn2", norm_act(channels[1])),
74
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
75
+ ("bn3", bn3)
76
+ ]
77
+ if dropout is not None:
78
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
79
+ self.convs = nn.Sequential(OrderedDict(layers))
80
+
81
+ if need_proj_conv:
82
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
83
+ self.proj_bn = norm_act(channels[-1])
84
+ self.proj_bn.activation = ACT_NONE
85
+
86
+ def forward(self, x):
87
+ if hasattr(self, "proj_conv"):
88
+ residual = self.proj_conv(x)
89
+ residual = self.proj_bn(residual)
90
+ else:
91
+ residual = x
92
+ x = self.convs(x) + residual
93
+
94
+ if self.convs.bn1.activation == ACT_LEAKY_RELU:
95
+ return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
96
+ elif self.convs.bn1.activation == ACT_ELU:
97
+ return functional.elu(x, inplace=True)
98
+ else:
99
+ return x
100
+
101
+
102
+ class IdentityResidualBlock(nn.Module):
103
+ def __init__(self,
104
+ in_channels,
105
+ channels,
106
+ stride=1,
107
+ dilation=1,
108
+ groups=1,
109
+ norm_act=ABN,
110
+ dropout=None):
111
+ """Configurable identity-mapping residual block
112
+
113
+ Parameters
114
+ ----------
115
+ in_channels : int
116
+ Number of input channels.
117
+ channels : list of int
118
+ Number of channels in the internal feature maps. Can either have two or three elements: if three construct
119
+ a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
120
+ `3 x 3` then `1 x 1` convolutions.
121
+ stride : int
122
+ Stride of the first `3 x 3` convolution
123
+ dilation : int
124
+ Dilation to apply to the `3 x 3` convolutions.
125
+ groups : int
126
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
127
+ bottleneck blocks.
128
+ norm_act : callable
129
+ Function to create normalization / activation Module.
130
+ dropout: callable
131
+ Function to create Dropout Module.
132
+ """
133
+ super(IdentityResidualBlock, self).__init__()
134
+
135
+ # Check parameters for inconsistencies
136
+ if len(channels) != 2 and len(channels) != 3:
137
+ raise ValueError("channels must contain either two or three values")
138
+ if len(channels) == 2 and groups != 1:
139
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
140
+
141
+ is_bottleneck = len(channels) == 3
142
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
143
+
144
+ self.bn1 = norm_act(in_channels)
145
+ if not is_bottleneck:
146
+ layers = [
147
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
148
+ dilation=dilation)),
149
+ ("bn2", norm_act(channels[0])),
150
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
151
+ dilation=dilation))
152
+ ]
153
+ if dropout is not None:
154
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
155
+ else:
156
+ layers = [
157
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
158
+ ("bn2", norm_act(channels[0])),
159
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
160
+ groups=groups, dilation=dilation)),
161
+ ("bn3", norm_act(channels[1])),
162
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
163
+ ]
164
+ if dropout is not None:
165
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
166
+ self.convs = nn.Sequential(OrderedDict(layers))
167
+
168
+ if need_proj_conv:
169
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
170
+
171
+ def forward(self, x):
172
+ if hasattr(self, "proj_conv"):
173
+ bn1 = self.bn1(x)
174
+ shortcut = self.proj_conv(bn1)
175
+ else:
176
+ shortcut = x.clone()
177
+ bn1 = self.bn1(x)
178
+
179
+ out = self.convs(bn1)
180
+ out.add_(shortcut)
181
+
182
+ return out
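A minimal construction sketch for the bottleneck variant of IdentityResidualBlock (three channel values give the 1 x 1, 3 x 3, 1 x 1 stack; same JIT-build assumption as above for the package import).

import torch
from annotator.segm.modules.bn import ABN
from annotator.segm.modules.residual import IdentityResidualBlock

block = IdentityResidualBlock(in_channels=64, channels=(32, 32, 128), stride=2, norm_act=ABN)
out = block(torch.randn(1, 64, 32, 32))
# a 1x1 projection shortcut is used because stride != 1 and in_channels != channels[-1]
print(out.shape)   # torch.Size([1, 128, 16, 16])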
annotator/segm/modules/src/checks.h ADDED
@@ -0,0 +1,15 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
annotator/segm/modules/src/inplace_abn.cpp ADDED
@@ -0,0 +1,95 @@
1
+ #include <torch/extension.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "inplace_abn.h"
6
+
7
+ std::vector<at::Tensor> mean_var(at::Tensor x) {
8
+ if (x.is_cuda()) {
9
+ if (x.type().scalarType() == at::ScalarType::Half) {
10
+ return mean_var_cuda_h(x);
11
+ } else {
12
+ return mean_var_cuda(x);
13
+ }
14
+ } else {
15
+ return mean_var_cpu(x);
16
+ }
17
+ }
18
+
19
+ at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
20
+ bool affine, float eps) {
21
+ if (x.is_cuda()) {
22
+ if (x.type().scalarType() == at::ScalarType::Half) {
23
+ return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
24
+ } else {
25
+ return forward_cuda(x, mean, var, weight, bias, affine, eps);
26
+ }
27
+ } else {
28
+ return forward_cpu(x, mean, var, weight, bias, affine, eps);
29
+ }
30
+ }
31
+
32
+ std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
33
+ bool affine, float eps) {
34
+ if (z.is_cuda()) {
35
+ if (z.type().scalarType() == at::ScalarType::Half) {
36
+ return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
37
+ } else {
38
+ return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
39
+ }
40
+ } else {
41
+ return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
42
+ }
43
+ }
44
+
45
+ at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
46
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
47
+ if (z.is_cuda()) {
48
+ if (z.type().scalarType() == at::ScalarType::Half) {
49
+ return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
50
+ } else {
51
+ return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
52
+ }
53
+ } else {
54
+ return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
55
+ }
56
+ }
57
+
58
+ void leaky_relu_forward(at::Tensor z, float slope) {
59
+ at::leaky_relu_(z, slope);
60
+ }
61
+
62
+ void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
63
+ if (z.is_cuda()) {
64
+ if (z.type().scalarType() == at::ScalarType::Half) {
65
+ return leaky_relu_backward_cuda_h(z, dz, slope);
66
+ } else {
67
+ return leaky_relu_backward_cuda(z, dz, slope);
68
+ }
69
+ } else {
70
+ return leaky_relu_backward_cpu(z, dz, slope);
71
+ }
72
+ }
73
+
74
+ void elu_forward(at::Tensor z) {
75
+ at::elu_(z);
76
+ }
77
+
78
+ void elu_backward(at::Tensor z, at::Tensor dz) {
79
+ if (z.is_cuda()) {
80
+ return elu_backward_cuda(z, dz);
81
+ } else {
82
+ return elu_backward_cpu(z, dz);
83
+ }
84
+ }
85
+
86
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
87
+ m.def("mean_var", &mean_var, "Mean and variance computation");
88
+ m.def("forward", &forward, "In-place forward computation");
89
+ m.def("edz_eydz", &edz_eydz, "First part of backward computation");
90
+ m.def("backward", &backward, "Second part of backward computation");
91
+ m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
92
+ m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
93
+ m.def("elu_forward", &elu_forward, "Elu forward computation");
94
+ m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
95
+ }
annotator/segm/modules/src/inplace_abn.h ADDED
@@ -0,0 +1,88 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ #include <vector>
6
+
7
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
8
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
9
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
10
+
11
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
12
+ bool affine, float eps);
13
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
14
+ bool affine, float eps);
15
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
16
+ bool affine, float eps);
17
+
18
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
19
+ bool affine, float eps);
20
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
21
+ bool affine, float eps);
22
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
23
+ bool affine, float eps);
24
+
25
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
26
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
27
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
28
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
29
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
30
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
31
+
32
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
33
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
34
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
35
+
36
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz);
37
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz);
38
+
39
+ static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
40
+ num = x.size(0);
41
+ chn = x.size(1);
42
+ sp = 1;
43
+ for (int64_t i = 2; i < x.ndimension(); ++i)
44
+ sp *= x.size(i);
45
+ }
46
+
47
+ /*
48
+ * Specialized CUDA reduction functions for BN
49
+ */
50
+ #ifdef __CUDACC__
51
+
52
+ #include "utils/cuda.cuh"
53
+
54
+ template <typename T, typename Op>
55
+ __device__ T reduce(Op op, int plane, int N, int S) {
56
+ T sum = (T)0;
57
+ for (int batch = 0; batch < N; ++batch) {
58
+ for (int x = threadIdx.x; x < S; x += blockDim.x) {
59
+ sum += op(batch, plane, x);
60
+ }
61
+ }
62
+
63
+ // sum over NumThreads within a warp
64
+ sum = warpSum(sum);
65
+
66
+ // 'transpose', and reduce within warp again
67
+ __shared__ T shared[32];
68
+ __syncthreads();
69
+ if (threadIdx.x % WARP_SIZE == 0) {
70
+ shared[threadIdx.x / WARP_SIZE] = sum;
71
+ }
72
+ if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
73
+ // zero out the other entries in shared
74
+ shared[threadIdx.x] = (T)0;
75
+ }
76
+ __syncthreads();
77
+ if (threadIdx.x / WARP_SIZE == 0) {
78
+ sum = warpSum(shared[threadIdx.x]);
79
+ if (threadIdx.x == 0) {
80
+ shared[0] = sum;
81
+ }
82
+ }
83
+ __syncthreads();
84
+
85
+ // Everyone picks it up, should be broadcast into the whole gradInput
86
+ return shared[0];
87
+ }
88
+ #endif
annotator/segm/modules/src/inplace_abn_cpu.cpp ADDED
@@ -0,0 +1,119 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "utils/checks.h"
6
+ #include "inplace_abn.h"
7
+
8
+ at::Tensor reduce_sum(at::Tensor x) {
9
+ if (x.ndimension() == 2) {
10
+ return x.sum(0);
11
+ } else {
12
+ auto x_view = x.view({x.size(0), x.size(1), -1});
13
+ return x_view.sum(-1).sum(0);
14
+ }
15
+ }
16
+
17
+ at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
18
+ if (x.ndimension() == 2) {
19
+ return v;
20
+ } else {
21
+ std::vector<int64_t> broadcast_size = {1, -1};
22
+ for (int64_t i = 2; i < x.ndimension(); ++i)
23
+ broadcast_size.push_back(1);
24
+
25
+ return v.view(broadcast_size);
26
+ }
27
+ }
28
+
29
+ int64_t count(at::Tensor x) {
30
+ int64_t count = x.size(0);
31
+ for (int64_t i = 2; i < x.ndimension(); ++i)
32
+ count *= x.size(i);
33
+
34
+ return count;
35
+ }
36
+
37
+ at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
38
+ if (affine) {
39
+ return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
40
+ } else {
41
+ return z;
42
+ }
43
+ }
44
+
45
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
46
+ auto num = count(x);
47
+ auto mean = reduce_sum(x) / num;
48
+ auto diff = x - broadcast_to(mean, x);
49
+ auto var = reduce_sum(diff.pow(2)) / num;
50
+
51
+ return {mean, var};
52
+ }
53
+
54
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
55
+ bool affine, float eps) {
56
+ auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
57
+ auto mul = at::rsqrt(var + eps) * gamma;
58
+
59
+ x.sub_(broadcast_to(mean, x));
60
+ x.mul_(broadcast_to(mul, x));
61
+ if (affine) x.add_(broadcast_to(bias, x));
62
+
63
+ return x;
64
+ }
65
+
66
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
67
+ bool affine, float eps) {
68
+ auto edz = reduce_sum(dz);
69
+ auto y = invert_affine(z, weight, bias, affine, eps);
70
+ auto eydz = reduce_sum(y * dz);
71
+
72
+ return {edz, eydz};
73
+ }
74
+
75
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
76
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
77
+ auto y = invert_affine(z, weight, bias, affine, eps);
78
+ auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
79
+
80
+ auto num = count(z);
81
+ auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
82
+ return dx;
83
+ }
84
+
85
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
86
+ CHECK_CPU_INPUT(z);
87
+ CHECK_CPU_INPUT(dz);
88
+
89
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
90
+ int64_t count = z.numel();
91
+ auto *_z = z.data<scalar_t>();
92
+ auto *_dz = dz.data<scalar_t>();
93
+
94
+ for (int64_t i = 0; i < count; ++i) {
95
+ if (_z[i] < 0) {
96
+ _z[i] *= 1 / slope;
97
+ _dz[i] *= slope;
98
+ }
99
+ }
100
+ }));
101
+ }
102
+
103
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
104
+ CHECK_CPU_INPUT(z);
105
+ CHECK_CPU_INPUT(dz);
106
+
107
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
108
+ int64_t count = z.numel();
109
+ auto *_z = z.data<scalar_t>();
110
+ auto *_dz = dz.data<scalar_t>();
111
+
112
+ for (int64_t i = 0; i < count; ++i) {
113
+ if (_z[i] < 0) {
114
+ _z[i] = log1p(_z[i]);
115
+ _dz[i] *= (_z[i] + 1.f);
116
+ }
117
+ }
118
+ }));
119
+ }
annotator/segm/modules/src/inplace_abn_cuda.cu ADDED
@@ -0,0 +1,333 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <thrust/device_ptr.h>
4
+ #include <thrust/transform.h>
5
+
6
+ #include <vector>
7
+
8
+ #include "utils/checks.h"
9
+ #include "utils/cuda.cuh"
10
+ #include "inplace_abn.h"
11
+
12
+ #include <ATen/cuda/CUDAContext.h>
13
+
14
+ // Operations for reduce
15
+ template<typename T>
16
+ struct SumOp {
17
+ __device__ SumOp(const T *t, int c, int s)
18
+ : tensor(t), chn(c), sp(s) {}
19
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
20
+ return tensor[(batch * chn + plane) * sp + n];
21
+ }
22
+ const T *tensor;
23
+ const int chn;
24
+ const int sp;
25
+ };
26
+
27
+ template<typename T>
28
+ struct VarOp {
29
+ __device__ VarOp(T m, const T *t, int c, int s)
30
+ : mean(m), tensor(t), chn(c), sp(s) {}
31
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
32
+ T val = tensor[(batch * chn + plane) * sp + n];
33
+ return (val - mean) * (val - mean);
34
+ }
35
+ const T mean;
36
+ const T *tensor;
37
+ const int chn;
38
+ const int sp;
39
+ };
40
+
41
+ template<typename T>
42
+ struct GradOp {
43
+ __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
44
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
45
+ __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
46
+ T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
47
+ T _dz = dz[(batch * chn + plane) * sp + n];
48
+ return Pair<T>(_dz, _y * _dz);
49
+ }
50
+ const T weight;
51
+ const T bias;
52
+ const T *z;
53
+ const T *dz;
54
+ const int chn;
55
+ const int sp;
56
+ };
57
+
58
+ /***********
59
+ * mean_var
60
+ ***********/
61
+
62
+ template<typename T>
63
+ __global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
64
+ int plane = blockIdx.x;
65
+ T norm = T(1) / T(num * sp);
66
+
67
+ T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
68
+ __syncthreads();
69
+ T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
70
+
71
+ if (threadIdx.x == 0) {
72
+ mean[plane] = _mean;
73
+ var[plane] = _var;
74
+ }
75
+ }
76
+
77
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
78
+ CHECK_CUDA_INPUT(x);
79
+
80
+ // Extract dimensions
81
+ int64_t num, chn, sp;
82
+ get_dims(x, num, chn, sp);
83
+
84
+ // Prepare output tensors
85
+ auto mean = at::empty({chn}, x.options());
86
+ auto var = at::empty({chn}, x.options());
87
+
88
+ // Run kernel
89
+ dim3 blocks(chn);
90
+ dim3 threads(getNumThreads(sp));
91
+ auto stream = at::cuda::getCurrentCUDAStream();
92
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
93
+ mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
94
+ x.data<scalar_t>(),
95
+ mean.data<scalar_t>(),
96
+ var.data<scalar_t>(),
97
+ num, chn, sp);
98
+ }));
99
+
100
+ return {mean, var};
101
+ }
102
+
103
+ /**********
104
+ * forward
105
+ **********/
106
+
107
+ template<typename T>
108
+ __global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
109
+ bool affine, float eps, int num, int chn, int sp) {
110
+ int plane = blockIdx.x;
111
+
112
+ T _mean = mean[plane];
113
+ T _var = var[plane];
114
+ T _weight = affine ? abs(weight[plane]) + eps : T(1);
115
+ T _bias = affine ? bias[plane] : T(0);
116
+
117
+ T mul = rsqrt(_var + eps) * _weight;
118
+
119
+ for (int batch = 0; batch < num; ++batch) {
120
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
121
+ T _x = x[(batch * chn + plane) * sp + n];
122
+ T _y = (_x - _mean) * mul + _bias;
123
+
124
+ x[(batch * chn + plane) * sp + n] = _y;
125
+ }
126
+ }
127
+ }
128
+
129
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
130
+ bool affine, float eps) {
131
+ CHECK_CUDA_INPUT(x);
132
+ CHECK_CUDA_INPUT(mean);
133
+ CHECK_CUDA_INPUT(var);
134
+ CHECK_CUDA_INPUT(weight);
135
+ CHECK_CUDA_INPUT(bias);
136
+
137
+ // Extract dimensions
138
+ int64_t num, chn, sp;
139
+ get_dims(x, num, chn, sp);
140
+
141
+ // Run kernel
142
+ dim3 blocks(chn);
143
+ dim3 threads(getNumThreads(sp));
144
+ auto stream = at::cuda::getCurrentCUDAStream();
145
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
146
+ forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
147
+ x.data<scalar_t>(),
148
+ mean.data<scalar_t>(),
149
+ var.data<scalar_t>(),
150
+ weight.data<scalar_t>(),
151
+ bias.data<scalar_t>(),
152
+ affine, eps, num, chn, sp);
153
+ }));
154
+
155
+ return x;
156
+ }
157
+
158
+ /***********
159
+ * edz_eydz
160
+ ***********/
161
+
162
+ template<typename T>
163
+ __global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
164
+ T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
165
+ int plane = blockIdx.x;
166
+
167
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
168
+ T _bias = affine ? bias[plane] : 0.f;
169
+
170
+ Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
171
+ __syncthreads();
172
+
173
+ if (threadIdx.x == 0) {
174
+ edz[plane] = res.v1;
175
+ eydz[plane] = res.v2;
176
+ }
177
+ }
178
+
179
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
180
+ bool affine, float eps) {
181
+ CHECK_CUDA_INPUT(z);
182
+ CHECK_CUDA_INPUT(dz);
183
+ CHECK_CUDA_INPUT(weight);
184
+ CHECK_CUDA_INPUT(bias);
185
+
186
+ // Extract dimensions
187
+ int64_t num, chn, sp;
188
+ get_dims(z, num, chn, sp);
189
+
190
+ auto edz = at::empty({chn}, z.options());
191
+ auto eydz = at::empty({chn}, z.options());
192
+
193
+ // Run kernel
194
+ dim3 blocks(chn);
195
+ dim3 threads(getNumThreads(sp));
196
+ auto stream = at::cuda::getCurrentCUDAStream();
197
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
198
+ edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
199
+ z.data<scalar_t>(),
200
+ dz.data<scalar_t>(),
201
+ weight.data<scalar_t>(),
202
+ bias.data<scalar_t>(),
203
+ edz.data<scalar_t>(),
204
+ eydz.data<scalar_t>(),
205
+ affine, eps, num, chn, sp);
206
+ }));
207
+
208
+ return {edz, eydz};
209
+ }
210
+
211
+ /***********
212
+ * backward
213
+ ***********/
214
+
215
+ template<typename T>
216
+ __global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
217
+ const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
218
+ int plane = blockIdx.x;
219
+
220
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
221
+ T _bias = affine ? bias[plane] : 0.f;
222
+ T _var = var[plane];
223
+ T _edz = edz[plane];
224
+ T _eydz = eydz[plane];
225
+
226
+ T _mul = _weight * rsqrt(_var + eps);
227
+ T count = T(num * sp);
228
+
229
+ for (int batch = 0; batch < num; ++batch) {
230
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
231
+ T _dz = dz[(batch * chn + plane) * sp + n];
232
+ T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
233
+
234
+ dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
235
+ }
236
+ }
237
+ }
238
+
239
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
240
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
241
+ CHECK_CUDA_INPUT(z);
242
+ CHECK_CUDA_INPUT(dz);
243
+ CHECK_CUDA_INPUT(var);
244
+ CHECK_CUDA_INPUT(weight);
245
+ CHECK_CUDA_INPUT(bias);
246
+ CHECK_CUDA_INPUT(edz);
247
+ CHECK_CUDA_INPUT(eydz);
248
+
249
+ // Extract dimensions
250
+ int64_t num, chn, sp;
251
+ get_dims(z, num, chn, sp);
252
+
253
+ auto dx = at::zeros_like(z);
254
+
255
+ // Run kernel
256
+ dim3 blocks(chn);
257
+ dim3 threads(getNumThreads(sp));
258
+ auto stream = at::cuda::getCurrentCUDAStream();
259
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
260
+ backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
261
+ z.data<scalar_t>(),
262
+ dz.data<scalar_t>(),
263
+ var.data<scalar_t>(),
264
+ weight.data<scalar_t>(),
265
+ bias.data<scalar_t>(),
266
+ edz.data<scalar_t>(),
267
+ eydz.data<scalar_t>(),
268
+ dx.data<scalar_t>(),
269
+ affine, eps, num, chn, sp);
270
+ }));
271
+
272
+ return dx;
273
+ }
274
+
275
+ /**************
276
+ * activations
277
+ **************/
278
+
279
+ template<typename T>
280
+ inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
281
+ // Create thrust pointers
282
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
283
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
284
+
285
+ auto stream = at::cuda::getCurrentCUDAStream();
286
+ thrust::transform_if(thrust::cuda::par.on(stream),
287
+ th_dz, th_dz + count, th_z, th_dz,
288
+ [slope] __device__ (const T& dz) { return dz * slope; },
289
+ [] __device__ (const T& z) { return z < 0; });
290
+ thrust::transform_if(thrust::cuda::par.on(stream),
291
+ th_z, th_z + count, th_z,
292
+ [slope] __device__ (const T& z) { return z / slope; },
293
+ [] __device__ (const T& z) { return z < 0; });
294
+ }
295
+
296
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
297
+ CHECK_CUDA_INPUT(z);
298
+ CHECK_CUDA_INPUT(dz);
299
+
300
+ int64_t count = z.numel();
301
+
302
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
303
+ leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
304
+ }));
305
+ }
306
+
307
+ template<typename T>
308
+ inline void elu_backward_impl(T *z, T *dz, int64_t count) {
309
+ // Create thrust pointers
310
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
311
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
312
+
313
+ auto stream = at::cuda::getCurrentCUDAStream();
314
+ thrust::transform_if(thrust::cuda::par.on(stream),
315
+ th_dz, th_dz + count, th_z, th_z, th_dz,
316
+ [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
317
+ [] __device__ (const T& z) { return z < 0; });
318
+ thrust::transform_if(thrust::cuda::par.on(stream),
319
+ th_z, th_z + count, th_z,
320
+ [] __device__ (const T& z) { return log1p(z); },
321
+ [] __device__ (const T& z) { return z < 0; });
322
+ }
323
+
324
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
325
+ CHECK_CUDA_INPUT(z);
326
+ CHECK_CUDA_INPUT(dz);
327
+
328
+ int64_t count = z.numel();
329
+
330
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
331
+ elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
332
+ }));
333
+ }
annotator/segm/modules/src/inplace_abn_cuda_half.cu ADDED
@@ -0,0 +1,275 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <cuda_fp16.h>
4
+
5
+ #include <vector>
6
+
7
+ #include "utils/checks.h"
8
+ #include "utils/cuda.cuh"
9
+ #include "inplace_abn.h"
10
+
11
+ #include <ATen/cuda/CUDAContext.h>
12
+
13
+ // Operations for reduce
14
+ struct SumOpH {
15
+ __device__ SumOpH(const half *t, int c, int s)
16
+ : tensor(t), chn(c), sp(s) {}
17
+ __device__ __forceinline__ float operator()(int batch, int plane, int n) {
18
+ return __half2float(tensor[(batch * chn + plane) * sp + n]);
19
+ }
20
+ const half *tensor;
21
+ const int chn;
22
+ const int sp;
23
+ };
24
+
25
+ struct VarOpH {
26
+ __device__ VarOpH(float m, const half *t, int c, int s)
27
+ : mean(m), tensor(t), chn(c), sp(s) {}
28
+ __device__ __forceinline__ float operator()(int batch, int plane, int n) {
29
+ const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
30
+ return (t - mean) * (t - mean);
31
+ }
32
+ const float mean;
33
+ const half *tensor;
34
+ const int chn;
35
+ const int sp;
36
+ };
37
+
38
+ struct GradOpH {
39
+ __device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
40
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
41
+ __device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
42
+ float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
43
+ float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
44
+ return Pair<float>(_dz, _y * _dz);
45
+ }
46
+ const float weight;
47
+ const float bias;
48
+ const half *z;
49
+ const half *dz;
50
+ const int chn;
51
+ const int sp;
52
+ };
53
+
54
+ /***********
55
+ * mean_var
56
+ ***********/
57
+
58
+ __global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
59
+ int plane = blockIdx.x;
60
+ float norm = 1.f / static_cast<float>(num * sp);
61
+
62
+ float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
63
+ __syncthreads();
64
+ float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
65
+
66
+ if (threadIdx.x == 0) {
67
+ mean[plane] = _mean;
68
+ var[plane] = _var;
69
+ }
70
+ }
71
+
72
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
73
+ CHECK_CUDA_INPUT(x);
74
+
75
+ // Extract dimensions
76
+ int64_t num, chn, sp;
77
+ get_dims(x, num, chn, sp);
78
+
79
+ // Prepare output tensors
80
+ auto mean = at::empty({chn},x.options().dtype(at::kFloat));
81
+ auto var = at::empty({chn},x.options().dtype(at::kFloat));
82
+
83
+ // Run kernel
84
+ dim3 blocks(chn);
85
+ dim3 threads(getNumThreads(sp));
86
+ auto stream = at::cuda::getCurrentCUDAStream();
87
+ mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
88
+ reinterpret_cast<half*>(x.data<at::Half>()),
89
+ mean.data<float>(),
90
+ var.data<float>(),
91
+ num, chn, sp);
92
+
93
+ return {mean, var};
94
+ }
95
+
96
+ /**********
97
+ * forward
98
+ **********/
99
+
100
+ __global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
101
+ bool affine, float eps, int num, int chn, int sp) {
102
+ int plane = blockIdx.x;
103
+
104
+ const float _mean = mean[plane];
105
+ const float _var = var[plane];
106
+ const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
107
+ const float _bias = affine ? bias[plane] : 0.f;
108
+
109
+ const float mul = rsqrt(_var + eps) * _weight;
110
+
111
+ for (int batch = 0; batch < num; ++batch) {
112
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
113
+ half *x_ptr = x + (batch * chn + plane) * sp + n;
114
+ float _x = __half2float(*x_ptr);
115
+ float _y = (_x - _mean) * mul + _bias;
116
+
117
+ *x_ptr = __float2half(_y);
118
+ }
119
+ }
120
+ }
121
+
122
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
123
+ bool affine, float eps) {
124
+ CHECK_CUDA_INPUT(x);
125
+ CHECK_CUDA_INPUT(mean);
126
+ CHECK_CUDA_INPUT(var);
127
+ CHECK_CUDA_INPUT(weight);
128
+ CHECK_CUDA_INPUT(bias);
129
+
130
+ // Extract dimensions
131
+ int64_t num, chn, sp;
132
+ get_dims(x, num, chn, sp);
133
+
134
+ // Run kernel
135
+ dim3 blocks(chn);
136
+ dim3 threads(getNumThreads(sp));
137
+ auto stream = at::cuda::getCurrentCUDAStream();
138
+ forward_kernel_h<<<blocks, threads, 0, stream>>>(
139
+ reinterpret_cast<half*>(x.data<at::Half>()),
140
+ mean.data<float>(),
141
+ var.data<float>(),
142
+ weight.data<float>(),
143
+ bias.data<float>(),
144
+ affine, eps, num, chn, sp);
145
+
146
+ return x;
147
+ }
148
+
149
+ __global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
150
+ float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
151
+ int plane = blockIdx.x;
152
+
153
+ float _weight = affine ? abs(weight[plane]) + eps : 1.f;
154
+ float _bias = affine ? bias[plane] : 0.f;
155
+
156
+ Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
157
+ __syncthreads();
158
+
159
+ if (threadIdx.x == 0) {
160
+ edz[plane] = res.v1;
161
+ eydz[plane] = res.v2;
162
+ }
163
+ }
164
+
165
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
166
+ bool affine, float eps) {
167
+ CHECK_CUDA_INPUT(z);
168
+ CHECK_CUDA_INPUT(dz);
169
+ CHECK_CUDA_INPUT(weight);
170
+ CHECK_CUDA_INPUT(bias);
171
+
172
+ // Extract dimensions
173
+ int64_t num, chn, sp;
174
+ get_dims(z, num, chn, sp);
175
+
176
+ auto edz = at::empty({chn},z.options().dtype(at::kFloat));
177
+ auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
178
+
179
+ // Run kernel
180
+ dim3 blocks(chn);
181
+ dim3 threads(getNumThreads(sp));
182
+ auto stream = at::cuda::getCurrentCUDAStream();
183
+ edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
184
+ reinterpret_cast<half*>(z.data<at::Half>()),
185
+ reinterpret_cast<half*>(dz.data<at::Half>()),
186
+ weight.data<float>(),
187
+ bias.data<float>(),
188
+ edz.data<float>(),
189
+ eydz.data<float>(),
190
+ affine, eps, num, chn, sp);
191
+
192
+ return {edz, eydz};
193
+ }
194
+
195
+ __global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
196
+ const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
197
+ int plane = blockIdx.x;
198
+
199
+ float _weight = affine ? abs(weight[plane]) + eps : 1.f;
200
+ float _bias = affine ? bias[plane] : 0.f;
201
+ float _var = var[plane];
202
+ float _edz = edz[plane];
203
+ float _eydz = eydz[plane];
204
+
205
+ float _mul = _weight * rsqrt(_var + eps);
206
+ float count = float(num * sp);
207
+
208
+ for (int batch = 0; batch < num; ++batch) {
209
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
210
+ float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
211
+ float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
212
+
213
+ dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
214
+ }
215
+ }
216
+ }
217
+
218
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
219
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
220
+ CHECK_CUDA_INPUT(z);
221
+ CHECK_CUDA_INPUT(dz);
222
+ CHECK_CUDA_INPUT(var);
223
+ CHECK_CUDA_INPUT(weight);
224
+ CHECK_CUDA_INPUT(bias);
225
+ CHECK_CUDA_INPUT(edz);
226
+ CHECK_CUDA_INPUT(eydz);
227
+
228
+ // Extract dimensions
229
+ int64_t num, chn, sp;
230
+ get_dims(z, num, chn, sp);
231
+
232
+ auto dx = at::zeros_like(z);
233
+
234
+ // Run kernel
235
+ dim3 blocks(chn);
236
+ dim3 threads(getNumThreads(sp));
237
+ auto stream = at::cuda::getCurrentCUDAStream();
238
+ backward_kernel_h<<<blocks, threads, 0, stream>>>(
239
+ reinterpret_cast<half*>(z.data<at::Half>()),
240
+ reinterpret_cast<half*>(dz.data<at::Half>()),
241
+ var.data<float>(),
242
+ weight.data<float>(),
243
+ bias.data<float>(),
244
+ edz.data<float>(),
245
+ eydz.data<float>(),
246
+ reinterpret_cast<half*>(dx.data<at::Half>()),
247
+ affine, eps, num, chn, sp);
248
+
249
+ return dx;
250
+ }
251
+
252
+ __global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
253
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
254
+ float _z = __half2float(z[i]);
255
+ if (_z < 0) {
256
+ dz[i] = __float2half(__half2float(dz[i]) * slope);
257
+ z[i] = __float2half(_z / slope);
258
+ }
259
+ }
260
+ }
261
+
262
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
263
+ CHECK_CUDA_INPUT(z);
264
+ CHECK_CUDA_INPUT(dz);
265
+
266
+ int64_t count = z.numel();
267
+ dim3 threads(getNumThreads(count));
268
+ dim3 blocks = (count + threads.x - 1) / threads.x;
269
+ auto stream = at::cuda::getCurrentCUDAStream();
270
+ leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
271
+ reinterpret_cast<half*>(z.data<at::Half>()),
272
+ reinterpret_cast<half*>(dz.data<at::Half>()),
273
+ slope, count);
274
+ }
275
+
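The half-precision variant keeps activations in fp16 but accumulates every per-channel statistic in float32 (all loads go through __half2float, all stores through __float2half). A rough PyTorch equivalent of mean_var_cuda_h for an NCHW half tensor, assuming biased variance to match the kernel's 1/(num*sp) normalisation:

import torch

def mean_var_fp16(x):
    # x: (N, C, H, W) in torch.half; statistics returned in float32, as in mean_var_cuda_h.
    c = x.shape[1]
    xf = x.float().transpose(0, 1).reshape(c, -1)    # (C, N*H*W)
    return xf.mean(dim=1), xf.var(dim=1, unbiased=False)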
annotator/segm/modules/src/utils/checks.h ADDED
@@ -0,0 +1,15 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
annotator/segm/modules/src/utils/common.h ADDED
@@ -0,0 +1,49 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ /*
6
+ * Functions to share code between CPU and GPU
7
+ */
8
+
9
+ #ifdef __CUDACC__
10
+ // CUDA versions
11
+
12
+ #define HOST_DEVICE __host__ __device__
13
+ #define INLINE_HOST_DEVICE __host__ __device__ inline
14
+ #define FLOOR(x) floor(x)
15
+
16
+ #if __CUDA_ARCH__ >= 600
17
+ // Recent compute capabilities have block-level atomicAdd for all data types, so we use that
18
+ #define ACCUM(x,y) atomicAdd_block(&(x),(y))
19
+ #else
20
+ // Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
21
+ // and use the known atomicCAS-based implementation for double
22
+ template<typename data_t>
23
+ __device__ inline data_t atomic_add(data_t *address, data_t val) {
24
+ return atomicAdd(address, val);
25
+ }
26
+
27
+ template<>
28
+ __device__ inline double atomic_add(double *address, double val) {
29
+ unsigned long long int* address_as_ull = (unsigned long long int*)address;
30
+ unsigned long long int old = *address_as_ull, assumed;
31
+ do {
32
+ assumed = old;
33
+ old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
34
+ } while (assumed != old);
35
+ return __longlong_as_double(old);
36
+ }
37
+
38
+ #define ACCUM(x,y) atomic_add(&(x),(y))
39
+ #endif // #if __CUDA_ARCH__ >= 600
40
+
41
+ #else
42
+ // CPU versions
43
+
44
+ #define HOST_DEVICE
45
+ #define INLINE_HOST_DEVICE inline
46
+ #define FLOOR(x) std::floor(x)
47
+ #define ACCUM(x,y) (x) += (y)
48
+
49
+ #endif // #ifdef __CUDACC__
annotator/segm/modules/src/utils/cuda.cuh ADDED
@@ -0,0 +1,71 @@
1
+ #pragma once
2
+
3
+ /*
4
+ * General settings and functions
5
+ */
6
+ const int WARP_SIZE = 32;
7
+ const int MAX_BLOCK_SIZE = 1024;
8
+
9
+ static int getNumThreads(int nElem) {
10
+ int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
11
+ for (int i = 0; i < 6; ++i) {
12
+ if (nElem <= threadSizes[i]) {
13
+ return threadSizes[i];
14
+ }
15
+ }
16
+ return MAX_BLOCK_SIZE;
17
+ }
18
+
19
+ /*
20
+ * Reduction utilities
21
+ */
22
+ template <typename T>
23
+ __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
24
+ unsigned int mask = 0xffffffff) {
25
+ #if CUDART_VERSION >= 9000
26
+ return __shfl_xor_sync(mask, value, laneMask, width);
27
+ #else
28
+ return __shfl_xor(value, laneMask, width);
29
+ #endif
30
+ }
31
+
32
+ __device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
33
+
34
+ template<typename T>
35
+ struct Pair {
36
+ T v1, v2;
37
+ __device__ Pair() {}
38
+ __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
39
+ __device__ Pair(T v) : v1(v), v2(v) {}
40
+ __device__ Pair(int v) : v1(v), v2(v) {}
41
+ __device__ Pair &operator+=(const Pair<T> &a) {
42
+ v1 += a.v1;
43
+ v2 += a.v2;
44
+ return *this;
45
+ }
46
+ };
47
+
48
+ template<typename T>
49
+ static __device__ __forceinline__ T warpSum(T val) {
50
+ #if __CUDA_ARCH__ >= 300
51
+ for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
52
+ val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
53
+ }
54
+ #else
55
+ __shared__ T values[MAX_BLOCK_SIZE];
56
+ values[threadIdx.x] = val;
57
+ __threadfence_block();
58
+ const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
59
+ for (int i = 1; i < WARP_SIZE; i++) {
60
+ val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
61
+ }
62
+ #endif
63
+ return val;
64
+ }
65
+
66
+ template<typename T>
67
+ static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
68
+ value.v1 = warpSum(value.v1);
69
+ value.v2 = warpSum(value.v2);
70
+ return value;
71
+ }
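getNumThreads above rounds the per-channel spatial size up to the next block size in {32, 64, 128, 256, 512, 1024}; the launchers then pair it with one block per channel. The same selection written out in Python, purely for illustration:

def get_num_threads(n_elem, max_block_size=1024):
    # Smallest candidate block size that covers n_elem, capped at MAX_BLOCK_SIZE.
    for size in (32, 64, 128, 256, 512, max_block_size):
        if n_elem <= size:
            return size
    return max_block_size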
annotator/segm/networks/AugmentCE2P.py ADDED
@@ -0,0 +1,337 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : AugmentCE2P.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.nn import functional as F
19
+ # Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn
20
+ # By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer
21
+ from ..modules import InPlaceABNSync
22
+
23
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
24
+
25
+ affine_par = True
26
+
27
+ pretrained_settings = {
28
+ 'resnet101': {
29
+ 'imagenet': {
30
+ 'input_space': 'BGR',
31
+ 'input_size': [3, 224, 224],
32
+ 'input_range': [0, 1],
33
+ 'mean': [0.406, 0.456, 0.485],
34
+ 'std': [0.225, 0.224, 0.229],
35
+ 'num_classes': 1000
36
+ }
37
+ },
38
+ }
39
+
40
+
41
+ def conv3x3(in_planes, out_planes, stride=1):
42
+ "3x3 convolution with padding"
43
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
44
+ padding=1, bias=False)
45
+
46
+
47
+ class Bottleneck(nn.Module):
48
+ expansion = 4
49
+
50
+ def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
51
+ super(Bottleneck, self).__init__()
52
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
53
+ self.bn1 = BatchNorm2d(planes)
54
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
55
+ padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False)
56
+ self.bn2 = BatchNorm2d(planes)
57
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
58
+ self.bn3 = BatchNorm2d(planes * 4)
59
+ self.relu = nn.ReLU(inplace=False)
60
+ self.relu_inplace = nn.ReLU(inplace=True)
61
+ self.downsample = downsample
62
+ self.dilation = dilation
63
+ self.stride = stride
64
+
65
+ def forward(self, x):
66
+ residual = x
67
+
68
+ out = self.conv1(x)
69
+ out = self.bn1(out)
70
+ out = self.relu(out)
71
+
72
+ out = self.conv2(out)
73
+ out = self.bn2(out)
74
+ out = self.relu(out)
75
+
76
+ out = self.conv3(out)
77
+ out = self.bn3(out)
78
+
79
+ if self.downsample is not None:
80
+ residual = self.downsample(x)
81
+
82
+ out = out + residual
83
+ out = self.relu_inplace(out)
84
+
85
+ return out
86
+
87
+
88
+ class PSPModule(nn.Module):
89
+ """
90
+ Reference:
91
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
92
+ """
93
+
94
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
95
+ super(PSPModule, self).__init__()
96
+
97
+ self.stages = []
98
+ self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
99
+ self.bottleneck = nn.Sequential(
100
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
101
+ bias=False),
102
+ InPlaceABNSync(out_features),
103
+ )
104
+
105
+ def _make_stage(self, features, out_features, size):
106
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
107
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
108
+ bn = InPlaceABNSync(out_features)
109
+ return nn.Sequential(prior, conv, bn)
110
+
111
+ def forward(self, feats):
112
+ h, w = feats.size(2), feats.size(3)
113
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
114
+ self.stages] + [feats]
115
+ bottle = self.bottleneck(torch.cat(priors, 1))
116
+ return bottle
117
+
118
+
119
+ class ASPPModule(nn.Module):
120
+ """
121
+ Reference:
122
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
123
+ """
124
+
125
+ def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
126
+ super(ASPPModule, self).__init__()
127
+
128
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
129
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
130
+ bias=False),
131
+ InPlaceABNSync(inner_features))
132
+ self.conv2 = nn.Sequential(
133
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
134
+ InPlaceABNSync(inner_features))
135
+ self.conv3 = nn.Sequential(
136
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
137
+ InPlaceABNSync(inner_features))
138
+ self.conv4 = nn.Sequential(
139
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
140
+ InPlaceABNSync(inner_features))
141
+ self.conv5 = nn.Sequential(
142
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
143
+ InPlaceABNSync(inner_features))
144
+
145
+ self.bottleneck = nn.Sequential(
146
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
147
+ InPlaceABNSync(out_features),
148
+ nn.Dropout2d(0.1)
149
+ )
150
+
151
+ def forward(self, x):
152
+ _, _, h, w = x.size()
153
+
154
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
155
+
156
+ feat2 = self.conv2(x)
157
+ feat3 = self.conv3(x)
158
+ feat4 = self.conv4(x)
159
+ feat5 = self.conv5(x)
160
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
161
+
162
+ bottle = self.bottleneck(out)
163
+ return bottle
164
+
165
+
166
+ class Edge_Module(nn.Module):
167
+ """
168
+ Edge Learning Branch
169
+ """
170
+
171
+ def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
172
+ super(Edge_Module, self).__init__()
173
+
174
+ self.conv1 = nn.Sequential(
175
+ nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
176
+ InPlaceABNSync(mid_fea)
177
+ )
178
+ self.conv2 = nn.Sequential(
179
+ nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
180
+ InPlaceABNSync(mid_fea)
181
+ )
182
+ self.conv3 = nn.Sequential(
183
+ nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
184
+ InPlaceABNSync(mid_fea)
185
+ )
186
+ self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
187
+ self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
188
+
189
+ def forward(self, x1, x2, x3):
190
+ _, _, h, w = x1.size()
191
+
192
+ edge1_fea = self.conv1(x1)
193
+ edge1 = self.conv4(edge1_fea)
194
+ edge2_fea = self.conv2(x2)
195
+ edge2 = self.conv4(edge2_fea)
196
+ edge3_fea = self.conv3(x3)
197
+ edge3 = self.conv4(edge3_fea)
198
+
199
+ edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
200
+ edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
201
+ edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
202
+ edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)
203
+
204
+ edge = torch.cat([edge1, edge2, edge3], dim=1)
205
+ edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
206
+ edge = self.conv5(edge)
207
+
208
+ return edge, edge_fea
209
+
210
+
211
+ class Decoder_Module(nn.Module):
212
+ """
213
+ Parsing Branch Decoder Module.
214
+ """
215
+
216
+ def __init__(self, num_classes):
217
+ super(Decoder_Module, self).__init__()
218
+ self.conv1 = nn.Sequential(
219
+ nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
220
+ InPlaceABNSync(256)
221
+ )
222
+ self.conv2 = nn.Sequential(
223
+ nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
224
+ InPlaceABNSync(48)
225
+ )
226
+ self.conv3 = nn.Sequential(
227
+ nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
228
+ InPlaceABNSync(256),
229
+ nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
230
+ InPlaceABNSync(256)
231
+ )
232
+
233
+ self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
234
+
235
+ def forward(self, xt, xl):
236
+ _, _, h, w = xl.size()
237
+ xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
238
+ xl = self.conv2(xl)
239
+ x = torch.cat([xt, xl], dim=1)
240
+ x = self.conv3(x)
241
+ seg = self.conv4(x)
242
+ return seg, x
243
+
244
+
245
+ class ResNet(nn.Module):
246
+ def __init__(self, block, layers, num_classes):
247
+ self.inplanes = 128
248
+ super(ResNet, self).__init__()
249
+ self.conv1 = conv3x3(3, 64, stride=2)
250
+ self.bn1 = BatchNorm2d(64)
251
+ self.relu1 = nn.ReLU(inplace=False)
252
+ self.conv2 = conv3x3(64, 64)
253
+ self.bn2 = BatchNorm2d(64)
254
+ self.relu2 = nn.ReLU(inplace=False)
255
+ self.conv3 = conv3x3(64, 128)
256
+ self.bn3 = BatchNorm2d(128)
257
+ self.relu3 = nn.ReLU(inplace=False)
258
+
259
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
260
+
261
+ self.layer1 = self._make_layer(block, 64, layers[0])
262
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
263
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
264
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))
265
+
266
+ self.context_encoding = PSPModule(2048, 512)
267
+
268
+ self.edge = Edge_Module()
269
+ self.decoder = Decoder_Module(num_classes)
270
+
271
+ self.fushion = nn.Sequential(
272
+ nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
273
+ InPlaceABNSync(256),
274
+ nn.Dropout2d(0.1),
275
+ nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
276
+ )
277
+
278
+ def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
279
+ downsample = None
280
+ if stride != 1 or self.inplanes != planes * block.expansion:
281
+ downsample = nn.Sequential(
282
+ nn.Conv2d(self.inplanes, planes * block.expansion,
283
+ kernel_size=1, stride=stride, bias=False),
284
+ BatchNorm2d(planes * block.expansion, affine=affine_par))
285
+
286
+ layers = []
287
+ generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
288
+ layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
289
+ multi_grid=generate_multi_grid(0, multi_grid)))
290
+ self.inplanes = planes * block.expansion
291
+ for i in range(1, blocks):
292
+ layers.append(
293
+ block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
294
+
295
+ return nn.Sequential(*layers)
296
+
297
+ def forward(self, x):
298
+ x = self.relu1(self.bn1(self.conv1(x)))
299
+ x = self.relu2(self.bn2(self.conv2(x)))
300
+ x = self.relu3(self.bn3(self.conv3(x)))
301
+ x = self.maxpool(x)
302
+ x2 = self.layer1(x)
303
+ x3 = self.layer2(x2)
304
+ x4 = self.layer3(x3)
305
+ x5 = self.layer4(x4)
306
+ x = self.context_encoding(x5)
307
+ parsing_result, parsing_fea = self.decoder(x, x2)
308
+ # Edge Branch
309
+ edge_result, edge_fea = self.edge(x2, x3, x4)
310
+ # Fusion Branch
311
+ x = torch.cat([parsing_fea, edge_fea], dim=1)
312
+ fusion_result = self.fushion(x)
313
+ return [[parsing_result, fusion_result], [edge_result]]
314
+
315
+
316
+ def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
317
+ model.input_space = settings['input_space']
318
+ model.input_size = settings['input_size']
319
+ model.input_range = settings['input_range']
320
+ model.mean = settings['mean']
321
+ model.std = settings['std']
322
+
323
+ if pretrained is not None:
324
+ saved_state_dict = torch.load(pretrained)
325
+ new_params = model.state_dict().copy()
326
+ for i in saved_state_dict:
327
+ i_parts = i.split('.')
328
+ if not i_parts[0] == 'fc':
329
+ new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
330
+ model.load_state_dict(new_params)
331
+
332
+
333
+ def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
334
+ model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
335
+ settings = pretrained_settings['resnet101']['imagenet']
336
+ initialize_pretrained_model(model, settings, pretrained)
337
+ return model
annotator/segm/networks/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ from __future__ import absolute_import
2
+
3
+ from ..networks.AugmentCE2P import resnet101
4
+
5
+ __factory = {
6
+ 'resnet101': resnet101,
7
+ }
8
+
9
+
10
+ def init_model(name, *args, **kwargs):
11
+ if name not in __factory.keys():
12
+ raise KeyError("Unknown model arch: {}".format(name))
13
+ return __factory[name](*args, **kwargs)
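A hedged usage sketch of this factory, assuming the InPlaceABNSync extension compiles on the target machine and that a 473x473 input is acceptable for the parser (an assumption, not something fixed by this file); pretrained=None skips checkpoint loading in initialize_pretrained_model:

import torch
from annotator.segm.networks import init_model

model = init_model('resnet101', num_classes=20, pretrained=None).cuda().eval()
x = torch.randn(1, 3, 473, 473, device='cuda')
with torch.no_grad():
    (parsing, fusion), (edge,) = model(x)       # structure returned by ResNet.forward in AugmentCE2P.py
print(parsing.shape, fusion.shape, edge.shape)  # per-pixel logits at reduced spatial resolution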
annotator/segm/networks/backbone/mobilenetv2.py ADDED
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : mobilenetv2.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch.nn as nn
15
+ import math
16
+ import functools
17
+
18
+ from torch.utils.model_zoo import load_url
+
+ from ...modules import InPlaceABN, InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['mobilenetv2']
23
+
24
+
25
+ def conv_bn(inp, oup, stride):
26
+ return nn.Sequential(
27
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
28
+ BatchNorm2d(oup),
29
+ nn.ReLU6(inplace=True)
30
+ )
31
+
32
+
33
+ def conv_1x1_bn(inp, oup):
34
+ return nn.Sequential(
35
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
36
+ BatchNorm2d(oup),
37
+ nn.ReLU6(inplace=True)
38
+ )
39
+
40
+
41
+ class InvertedResidual(nn.Module):
42
+ def __init__(self, inp, oup, stride, expand_ratio):
43
+ super(InvertedResidual, self).__init__()
44
+ self.stride = stride
45
+ assert stride in [1, 2]
46
+
47
+ hidden_dim = round(inp * expand_ratio)
48
+ self.use_res_connect = self.stride == 1 and inp == oup
49
+
50
+ if expand_ratio == 1:
51
+ self.conv = nn.Sequential(
52
+ # dw
53
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
54
+ BatchNorm2d(hidden_dim),
55
+ nn.ReLU6(inplace=True),
56
+ # pw-linear
57
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
58
+ BatchNorm2d(oup),
59
+ )
60
+ else:
61
+ self.conv = nn.Sequential(
62
+ # pw
63
+ nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
64
+ BatchNorm2d(hidden_dim),
65
+ nn.ReLU6(inplace=True),
66
+ # dw
67
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
68
+ BatchNorm2d(hidden_dim),
69
+ nn.ReLU6(inplace=True),
70
+ # pw-linear
71
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
72
+ BatchNorm2d(oup),
73
+ )
74
+
75
+ def forward(self, x):
76
+ if self.use_res_connect:
77
+ return x + self.conv(x)
78
+ else:
79
+ return self.conv(x)
80
+
81
+
82
+ class MobileNetV2(nn.Module):
83
+ def __init__(self, n_class=1000, input_size=224, width_mult=1.):
84
+ super(MobileNetV2, self).__init__()
85
+ block = InvertedResidual
86
+ input_channel = 32
87
+ last_channel = 1280
88
+ interverted_residual_setting = [
89
+ # t, c, n, s
90
+ [1, 16, 1, 1],
91
+ [6, 24, 2, 2], # layer 2
92
+ [6, 32, 3, 2], # layer 3
93
+ [6, 64, 4, 2],
94
+ [6, 96, 3, 1], # layer 4
95
+ [6, 160, 3, 2],
96
+ [6, 320, 1, 1], # layer 5
97
+ ]
98
+
99
+ # building first layer
100
+ assert input_size % 32 == 0
101
+ input_channel = int(input_channel * width_mult)
102
+ self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
103
+ self.features = [conv_bn(3, input_channel, 2)]
104
+ # building inverted residual blocks
105
+ for t, c, n, s in interverted_residual_setting:
106
+ output_channel = int(c * width_mult)
107
+ for i in range(n):
108
+ if i == 0:
109
+ self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
110
+ else:
111
+ self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
112
+ input_channel = output_channel
113
+ # building last several layers
114
+ self.features.append(conv_1x1_bn(input_channel, self.last_channel))
115
+ # make it nn.Sequential
116
+ self.features = nn.Sequential(*self.features)
117
+
118
+ # building classifier
119
+ self.classifier = nn.Sequential(
120
+ nn.Dropout(0.2),
121
+ nn.Linear(self.last_channel, n_class),
122
+ )
123
+
124
+ self._initialize_weights()
125
+
126
+ def forward(self, x):
127
+ x = self.features(x)
128
+ x = x.mean(3).mean(2)
129
+ x = self.classifier(x)
130
+ return x
131
+
132
+ def _initialize_weights(self):
133
+ for m in self.modules():
134
+ if isinstance(m, nn.Conv2d):
135
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
136
+ m.weight.data.normal_(0, math.sqrt(2. / n))
137
+ if m.bias is not None:
138
+ m.bias.data.zero_()
139
+ elif isinstance(m, BatchNorm2d):
140
+ m.weight.data.fill_(1)
141
+ m.bias.data.zero_()
142
+ elif isinstance(m, nn.Linear):
143
+ n = m.weight.size(1)
144
+ m.weight.data.normal_(0, 0.01)
145
+ m.bias.data.zero_()
146
+
147
+
148
+ def mobilenetv2(pretrained=False, **kwargs):
149
+ """Constructs a MobileNet_V2 model.
150
+ Args:
151
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
152
+ """
153
+ model = MobileNetV2(n_class=1000, **kwargs)
154
+ if pretrained:
155
+ # Note: model_urls is not defined in this file, so pretrained=True is currently unsupported.
+ model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False)
156
+ return model
annotator/segm/networks/backbone/resnet.py ADDED
@@ -0,0 +1,205 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnet.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+ import torch.nn as nn
16
+ import math
17
+ from torch.utils.model_zoo import load_url
18
+
19
+ from ...modules import InPlaceABNSync
20
+
21
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
22
+
23
+ __all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101']
24
+
25
+ model_urls = {
26
+ 'resnet18': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet18-imagenet.pth',
27
+ 'resnet50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet50-imagenet.pth',
28
+ 'resnet101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet101-imagenet.pth'
29
+ }
30
+
31
+
32
+ def conv3x3(in_planes, out_planes, stride=1):
33
+ "3x3 convolution with padding"
34
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
35
+ padding=1, bias=False)
36
+
37
+
38
+ class BasicBlock(nn.Module):
39
+ expansion = 1
40
+
41
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
42
+ super(BasicBlock, self).__init__()
43
+ self.conv1 = conv3x3(inplanes, planes, stride)
44
+ self.bn1 = BatchNorm2d(planes)
45
+ self.relu = nn.ReLU(inplace=True)
46
+ self.conv2 = conv3x3(planes, planes)
47
+ self.bn2 = BatchNorm2d(planes)
48
+ self.downsample = downsample
49
+ self.stride = stride
50
+
51
+ def forward(self, x):
52
+ residual = x
53
+
54
+ out = self.conv1(x)
55
+ out = self.bn1(out)
56
+ out = self.relu(out)
57
+
58
+ out = self.conv2(out)
59
+ out = self.bn2(out)
60
+
61
+ if self.downsample is not None:
62
+ residual = self.downsample(x)
63
+
64
+ out += residual
65
+ out = self.relu(out)
66
+
67
+ return out
68
+
69
+
70
+ class Bottleneck(nn.Module):
71
+ expansion = 4
72
+
73
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
74
+ super(Bottleneck, self).__init__()
75
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
76
+ self.bn1 = BatchNorm2d(planes)
77
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
78
+ padding=1, bias=False)
79
+ self.bn2 = BatchNorm2d(planes)
80
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
81
+ self.bn3 = BatchNorm2d(planes * 4)
82
+ self.relu = nn.ReLU(inplace=True)
83
+ self.downsample = downsample
84
+ self.stride = stride
85
+
86
+ def forward(self, x):
87
+ residual = x
88
+
89
+ out = self.conv1(x)
90
+ out = self.bn1(out)
91
+ out = self.relu(out)
92
+
93
+ out = self.conv2(out)
94
+ out = self.bn2(out)
95
+ out = self.relu(out)
96
+
97
+ out = self.conv3(out)
98
+ out = self.bn3(out)
99
+
100
+ if self.downsample is not None:
101
+ residual = self.downsample(x)
102
+
103
+ out += residual
104
+ out = self.relu(out)
105
+
106
+ return out
107
+
108
+
109
+ class ResNet(nn.Module):
110
+
111
+ def __init__(self, block, layers, num_classes=1000):
112
+ self.inplanes = 128
113
+ super(ResNet, self).__init__()
114
+ self.conv1 = conv3x3(3, 64, stride=2)
115
+ self.bn1 = BatchNorm2d(64)
116
+ self.relu1 = nn.ReLU(inplace=True)
117
+ self.conv2 = conv3x3(64, 64)
118
+ self.bn2 = BatchNorm2d(64)
119
+ self.relu2 = nn.ReLU(inplace=True)
120
+ self.conv3 = conv3x3(64, 128)
121
+ self.bn3 = BatchNorm2d(128)
122
+ self.relu3 = nn.ReLU(inplace=True)
123
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
124
+
125
+ self.layer1 = self._make_layer(block, 64, layers[0])
126
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
127
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
128
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
129
+ self.avgpool = nn.AvgPool2d(7, stride=1)
130
+ self.fc = nn.Linear(512 * block.expansion, num_classes)
131
+
132
+ for m in self.modules():
133
+ if isinstance(m, nn.Conv2d):
134
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
135
+ m.weight.data.normal_(0, math.sqrt(2. / n))
136
+ elif isinstance(m, BatchNorm2d):
137
+ m.weight.data.fill_(1)
138
+ m.bias.data.zero_()
139
+
140
+ def _make_layer(self, block, planes, blocks, stride=1):
141
+ downsample = None
142
+ if stride != 1 or self.inplanes != planes * block.expansion:
143
+ downsample = nn.Sequential(
144
+ nn.Conv2d(self.inplanes, planes * block.expansion,
145
+ kernel_size=1, stride=stride, bias=False),
146
+ BatchNorm2d(planes * block.expansion),
147
+ )
148
+
149
+ layers = []
150
+ layers.append(block(self.inplanes, planes, stride, downsample))
151
+ self.inplanes = planes * block.expansion
152
+ for i in range(1, blocks):
153
+ layers.append(block(self.inplanes, planes))
154
+
155
+ return nn.Sequential(*layers)
156
+
157
+ def forward(self, x):
158
+ x = self.relu1(self.bn1(self.conv1(x)))
159
+ x = self.relu2(self.bn2(self.conv2(x)))
160
+ x = self.relu3(self.bn3(self.conv3(x)))
161
+ x = self.maxpool(x)
162
+
163
+ x = self.layer1(x)
164
+ x = self.layer2(x)
165
+ x = self.layer3(x)
166
+ x = self.layer4(x)
167
+
168
+ x = self.avgpool(x)
169
+ x = x.view(x.size(0), -1)
170
+ x = self.fc(x)
171
+
172
+ return x
173
+
174
+
175
+ def resnet18(pretrained=False, **kwargs):
176
+ """Constructs a ResNet-18 model.
177
+ Args:
178
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
179
+ """
180
+ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
181
+ if pretrained:
182
+ model.load_state_dict(load_url(model_urls['resnet18']))
183
+ return model
184
+
185
+
186
+ def resnet50(pretrained=False, **kwargs):
187
+ """Constructs a ResNet-50 model.
188
+ Args:
189
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
190
+ """
191
+ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
192
+ if pretrained:
193
+ model.load_state_dict(load_url(model_urls['resnet50']), strict=False)
194
+ return model
195
+
196
+
197
+ def resnet101(pretrained=False, **kwargs):
198
+ """Constructs a ResNet-101 model.
199
+ Args:
200
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
201
+ """
202
+ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
203
+ if pretrained:
204
+ model.load_state_dict(load_url(model_urls['resnet101']), strict=False)
205
+ return model
annotator/segm/networks/backbone/resnext.py ADDED
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnext.py
8
+ @Time : 8/11/19 8:58 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import functools
14
+ import torch.nn as nn
15
+ import math
16
+ from torch.utils.model_zoo import load_url
17
+
18
+ from ...modules import InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['ResNeXt', 'resnext101'] # support resnext 101
23
+
24
+ model_urls = {
25
+ 'resnext50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext50-imagenet.pth',
26
+ 'resnext101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext101-imagenet.pth'
27
+ }
28
+
29
+
30
+ def conv3x3(in_planes, out_planes, stride=1):
31
+ "3x3 convolution with padding"
32
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
33
+ padding=1, bias=False)
34
+
35
+
36
+ class GroupBottleneck(nn.Module):
37
+ expansion = 2
38
+
39
+ def __init__(self, inplanes, planes, stride=1, groups=1, downsample=None):
40
+ super(GroupBottleneck, self).__init__()
41
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
42
+ self.bn1 = BatchNorm2d(planes)
43
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
44
+ padding=1, groups=groups, bias=False)
45
+ self.bn2 = BatchNorm2d(planes)
46
+ self.conv3 = nn.Conv2d(planes, planes * 2, kernel_size=1, bias=False)
47
+ self.bn3 = BatchNorm2d(planes * 2)
48
+ self.relu = nn.ReLU(inplace=True)
49
+ self.downsample = downsample
50
+ self.stride = stride
51
+
52
+ def forward(self, x):
53
+ residual = x
54
+
55
+ out = self.conv1(x)
56
+ out = self.bn1(out)
57
+ out = self.relu(out)
58
+
59
+ out = self.conv2(out)
60
+ out = self.bn2(out)
61
+ out = self.relu(out)
62
+
63
+ out = self.conv3(out)
64
+ out = self.bn3(out)
65
+
66
+ if self.downsample is not None:
67
+ residual = self.downsample(x)
68
+
69
+ out += residual
70
+ out = self.relu(out)
71
+
72
+ return out
73
+
74
+
75
+ class ResNeXt(nn.Module):
76
+
77
+ def __init__(self, block, layers, groups=32, num_classes=1000):
78
+ self.inplanes = 128
79
+ super(ResNeXt, self).__init__()
80
+ self.conv1 = conv3x3(3, 64, stride=2)
81
+ self.bn1 = BatchNorm2d(64)
82
+ self.relu1 = nn.ReLU(inplace=True)
83
+ self.conv2 = conv3x3(64, 64)
84
+ self.bn2 = BatchNorm2d(64)
85
+ self.relu2 = nn.ReLU(inplace=True)
86
+ self.conv3 = conv3x3(64, 128)
87
+ self.bn3 = BatchNorm2d(128)
88
+ self.relu3 = nn.ReLU(inplace=True)
89
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
90
+
91
+ self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
92
+ self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
93
+ self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
94
+ self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
95
+ self.avgpool = nn.AvgPool2d(7, stride=1)
96
+ self.fc = nn.Linear(1024 * block.expansion, num_classes)
97
+
98
+ for m in self.modules():
99
+ if isinstance(m, nn.Conv2d):
100
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
101
+ m.weight.data.normal_(0, math.sqrt(2. / n))
102
+ elif isinstance(m, BatchNorm2d):
103
+ m.weight.data.fill_(1)
104
+ m.bias.data.zero_()
105
+
106
+ def _make_layer(self, block, planes, blocks, stride=1, groups=1):
107
+ downsample = None
108
+ if stride != 1 or self.inplanes != planes * block.expansion:
109
+ downsample = nn.Sequential(
110
+ nn.Conv2d(self.inplanes, planes * block.expansion,
111
+ kernel_size=1, stride=stride, bias=False),
112
+ BatchNorm2d(planes * block.expansion),
113
+ )
114
+
115
+ layers = []
116
+ layers.append(block(self.inplanes, planes, stride, groups, downsample))
117
+ self.inplanes = planes * block.expansion
118
+ for i in range(1, blocks):
119
+ layers.append(block(self.inplanes, planes, groups=groups))
120
+
121
+ return nn.Sequential(*layers)
122
+
123
+ def forward(self, x):
124
+ x = self.relu1(self.bn1(self.conv1(x)))
125
+ x = self.relu2(self.bn2(self.conv2(x)))
126
+ x = self.relu3(self.bn3(self.conv3(x)))
127
+ x = self.maxpool(x)
128
+
129
+ x = self.layer1(x)
130
+ x = self.layer2(x)
131
+ x = self.layer3(x)
132
+ x = self.layer4(x)
133
+
134
+ x = self.avgpool(x)
135
+ x = x.view(x.size(0), -1)
136
+ x = self.fc(x)
137
+
138
+ return x
139
+
140
+
141
+ def resnext101(pretrained=False, **kwargs):
142
+ """Constructs a ResNet-101 model.
143
+ Args:
144
+ pretrained (bool): If True, returns a model pre-trained on Places
145
+ """
146
+ model = ResNeXt(GroupBottleneck, [3, 4, 23, 3], **kwargs)
147
+ if pretrained:
148
+ model.load_state_dict(load_url(model_urls['resnext101']), strict=False)
149
+ return model
annotator/segm/networks/context_encoding/aspp.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : aspp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
+ class ASPPModule(nn.Module):
22
+ """
23
+ Reference:
24
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
25
+ """
26
+ def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
27
+ super(ASPPModule, self).__init__()
28
+
29
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
30
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
31
+ bias=False),
32
+ InPlaceABNSync(inner_features))
33
+ self.conv2 = nn.Sequential(
34
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
35
+ InPlaceABNSync(inner_features))
36
+ self.conv3 = nn.Sequential(
37
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
38
+ InPlaceABNSync(inner_features))
39
+ self.conv4 = nn.Sequential(
40
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
41
+ InPlaceABNSync(inner_features))
42
+ self.conv5 = nn.Sequential(
43
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
44
+ InPlaceABNSync(inner_features))
45
+
46
+ self.bottleneck = nn.Sequential(
47
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
48
+ InPlaceABNSync(out_features),
49
+ nn.Dropout2d(0.1)
50
+ )
51
+
52
+ def forward(self, x):
53
+ _, _, h, w = x.size()
54
+
55
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
56
+
57
+ feat2 = self.conv2(x)
58
+ feat3 = self.conv3(x)
59
+ feat4 = self.conv4(x)
60
+ feat5 = self.conv5(x)
61
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
62
+
63
+ bottle = self.bottleneck(out)
64
+ return bottle
annotator/segm/networks/context_encoding/ocnet.py ADDED
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : ocnet.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.autograd import Variable
19
+ from torch.nn import functional as F
20
+
21
+ from ...modules import InPlaceABNSync
22
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
23
+
24
+
25
+ class _SelfAttentionBlock(nn.Module):
26
+ '''
27
+ The basic implementation for self-attention block/non-local block
28
+ Input:
29
+ N X C X H X W
30
+ Parameters:
31
+ in_channels : the dimension of the input feature map
32
+ key_channels : the dimension after the key/query transform
33
+ value_channels : the dimension after the value transform
34
+ scale : choose the scale to downsample the input feature maps (save memory cost)
35
+ Return:
36
+ N X C X H X W
37
+ position-aware context features (without concatenating or adding the input).
38
+ '''
39
+
40
+ def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
41
+ super(_SelfAttentionBlock, self).__init__()
42
+ self.scale = scale
43
+ self.in_channels = in_channels
44
+ self.out_channels = out_channels
45
+ self.key_channels = key_channels
46
+ self.value_channels = value_channels
47
+ if out_channels is None:
48
+ self.out_channels = in_channels
49
+ self.pool = nn.MaxPool2d(kernel_size=(scale, scale))
50
+ self.f_key = nn.Sequential(
51
+ nn.Conv2d(in_channels=self.in_channels, out_channels=self.key_channels,
52
+ kernel_size=1, stride=1, padding=0),
53
+ InPlaceABNSync(self.key_channels),
54
+ )
55
+ self.f_query = self.f_key
56
+ self.f_value = nn.Conv2d(in_channels=self.in_channels, out_channels=self.value_channels,
57
+ kernel_size=1, stride=1, padding=0)
58
+ self.W = nn.Conv2d(in_channels=self.value_channels, out_channels=self.out_channels,
59
+ kernel_size=1, stride=1, padding=0)
60
+ nn.init.constant_(self.W.weight, 0)
61
+ nn.init.constant_(self.W.bias, 0)
62
+
63
+ def forward(self, x):
64
+ batch_size, h, w = x.size(0), x.size(2), x.size(3)
65
+ if self.scale > 1:
66
+ x = self.pool(x)
67
+
68
+ value = self.f_value(x).view(batch_size, self.value_channels, -1)
69
+ value = value.permute(0, 2, 1)
70
+ query = self.f_query(x).view(batch_size, self.key_channels, -1)
71
+ query = query.permute(0, 2, 1)
72
+ key = self.f_key(x).view(batch_size, self.key_channels, -1)
73
+
74
+ sim_map = torch.matmul(query, key)
75
+ sim_map = (self.key_channels ** -.5) * sim_map
76
+ sim_map = F.softmax(sim_map, dim=-1)
77
+
78
+ context = torch.matmul(sim_map, value)
79
+ context = context.permute(0, 2, 1).contiguous()
80
+ context = context.view(batch_size, self.value_channels, *x.size()[2:])
81
+ context = self.W(context)
82
+ if self.scale > 1:
83
+ context = F.interpolate(input=context, size=(h, w), mode='bilinear', align_corners=True)
84
+ return context
85
+
86
+
87
+ class SelfAttentionBlock2D(_SelfAttentionBlock):
88
+ def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
89
+ super(SelfAttentionBlock2D, self).__init__(in_channels,
90
+ key_channels,
91
+ value_channels,
92
+ out_channels,
93
+ scale)
94
+
95
+
96
+ class BaseOC_Module(nn.Module):
97
+ """
98
+ Implementation of the BaseOC module
99
+ Parameters:
100
+ in_features / out_features: the channels of the input / output feature maps.
101
+ dropout: we choose 0.05 as the default value.
102
+ size: you can apply multiple sizes. Here we only use one size.
103
+ Return:
104
+ features fused with Object context information.
105
+ """
106
+
107
+ def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
108
+ super(BaseOC_Module, self).__init__()
109
+ self.stages = []
110
+ self.stages = nn.ModuleList(
111
+ [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
112
+ self.conv_bn_dropout = nn.Sequential(
113
+ nn.Conv2d(2 * in_channels, out_channels, kernel_size=1, padding=0),
114
+ InPlaceABNSync(out_channels),
115
+ nn.Dropout2d(dropout)
116
+ )
117
+
118
+ def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
119
+ return SelfAttentionBlock2D(in_channels,
120
+ key_channels,
121
+ value_channels,
122
+ output_channels,
123
+ size)
124
+
125
+ def forward(self, feats):
126
+ priors = [stage(feats) for stage in self.stages]
127
+ context = priors[0]
128
+ for i in range(1, len(priors)):
129
+ context += priors[i]
130
+ output = self.conv_bn_dropout(torch.cat([context, feats], 1))
131
+ return output
132
+
133
+
134
+ class BaseOC_Context_Module(nn.Module):
135
+ """
136
+ Output only the context features.
137
+ Parameters:
138
+ in_features / out_features: the channels of the input / output feature maps.
139
+ dropout: specify the dropout ratio
140
+ fusion: we provide two different fusion methods, "concat" or "add"
141
+ size: we find that directly learning the attention weights even on 1/8-resolution feature maps is hard.
142
+ Return:
143
+ features after "concat" or "add"
144
+ """
145
+
146
+ def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
147
+ super(BaseOC_Context_Module, self).__init__()
148
+ self.stages = []
149
+ self.stages = nn.ModuleList(
150
+ [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
151
+ self.conv_bn_dropout = nn.Sequential(
152
+ nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0),
153
+ InPlaceABNSync(out_channels),
154
+ )
155
+
156
+ def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
157
+ return SelfAttentionBlock2D(in_channels,
158
+ key_channels,
159
+ value_channels,
160
+ output_channels,
161
+ size)
162
+
163
+ def forward(self, feats):
164
+ priors = [stage(feats) for stage in self.stages]
165
+ context = priors[0]
166
+ for i in range(1, len(priors)):
167
+ context += priors[i]
168
+ output = self.conv_bn_dropout(context)
169
+ return output
170
+
171
+
172
+ class ASP_OC_Module(nn.Module):
173
+ def __init__(self, features, out_features=256, dilations=(12, 24, 36)):
174
+ super(ASP_OC_Module, self).__init__()
175
+ self.context = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=3, padding=1, dilation=1, bias=True),
176
+ InPlaceABNSync(out_features),
177
+ BaseOC_Context_Module(in_channels=out_features, out_channels=out_features,
178
+ key_channels=out_features // 2, value_channels=out_features,
179
+ dropout=0, sizes=([2])))
180
+ self.conv2 = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
181
+ InPlaceABNSync(out_features))
182
+ self.conv3 = nn.Sequential(
183
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
184
+ InPlaceABNSync(out_features))
185
+ self.conv4 = nn.Sequential(
186
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
187
+ InPlaceABNSync(out_features))
188
+ self.conv5 = nn.Sequential(
189
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
190
+ InPlaceABNSync(out_features))
191
+
192
+ self.conv_bn_dropout = nn.Sequential(
193
+ nn.Conv2d(out_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
194
+ InPlaceABNSync(out_features),
195
+ nn.Dropout2d(0.1)
196
+ )
197
+
198
+ def _cat_each(self, feat1, feat2, feat3, feat4, feat5):
199
+ assert (len(feat1) == len(feat2))
200
+ z = []
201
+ for i in range(len(feat1)):
202
+ z.append(torch.cat((feat1[i], feat2[i], feat3[i], feat4[i], feat5[i]), 1))
203
+ return z
204
+
205
+ def forward(self, x):
206
+ if isinstance(x, Variable):
207
+ _, _, h, w = x.size()
208
+ elif isinstance(x, tuple) or isinstance(x, list):
209
+ _, _, h, w = x[0].size()
210
+ else:
211
+ raise RuntimeError('unknown input type')
212
+
213
+ feat1 = self.context(x)
214
+ feat2 = self.conv2(x)
215
+ feat3 = self.conv3(x)
216
+ feat4 = self.conv4(x)
217
+ feat5 = self.conv5(x)
218
+
219
+ if isinstance(x, Variable):
220
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
221
+ elif isinstance(x, tuple) or isinstance(x, list):
222
+ out = self._cat_each(feat1, feat2, feat3, feat4, feat5)
223
+ else:
224
+ raise RuntimeError('unknown input type')
225
+ output = self.conv_bn_dropout(out)
226
+ return output
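Stripped of the conv + InPlaceABNSync projections, _SelfAttentionBlock is ordinary scaled dot-product attention over all spatial positions. A self-contained sketch of that core computation (hypothetical helper, not defined in this file):

import torch
import torch.nn.functional as F

def spatial_self_attention(query, key, value, key_channels):
    # query/key: (N, key_channels, H, W); value: (N, value_channels, H, W)
    n, _, h, w = query.shape
    q = query.view(n, key_channels, -1).permute(0, 2, 1)     # N x HW x Ck
    k = key.view(n, key_channels, -1)                        # N x Ck x HW
    v = value.view(n, value.size(1), -1).permute(0, 2, 1)    # N x HW x Cv
    sim = torch.matmul(q, k) * key_channels ** -0.5          # N x HW x HW similarity map
    ctx = torch.matmul(F.softmax(sim, dim=-1), v)            # N x HW x Cv context
    return ctx.permute(0, 2, 1).reshape(n, -1, h, w)         # back to N x Cv x H x W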
annotator/segm/networks/context_encoding/psp.py ADDED
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : psp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
+ class PSPModule(nn.Module):
22
+ """
23
+ Reference:
24
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
25
+ """
26
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
27
+ super(PSPModule, self).__init__()
28
+
29
+ self.stages = []
30
+ self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
31
+ self.bottleneck = nn.Sequential(
32
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
33
+ bias=False),
34
+ InPlaceABNSync(out_features),
35
+ )
36
+
37
+ def _make_stage(self, features, out_features, size):
38
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
39
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
40
+ bn = InPlaceABNSync(out_features)
41
+ return nn.Sequential(prior, conv, bn)
42
+
43
+ def forward(self, feats):
44
+ h, w = feats.size(2), feats.size(3)
45
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
46
+ self.stages] + [feats]
47
+ bottle = self.bottleneck(torch.cat(priors, 1))
48
+ return bottle
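PSPModule follows the same recipe at multiple pooling scales: adaptive-average-pool the features to a small grid, project with a 1x1 convolution, upsample back, and concatenate with the input before a bottleneck. A runnable sketch of that idea, again with BatchNorm2d + ReLU standing in for InPlaceABNSync so it works without the inplace_abn extension:

import torch
import torch.nn as nn
import torch.nn.functional as F

class PSPSketch(nn.Module):
    """Illustrative pyramid pooling: pool to several grid sizes, project,
    upsample back, concatenate with the input, then bottleneck."""
    def __init__(self, in_ch, out_ch=512, sizes=(1, 2, 3, 6)):
        super().__init__()
        self.stages = nn.ModuleList([
            nn.Sequential(nn.AdaptiveAvgPool2d(s),
                          nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False),
                          nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True))
            for s in sizes])
        self.bottleneck = nn.Sequential(
            nn.Conv2d(in_ch + len(sizes) * out_ch, out_ch, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True))

    def forward(self, feats):
        h, w = feats.shape[2:]
        priors = [F.interpolate(stage(feats), size=(h, w), mode='bilinear',
                                align_corners=True) for stage in self.stages]
        return self.bottleneck(torch.cat(priors + [feats], dim=1))

m = PSPSketch(2048).eval()   # eval() so BatchNorm accepts the 1x1 pooled maps
with torch.no_grad():
    print(m(torch.randn(1, 2048, 24, 16)).shape)  # torch.Size([1, 512, 24, 16])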
annotator/segm/transforms.py ADDED
@@ -0,0 +1,167 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft
3
+ # Licensed under the MIT License.
4
+ # Written by Bin Xiao (Bin.Xiao@microsoft.com)
5
+ # ------------------------------------------------------------------------------
6
+
7
+ from __future__ import absolute_import
8
+ from __future__ import division
9
+ from __future__ import print_function
10
+
11
+ import numpy as np
12
+ import cv2
13
+ import torch
14
+
15
+ class BRG2Tensor_transform(object):
16
+ def __call__(self, pic):
17
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
18
+ if isinstance(img, torch.ByteTensor):
19
+ return img.float()
20
+ else:
21
+ return img
22
+
23
+ class BGR2RGB_transform(object):
24
+ def __call__(self, tensor):
25
+ return tensor[[2,1,0],:,:]
26
+
27
+ def flip_back(output_flipped, matched_parts):
28
+ '''
29
+ output_flipped: numpy.ndarray(batch_size, num_joints, height, width)
30
+ '''
31
+ assert output_flipped.ndim == 4,\
32
+ 'output_flipped should be [batch_size, num_joints, height, width]'
33
+
34
+ output_flipped = output_flipped[:, :, :, ::-1]
35
+
36
+ for pair in matched_parts:
37
+ tmp = output_flipped[:, pair[0], :, :].copy()
38
+ output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
39
+ output_flipped[:, pair[1], :, :] = tmp
40
+
41
+ return output_flipped
42
+
43
+
44
+ def fliplr_joints(joints, joints_vis, width, matched_parts):
45
+ """
46
+ flip coords
47
+ """
48
+ # Flip horizontal
49
+ joints[:, 0] = width - joints[:, 0] - 1
50
+
51
+ # Change left-right parts
52
+ for pair in matched_parts:
53
+ joints[pair[0], :], joints[pair[1], :] = \
54
+ joints[pair[1], :], joints[pair[0], :].copy()
55
+ joints_vis[pair[0], :], joints_vis[pair[1], :] = \
56
+ joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
57
+
58
+ return joints*joints_vis, joints_vis
59
+
60
+
61
+ def transform_preds(coords, center, scale, input_size):
62
+ target_coords = np.zeros(coords.shape)
63
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
64
+ for p in range(coords.shape[0]):
65
+ target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
66
+ return target_coords
67
+
68
+ def transform_parsing(pred, center, scale, width, height, input_size):
69
+
70
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
71
+ target_pred = cv2.warpAffine(
72
+ pred,
73
+ trans,
74
+ (int(width), int(height)), #(int(width), int(height)),
75
+ flags=cv2.INTER_NEAREST,
76
+ borderMode=cv2.BORDER_CONSTANT,
77
+ borderValue=(0))
78
+
79
+ return target_pred
80
+
81
+ def transform_logits(logits, center, scale, width, height, input_size):
82
+
83
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
84
+ channel = logits.shape[2]
85
+ target_logits = []
86
+ for i in range(channel):
87
+ target_logit = cv2.warpAffine(
88
+ logits[:,:,i],
89
+ trans,
90
+ (int(width), int(height)), #(int(width), int(height)),
91
+ flags=cv2.INTER_LINEAR,
92
+ borderMode=cv2.BORDER_CONSTANT,
93
+ borderValue=(0))
94
+ target_logits.append(target_logit)
95
+ target_logits = np.stack(target_logits,axis=2)
96
+
97
+ return target_logits
98
+
99
+
100
+ def get_affine_transform(center,
101
+ scale,
102
+ rot,
103
+ output_size,
104
+ shift=np.array([0, 0], dtype=np.float32),
105
+ inv=0):
106
+ if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
107
+ print(scale)
108
+ scale = np.array([scale, scale])
109
+
110
+ scale_tmp = scale
111
+
112
+ src_w = scale_tmp[0]
113
+ dst_w = output_size[1]
114
+ dst_h = output_size[0]
115
+
116
+ rot_rad = np.pi * rot / 180
117
+ src_dir = get_dir([0, src_w * -0.5], rot_rad)
118
+ dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
119
+
120
+ src = np.zeros((3, 2), dtype=np.float32)
121
+ dst = np.zeros((3, 2), dtype=np.float32)
122
+ src[0, :] = center + scale_tmp * shift
123
+ src[1, :] = center + src_dir + scale_tmp * shift
124
+ dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
125
+ dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir
126
+
127
+ src[2:, :] = get_3rd_point(src[0, :], src[1, :])
128
+ dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
129
+
130
+ if inv:
131
+ trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
132
+ else:
133
+ trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
134
+
135
+ return trans
136
+
137
+
138
+ def affine_transform(pt, t):
139
+ new_pt = np.array([pt[0], pt[1], 1.]).T
140
+ new_pt = np.dot(t, new_pt)
141
+ return new_pt[:2]
142
+
143
+
144
+ def get_3rd_point(a, b):
145
+ direct = a - b
146
+ return b + np.array([-direct[1], direct[0]], dtype=np.float32)
147
+
148
+
149
+ def get_dir(src_point, rot_rad):
150
+ sn, cs = np.sin(rot_rad), np.cos(rot_rad)
151
+
152
+ src_result = [0, 0]
153
+ src_result[0] = src_point[0] * cs - src_point[1] * sn
154
+ src_result[1] = src_point[0] * sn + src_point[1] * cs
155
+
156
+ return src_result
157
+
158
+
159
+ def crop(img, center, scale, output_size, rot=0):
160
+ trans = get_affine_transform(center, scale, rot, output_size)
161
+
162
+ dst_img = cv2.warpAffine(img,
163
+ trans,
164
+ (int(output_size[1]), int(output_size[0])),
165
+ flags=cv2.INTER_LINEAR)
166
+
167
+ return dst_img
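All of the helpers above revolve around get_affine_transform: crop() warps a person box, given as (center, scale), to the fixed network input size, while transform_parsing() and transform_logits() apply the inverse warp to map predictions back onto the original image grid. A round-trip sketch with made-up center/scale values; the import path is an assumption and requires the repository root to be on PYTHONPATH:

import numpy as np
from annotator.segm.transforms import crop, transform_parsing

img = np.zeros((1024, 768, 3), dtype=np.uint8)          # stand-in photo
center = np.array([384.0, 512.0], dtype=np.float32)     # person box centre (x, y)
scale = np.array([768.0, 1024.0], dtype=np.float32)     # person box size (w, h)
input_size = (473, 473)                                  # (height, width) fed to the parser

# original image -> fixed-size network input
patch = crop(img, center, scale, input_size)             # (473, 473, 3)

# pretend the parser produced a label map at network resolution,
# then warp it back onto the original image grid
pred = np.random.randint(0, 20, size=input_size, dtype=np.uint8)
restored = transform_parsing(pred, center, scale, width=768, height=1024,
                             input_size=input_size)
print(patch.shape, restored.shape)                       # (473, 473, 3) (1024, 768)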
annotator/util.py ADDED
@@ -0,0 +1,49 @@
1
+ import numpy as np
2
+ import cv2
3
+ import os
4
+
5
+
6
+ annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
7
+
8
+
9
+ def HWC3(x):
10
+ assert x.dtype == np.uint8
11
+ if x.ndim == 2:
12
+ x = x[:, :, None]
13
+ assert x.ndim == 3
14
+ H, W, C = x.shape
15
+ assert C == 1 or C == 3 or C == 4
16
+ if C == 3:
17
+ return x
18
+ if C == 1:
19
+ return np.concatenate([x, x, x], axis=2)
20
+ if C == 4:
21
+ color = x[:, :, 0:3].astype(np.float32)
22
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
23
+ y = color * alpha + 255.0 * (1.0 - alpha)
24
+ y = y.clip(0, 255).astype(np.uint8)
25
+ return y
26
+
27
+
28
+ def resize_image(input_image, resolution):
29
+ H, W, C = input_image.shape
30
+ H = float(H)
31
+ W = float(W)
32
+ k = float(resolution) / min(H, W)
33
+ H *= k
34
+ W *= k
35
+ H = int(np.round(H / 64.0)) * 64
36
+ W = int(np.round(W / 64.0)) * 64
37
+ img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
38
+ return img
39
+
40
+ def pad_image(img, min_aspect_ratio=0.625):
41
+ H, W, C = img.shape
42
+ if W/H < min_aspect_ratio:
43
+ NEW_W = int(min_aspect_ratio * H)
44
+ width_padding = (NEW_W-W)//2
45
+ black_bg = np.zeros((H, NEW_W, 3), dtype=img.dtype)
46
+ black_bg[:, width_padding:width_padding+W,:] = img
47
+ return black_bg
48
+ else:
49
+ return img
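These utilities are chained before an image reaches the pose detector and segmenter: HWC3 normalises any uint8 input to 3-channel RGB, resize_image scales the short side to the requested resolution and snaps both sides to multiples of 64, and pad_image widens unusually narrow images to a minimum aspect ratio. A small sketch; the import path again assumes the repository root is on PYTHONPATH:

import numpy as np
from annotator.util import HWC3, resize_image, pad_image

# a fake 600x300 grayscale image (H, W), dtype uint8
gray = np.random.randint(0, 255, size=(600, 300), dtype=np.uint8)

rgb = HWC3(gray)                    # (600, 300, 3): grayscale replicated to 3 channels
padded = pad_image(rgb)             # 300/600 = 0.5 < 0.625, so padded to (600, 375, 3)
resized = resize_image(padded, 512) # short side ~512, both sides rounded to multiples of 64

print(rgb.shape, padded.shape, resized.shape)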
app.py ADDED
@@ -0,0 +1,475 @@
1
+ from share import *
2
+ import config
3
+ import os
4
+ import cv2
5
+ import einops
6
+ import gradio as gr
7
+ import numpy as np
8
+ import torch
9
+ import random
10
+ import re
11
+ from datetime import datetime
12
+ from glob import glob
13
+ import argparse
14
+
15
+ from pytorch_lightning import seed_everything
16
+ from torchvision.transforms import ToPILImage
17
+ from annotator.util import pad_image, resize_image, HWC3
18
+ from annotator.openpose import OpenposeDetector
19
+ from cldm.model import create_model, load_state_dict
20
+ from cldm.ddim_hacked import DDIMSampler
21
+ from pathlib import Path
22
+ from PIL import Image
23
+ from omegaconf import OmegaConf
24
+ from ldm.util import instantiate_from_config, log_txt_as_img
25
+ from visconet.segm import ATRSegmentCropper as SegmentCropper
26
+ from huggingface_hub import snapshot_download
27
+
28
+ # supply directory of visual prompt images
29
+ HF_REPO = 'soonyau/visconet'
30
+ GALLERY_PATH = Path('./fashion/')
31
+ WOMEN_GALLERY_PATH = GALLERY_PATH/'WOMEN'
32
+ MEN_GALLERY_PATH = GALLERY_PATH/'MEN'
33
+
34
+ DEMO = True
35
+ LOG_SAMPLES = False
36
+ APP_FILES_PATH = Path('./app_files')
37
+ VISCON_IMAGE_PATH = APP_FILES_PATH/'default_images'
38
+ LOG_PATH = APP_FILES_PATH/'logs'
39
+ SAMPLE_IMAGE_PATH = APP_FILES_PATH/'samples'
40
+
41
+ DEFAULT_CONTROL_SCALE = 1.0
42
+ SCALE_CONFIG = {
43
+ 'Default': [DEFAULT_CONTROL_SCALE]*13,
44
+ 'DeepFakes':[1.0, 1.0, 1.0,
45
+ 1.0, 1.0, 1.0,
46
+ 0.5, 0.5, 0.5,
47
+ 0.0, 0.0, 0.0, 0.0,],
48
+ 'Faithful':[1,1,1,
49
+ 1,1,1,
50
+ 1,1,0.5,
51
+ 0.5,0.5,0,0],
52
+ 'Painting':[0.0,0.0,0.0,
53
+ 0.5,0.5,0.5,
54
+ 0.5,0.5,0.5,
55
+ 0.5,0,0,0],
56
+ 'Pose': [0.0,0.0,0.0,
57
+ 0.0,0.0,0.0,
58
+ 0.0,0.0,0.5,
59
+ 0.0,0.0,0,0],
60
+ 'Texture Transfer': [1.0,1.0,1.0,
61
+ 1.0,1.0,1.0,
62
+ 0.5,0.0,0.5,
63
+ 0.0,0.0,0,0]
64
+ }
65
+ DEFAULT_SCALE_CONFIG = 'Default'
66
+ ignore_style_list = ['headwear', 'accesories', 'shoes']
67
+
68
+ global device
69
+ global segmentor
70
+ global apply_openpose
71
+ global style_encoder
72
+ global model
73
+ global ddim_sampler
74
+
75
+ def convert_fname(long_name):
76
+ gender = 'MEN' if long_name[7:10] == 'MEN' else 'WOMEN'
77
+
78
+ input_list = long_name.replace('fashion','').split('___')
79
+
80
+ # Define a regular expression pattern to match the relevant parts of each input string
81
+ if gender == 'MEN':
82
+ pattern = r'MEN(\w+)id(\d+)_(\d)(\w+)'
83
+ else:
84
+ pattern = r'WOMEN(\w+)id(\d+)_(\d)(\w+)'
85
+ # Use a list comprehension to extract the matching substrings from each input string, and format them into the desired output format
86
+ output_list = [f'{gender}/{category}/id_{id_num[:8]}/{id_num[8:]}_{view_num}_{view_desc}' for (category, id_num, view_num, view_desc) in re.findall(pattern, ' '.join(input_list))]
87
+
88
+ # Return the resulting list of formatted filenames
89
+ return [f +'.jpg' for f in output_list]
90
+
91
+ def fetch_deepfashion(deepfashion_names):
92
+ src_name, dst_name = convert_fname(deepfashion_names)
93
+ input_image = np.array(Image.open(image_root/src_name))
94
+ pose_image = np.array(Image.open(str(pose_root/dst_name)))
95
+ mask_image = Image.open(str(mask_root/dst_name).replace('.jpg','_mask.png'))
96
+
97
+ temp = src_name.replace('.jpg','').split('/')
98
+ lastfolder = temp.pop(-1).replace('_','/', 1)
99
+ style_folder = style_root/('/'.join(temp+[lastfolder]))
100
+ viscon_images = []
101
+ for style_name in style_names:
102
+ f_path = style_folder/f'{style_name}.jpg'
103
+ if os.path.exists(str(f_path)):
104
+ viscon_images.append(np.array(Image.open(f_path)))
105
+ else:
106
+ viscon_images.append(None)
107
+ return [input_image, pose_image, mask_image, *viscon_images]
108
+
109
+ def select_gallery_image(evt: gr.SelectData):
110
+ return evt.target.value[evt.index]['name']
111
+
112
+ def select_default_strength(strength_config):
113
+ return SCALE_CONFIG[strength_config]
114
+
115
+ def change_all_scales(scale):
116
+ return [float(scale)]*13
117
+
118
+ def encode_style_images(style_images):
119
+ style_embeddings = []
120
+
121
+ for style_name, style_image in zip(style_names, style_images):
122
+ if style_image is None:
123
+ style_image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
124
+
125
+ #style_image = style_image.resize((224,224))
126
+ style_image = style_encoder.preprocess(style_image).to(device)
127
+ style_emb = style_encoder.postprocess(style_encoder(style_image)[0])
128
+ style_embeddings.append(style_emb)
129
+
130
+ styles = torch.tensor(np.array(style_embeddings)).squeeze(-2).unsqueeze(0).float().to(device)
131
+ return styles
132
+
133
+ def save_viscon_images(*viscon_images):
134
+ ret_images = []
135
+ for image, name in zip(viscon_images, style_names):
136
+ fname = str(VISCON_IMAGE_PATH/name)+'.jpg'
137
+ if image is not None:
138
+ image = image.resize((224,224))
139
+ if os.path.exists(fname):
140
+ os.remove(fname)
141
+ image.save(fname)
142
+ ret_images.append(image)
143
+ return ret_images
144
+
145
+
146
+ def extract_pose_mask(input_image, detect_resolution,
147
+ ignore_head=True, ignore_hair=False):
148
+ # skeleton
149
+ input_image = pad_image(input_image, min_aspect_ratio=0.625)
150
+ detected_map, _ = apply_openpose(resize_image(input_image, detect_resolution), hand=True)
151
+ detected_map = HWC3(detected_map)
152
+
153
+ # human mask
154
+ cropped = segmentor(input_image, ignore_head=ignore_head, ignore_hair=ignore_hair)
155
+ mask = cropped['human_mask']
156
+ mask = Image.fromarray(np.array(mask*255, dtype=np.uint8), mode='L')
157
+
158
+ return [detected_map, mask]
159
+
160
+ def extract_fashion(input_image):
161
+
162
+ # style images
163
+ cropped = segmentor(input_image)
164
+ cropped_images = []
165
+ for style_name in style_names:
166
+ if style_name in cropped and style_name not in ignore_style_list:
167
+ cropped_images.append(cropped[style_name])
168
+ else:
169
+ cropped_images.append(None)
170
+
171
+ return [*cropped_images]
172
+
173
+ def get_image_files(image_path, ret_image=True, exts=['.jpg','.jpeg','.png']):
174
+ images = []
175
+ for ext in exts:
176
+ images += [x for x in glob(str(Path(image_path)/f'*{ext}'))]
177
+ if ret_image:
178
+ images = [Image.open(x) for x in images]
179
+ return images
180
+
181
+ def log_sample(seed, results, prompt, skeleton_image, mask_image, control_scales, *viscon_images):
182
+ time_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
183
+
184
+ log_dir = LOG_PATH/time_str
185
+ os.makedirs(str(log_dir), exist_ok=True)
186
+
187
+ # save result
188
+ concat = np.hstack((skeleton_image, *results))
189
+ Image.fromarray(skeleton_image).save(str(log_dir/'skeleton.jpg'))
190
+ Image.fromarray(mask_image).save(str(log_dir/'mask.png'))
191
+ for i, result in enumerate(results):
192
+ Image.fromarray(result).save(str(log_dir/f'result_{i}.jpg'))
193
+
194
+ # save text
195
+ with open(str(log_dir/'info.txt'),'w') as f:
196
+ f.write(f'prompt: {prompt} \n')
197
+ f.write(f'seed: {seed}\n')
198
+ control_str = [str(x) for x in control_scales]
199
+ f.write(','.join(control_str) + '\n')
200
+ # save vison images
201
+ for style_name, style_image in zip(style_names, viscon_images):
202
+ if style_image is not None:
203
+ style_image.save(str(log_dir/f'{style_name}.jpg'))
204
+
205
+
206
+ def process(prompt, a_prompt, n_prompt, num_samples,
207
+ ddim_steps, scale, seed, eta, mask_image, pose_image,
208
+ c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0,
209
+ *viscon_images):
210
+
211
+ with torch.no_grad():
212
+ control_scales = [c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0]
213
+ mask = torch.tensor(mask_image.mean(-1)/255.,dtype=torch.float) #(512,512), [0,1]
214
+ mask = mask.unsqueeze(0).to(device) # (1, 512, 512)
215
+ style_emb = encode_style_images(viscon_images)
216
+
217
+ # fix me
218
+ detected_map = HWC3(pose_image)
219
+ #detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
220
+ H, W, C = detected_map.shape
221
+ control = torch.from_numpy(detected_map.copy()).float().to(device) / 255.0
222
+ control = torch.stack([control for _ in range(num_samples)], dim=0)
223
+ control = einops.rearrange(control, 'b h w c -> b c h w').clone()
224
+
225
+ if seed == -1:
226
+ seed = random.randint(0, 65535)
227
+ seed_everything(seed)
228
+
229
+ if config.save_memory:
230
+ model.low_vram_shift(is_diffusing=False)
231
+ new_style_shape = [num_samples] + [1] * (len(style_emb.shape)-1)
232
+
233
+ cond = {"c_concat": [control],
234
+ "c_crossattn": [style_emb.repeat(new_style_shape)],
235
+ "c_text": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)],
236
+ 'c_concat_mask': [mask.repeat(num_samples, 1, 1, 1)]}
237
+
238
+ un_cond = {"c_concat": [control],
239
+ "c_crossattn": [torch.zeros_like(style_emb).repeat(new_style_shape)],
240
+ "c_text":[model.get_learned_conditioning([n_prompt] * num_samples)],
241
+ 'c_concat_mask': [torch.zeros_like(mask).repeat(num_samples, 1, 1, 1)]}
242
+
243
+ shape = (4, H // 8, W // 8)
244
+
245
+ if config.save_memory:
246
+ model.low_vram_shift(is_diffusing=True)
247
+
248
+ model.control_scales = control_scales
249
+
250
+ samples, _ = ddim_sampler.sample(ddim_steps, num_samples,
251
+ shape, cond, verbose=False, eta=eta,
252
+ unconditional_guidance_scale=scale,
253
+ unconditional_conditioning=un_cond)
254
+
255
+ if config.save_memory:
256
+ model.low_vram_shift(is_diffusing=False)
257
+
258
+ x_samples = model.decode_first_stage(samples)
259
+ x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
260
+
261
+ results = [x_samples[i] for i in range(num_samples)]
262
+
263
+ if LOG_SAMPLES:
264
+ log_sample(seed, results, prompt, detected_map, mask_image, control_scales, *viscon_images)
265
+ return results
266
+
267
+ def get_image(name, file_ext='.jpg'):
268
+ fname = str(VISCON_IMAGE_PATH/name)+file_ext
269
+ if not os.path.exists(fname):
270
+ return None
271
+ return Image.open(fname)
272
+
273
+ def get_image_numpy(name, file_ext='.png'):
274
+ fname = str(VISCON_IMAGE_PATH/name)+file_ext
275
+ if not os.path.exists(fname):
276
+ return None
277
+ return np.array(Image.open(fname))
278
+
279
+ def create_app():
280
+ block = gr.Blocks().queue()
281
+ with block:
282
+ with gr.Row():
283
+ gr.Markdown("## ViscoNet: Visual ControlNet with Human Pose and Fashion <br> [Video tutorial](https://youtu.be/85NyIuLeV00)")
284
+ with gr.Row():
285
+ with gr.Column():
286
+ with gr.Accordion("Get pose and mask", open=False):
287
+ with gr.Row():
288
+ input_image = gr.Image(source='upload', type="numpy", label='input image', value=np.array(get_image_numpy('ref')))
289
+ pose_image = gr.Image(source='upload', type="numpy", label='pose', value=np.array(get_image_numpy('pose')))
290
+ mask_image = gr.Image(source='upload', type="numpy", label='mask', value=np.array(get_image_numpy('mask')))
291
+ with gr.Accordion("Samples", open=False):
292
+ with gr.Tab('Female'):
293
+ samples = get_image_files(str(SAMPLE_IMAGE_PATH/'pose/WOMEN/'))
294
+ female_pose_gallery = gr.Gallery(label='pose', show_label=False, value=samples).style(grid=3, height='auto')
295
+ with gr.Tab('Male'):
296
+ samples = get_image_files(str(SAMPLE_IMAGE_PATH/'pose/MEN/'))
297
+ male_pose_gallery = gr.Gallery(label='pose', show_label=False, value=samples).style(grid=3, height='auto')
298
+ with gr.Row():
299
+ #pad_checkbox = gr.Checkbox(label='Pad pose to square', value=True)
300
+ ignorehead_checkbox = gr.Checkbox(label='Ignore face in masking (for DeepFake)', value=True)
301
+ ignorehair_checkbox = gr.Checkbox(label='Ignore hair in masking', value=False, visible=True)
302
+ with gr.Row():
303
+ #ignore_head_checkbox = gr.Checkbox(label='Ignore head', value=False)
304
+ get_pose_button = gr.Button(label="Get pose", value='Get pose')
305
+ get_fashion_button = gr.Button(label="Get visual", value='Get visual prompt')
306
+
307
+
308
+ with gr.Accordion("Visual Conditions", open=False):
309
+ gr.Markdown('Drag and drop images, or pick from the sample galleries below.')
310
+ with gr.Column():
311
+ viscon_images = []
312
+ viscon_images_names2index = {}
313
+ viscon_len = len(style_names)
314
+ v_idx = 0
315
+
316
+ with gr.Row():
317
+ for _ in range(8):
318
+ viscon_name = style_names[v_idx]
319
+ vis = False if viscon_name in ignore_style_list else True
320
+ viscon_images.append(gr.Image(source='upload', type="pil", min_height=112, min_width=112, label=viscon_name, value=get_image(viscon_name), visible=vis))
321
+ viscon_images_names2index[viscon_name] = v_idx
322
+ v_idx += 1
323
+
324
+ viscon_button = gr.Button(value='Save as Default',visible=False if DEMO else True)
325
+
326
+ viscon_galleries = []
327
+
328
+ with gr.Column():
329
+ with gr.Accordion("Female", open=False):
330
+ for garment, number in zip(['hair', 'top', 'bottom', 'outer'], [150, 500, 500, 250]):
331
+ with gr.Tab(garment):
332
+ samples = []
333
+ if WOMEN_GALLERY_PATH and os.path.exists(WOMEN_GALLERY_PATH):
334
+ samples = glob(os.path.join(WOMEN_GALLERY_PATH, f'**/{garment}.jpg'), recursive=True)
335
+ #samples = glob(f'/home/soon/datasets/deepfashion_inshop/styles_default/WOMEN/**/{garment}.jpg', recursive=True)
336
+ samples = random.choices(samples, k=number)
337
+ viscon_gallery = gr.Gallery(label=garment, allow_preview=False, show_label=False, value=samples).style(grid=4, height='auto')
338
+ viscon_galleries.append({'component':viscon_gallery, 'inputs':[garment]})
339
+ with gr.Accordion("Male", open=False):
340
+ for garment, number in zip(['hair', 'top', 'bottom', 'outer'], [150, 500, 500, 250]):
341
+ with gr.Tab(garment):
342
+ samples = []
343
+ if MEN_GALLERY_PATH and os.path.exists(MEN_GALLERY_PATH):
344
+ samples = glob(os.path.join(MEN_GALLERY_PATH, f'**/{garment}.jpg'), recursive=True)
345
+ samples = random.choices(samples, k=number)
346
+ viscon_gallery = gr.Gallery(label=garment, allow_preview=False, show_label=False, value=samples).style(grid=4, height='auto')
347
+ viscon_galleries.append({'component':viscon_gallery, 'inputs':[garment]})
348
+
349
+ with gr.Accordion("Control Strength Scaling", open=False):
350
+ gr.Markdown("smaller value for stronger textual influence. c12 is highest spatial resolution controlling textures")
351
+ with gr.Row():
352
+ strength_select = gr.Dropdown(list(SCALE_CONFIG.keys()), label='strength settings', value=DEFAULT_SCALE_CONFIG)
353
+ scale_all = gr.Slider(label=f'set all scales', minimum=0, maximum=1, value=DEFAULT_CONTROL_SCALE, step=0.05)
354
+ scale_values = SCALE_CONFIG[DEFAULT_SCALE_CONFIG]
355
+ control_scales = []
356
+ c_idx = 12
357
+ with gr.Accordion("Advanced settings", open=False):
358
+ with gr.Row():
359
+ for _ in range(3):
360
+ control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
361
+ c_idx -= 1
362
+ with gr.Row():
363
+ for _ in range(3):
364
+ control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
365
+ c_idx -= 1
366
+ with gr.Row():
367
+ for _ in range(3):
368
+ control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
369
+ c_idx -= 1
370
+ with gr.Row():
371
+ for _ in range(4):
372
+ control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
373
+ c_idx -= 1
374
+
375
+ with gr.Accordion("Advanced options", open=False):
376
+ with gr.Row():
377
+ detect_resolution = gr.Slider(label="OpenPose Resolution", minimum=128, maximum=512, value=512, step=1)
378
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=50, value=20, step=1)
379
+ scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=12.0, step=0.1)
380
+
381
+ eta = gr.Number(label="eta (DDIM)", value=0.0, visible=False)
382
+ a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
383
+ n_prompt = gr.Textbox(label="Negative Prompt",
384
+ value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, sunglasses, hat')
385
+ with gr.Column():
386
+ result_gallery = gr.Gallery(label='Output', show_label=False, show_download_button=True, elem_id="gallery").style(grid=1, height='auto')
387
+ with gr.Row():
388
+ max_samples = 8 if not DEMO else 4
389
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=max_samples, value=1, step=1)
390
+ seed = gr.Slider(label="Seed (-1 for random)", minimum=-1, maximum=2147483647, step=1, value=1561194236)#randomize=True) #value=1561194234)
391
+ if not DEMO:
392
+ DF_DEMO = 'fashionWOMENTees_Tanksid0000762403_1front___fashionWOMENTees_Tanksid0000762403_1front'
393
+ DF_EVAL = 'fashionWOMENBlouses_Shirtsid0000035501_1front___fashionWOMENBlouses_Shirtsid0000035501_1front'
394
+ DF_RESULT ="fashionWOMENTees_Tanksid0000796209_1front___fashionWOMENTees_Tanksid0000796209_2side"
395
+ deepfashion_names = gr.Textbox(label='Deepfashion name', value=DF_EVAL)
396
+ gr.Markdown("Default config reconstruct image faithful to pose, mask and visual condition. Reduce control strength to tip balance towards text prompt for more creativity.")
397
+ prompt = gr.Textbox(label="Text Prompt", value="")
398
+
399
+ run_button = gr.Button(label="Run")
400
+
401
+
402
+ female_pose_gallery.select(fn=select_gallery_image, inputs=None, outputs=input_image)
403
+ male_pose_gallery.select(fn=select_gallery_image, inputs=None, outputs=input_image)
404
+ for vision_gallery in viscon_galleries:
405
+ viscon_idx = viscon_images_names2index[vision_gallery['inputs'][0]]
406
+ vision_gallery['component'].select(fn=select_gallery_image, inputs=None,
407
+ outputs=viscon_images[viscon_idx])
408
+ ips = [prompt, a_prompt, n_prompt, num_samples, ddim_steps, scale, seed, eta, mask_image, pose_image,
409
+ *control_scales, *viscon_images]
410
+ run_button.click(fn=process, inputs=ips, outputs=[result_gallery])
411
+ prompt.submit(fn=process, inputs=ips, outputs=[result_gallery])
412
+ get_pose_button.click(fn=extract_pose_mask, inputs=[input_image, detect_resolution,
413
+ ignorehead_checkbox, ignorehair_checkbox],
414
+ outputs=[pose_image, mask_image])
415
+ get_fashion_button.click(fn=extract_fashion, inputs=input_image, outputs=[*viscon_images])
416
+ viscon_button.click(fn=save_viscon_images, inputs=[*viscon_images], outputs=[*viscon_images])
417
+ strength_select.select(fn=select_default_strength, inputs=[strength_select], outputs=[*control_scales])
418
+ scale_all.release(fn=change_all_scales, inputs=[scale_all], outputs=[*control_scales])
419
+ if not DEMO:
420
+ deepfashion_names.submit(fn=fetch_deepfashion, inputs=[deepfashion_names], outputs=[input_image, pose_image, mask_image, *viscon_images])
421
+ return block
422
+
423
+ if __name__ == "__main__":
424
+ parser = argparse.ArgumentParser(description='ViscoNet Gradio demo.')
425
+
426
+ parser.add_argument('--gpu', type=int, default=0, help='GPU id')
427
+ parser.add_argument('--config', type=str, default='./configs/visconet_v1.yaml')
428
+ parser.add_argument('--ckpt', type=str, default='./models/visconet_v1.pth')
429
+ parser.add_argument('--public_link', action='store_true', default=False, help='Create a public Gradio share link')
430
+ args = parser.parse_args()
431
+
432
+ global device
433
+ global segmentor
434
+ global apply_openpose
435
+ global style_encoder
436
+ global model
437
+ global ddim_sampler
438
+
439
+ device = f'cuda:{args.gpu}' if torch.cuda.is_available() else 'cpu'
440
+ config_file = args.config
441
+ model_ckpt = args.ckpt
442
+
443
+ proj_config = OmegaConf.load(config_file)
444
+ style_names = proj_config.dataset.train.params.style_names
445
+ data_root = Path(proj_config.dataset.train.params.image_root)
446
+ image_root = data_root/proj_config.dataset.train.params.image_dir
447
+ style_root = data_root/proj_config.dataset.train.params.style_dir
448
+ pose_root = data_root/proj_config.dataset.train.params.pose_dir
449
+ mask_root = data_root/proj_config.dataset.train.params.mask_dir
450
+
451
+ segmentor = SegmentCropper()
452
+ apply_openpose = OpenposeDetector()
453
+
454
+ snapshot_download(repo_id=HF_REPO, local_dir='./models',
455
+ allow_patterns=os.path.basename(model_ckpt))
456
+
457
+ style_encoder = instantiate_from_config(proj_config.model.style_embedding_config).to(device)
458
+ model = create_model(config_file).cpu()
459
+ model.load_state_dict(load_state_dict(model_ckpt, location=device))
460
+
461
+ model = model.to(device)
462
+ model.cond_stage_model.device = device
463
+ ddim_sampler = DDIMSampler(model)
464
+
465
+ if not GALLERY_PATH.exists():
466
+ zip_name = 'fashion.zip'
467
+ snapshot_download(repo_id=HF_REPO, allow_patterns=zip_name, local_dir='.')
468
+ from zipfile import ZipFile
469
+ with ZipFile(zip_name, 'r') as zip_ref:
470
+ zip_ref.extractall('.')
471
+ os.remove(zip_name)
472
+
473
+ # Build the Gradio app and launch it
474
+ block = create_app()
475
+ block.launch(server_name='0.0.0.0', share=args.public_link)
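In non-demo mode the DeepFashion textbox expects the dataset's concatenated source___target naming, which convert_fname() splits and rebuilds into relative image paths. The sketch below restates that mapping as a standalone function for illustration (it is not the app's own helper) and works through the DF_EVAL default above:

import re

def deepfashion_paths(long_name):
    """Standalone restatement of convert_fname() for illustration only."""
    gender = 'MEN' if long_name[7:10] == 'MEN' else 'WOMEN'
    parts = long_name.replace('fashion', '').split('___')
    pattern = rf'{gender}(\w+)id(\d+)_(\d)(\w+)'
    matches = re.findall(pattern, ' '.join(parts))
    return [f'{gender}/{cat}/id_{idn[:8]}/{idn[8:]}_{view}_{desc}.jpg'
            for cat, idn, view, desc in matches]

name = ('fashionWOMENBlouses_Shirtsid0000035501_1front___'
        'fashionWOMENBlouses_Shirtsid0000035501_1front')
print(deepfashion_paths(name))
# ['WOMEN/Blouses_Shirts/id_00000355/01_1_front.jpg',
#  'WOMEN/Blouses_Shirts/id_00000355/01_1_front.jpg']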
app_files/default_images/mask.png ADDED
app_files/default_images/pose.png ADDED
app_files/default_images/ref.png ADDED
app_files/samples/pose/MEN/full_1.png ADDED
app_files/samples/pose/MEN/full_2.png ADDED
app_files/samples/pose/MEN/half_back.png ADDED
app_files/samples/pose/MEN/half_front.png ADDED
app_files/samples/pose/MEN/half_left.png ADDED
app_files/samples/pose/WOMEN/pose_0.png ADDED
app_files/samples/pose/WOMEN/pose_1.png ADDED
app_files/samples/pose/WOMEN/pose_2.png ADDED
app_files/samples/pose/WOMEN/pose_3.png ADDED
app_files/samples/pose/WOMEN/pose_4.png ADDED
app_files/samples/pose/WOMEN/pose_5.png ADDED
app_files/samples/pose/WOMEN/pose_6.png ADDED