soonyau committed
Commit 42b0b31 (1 parent: 5fed475)

first commit

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. LICENSE +427 -0
  2. README.md +38 -0
  3. annotator/openpose/__init__.py +73 -0
  4. annotator/openpose/body.py +219 -0
  5. annotator/openpose/hand.py +86 -0
  6. annotator/openpose/model.py +219 -0
  7. annotator/openpose/util.py +163 -0
  8. annotator/segm/__init__.py +162 -0
  9. annotator/segm/modules/__init__.py +5 -0
  10. annotator/segm/modules/bn.py +132 -0
  11. annotator/segm/modules/deeplab.py +84 -0
  12. annotator/segm/modules/dense.py +42 -0
  13. annotator/segm/modules/functions.py +244 -0
  14. annotator/segm/modules/misc.py +21 -0
  15. annotator/segm/modules/residual.py +182 -0
  16. annotator/segm/modules/src/checks.h +15 -0
  17. annotator/segm/modules/src/inplace_abn.cpp +95 -0
  18. annotator/segm/modules/src/inplace_abn.h +88 -0
  19. annotator/segm/modules/src/inplace_abn_cpu.cpp +119 -0
  20. annotator/segm/modules/src/inplace_abn_cuda.cu +333 -0
  21. annotator/segm/modules/src/inplace_abn_cuda_half.cu +275 -0
  22. annotator/segm/modules/src/utils/checks.h +15 -0
  23. annotator/segm/modules/src/utils/common.h +49 -0
  24. annotator/segm/modules/src/utils/cuda.cuh +71 -0
  25. annotator/segm/networks/AugmentCE2P.py +337 -0
  26. annotator/segm/networks/__init__.py +13 -0
  27. annotator/segm/networks/backbone/mobilenetv2.py +156 -0
  28. annotator/segm/networks/backbone/resnet.py +205 -0
  29. annotator/segm/networks/backbone/resnext.py +149 -0
  30. annotator/segm/networks/context_encoding/aspp.py +64 -0
  31. annotator/segm/networks/context_encoding/ocnet.py +226 -0
  32. annotator/segm/networks/context_encoding/psp.py +48 -0
  33. annotator/segm/transforms.py +167 -0
  34. annotator/util.py +49 -0
  35. app.py +475 -0
  36. app_files/default_images/mask.png +0 -0
  37. app_files/default_images/pose.png +0 -0
  38. app_files/default_images/ref.png +0 -0
  39. app_files/samples/pose/MEN/full_1.png +0 -0
  40. app_files/samples/pose/MEN/full_2.png +0 -0
  41. app_files/samples/pose/MEN/half_back.png +0 -0
  42. app_files/samples/pose/MEN/half_front.png +0 -0
  43. app_files/samples/pose/MEN/half_left.png +0 -0
  44. app_files/samples/pose/WOMEN/pose_0.png +0 -0
  45. app_files/samples/pose/WOMEN/pose_1.png +0 -0
  46. app_files/samples/pose/WOMEN/pose_2.png +0 -0
  47. app_files/samples/pose/WOMEN/pose_3.png +0 -0
  48. app_files/samples/pose/WOMEN/pose_4.png +0 -0
  49. app_files/samples/pose/WOMEN/pose_5.png +0 -0
  50. app_files/samples/pose/WOMEN/pose_6.png +0 -0
LICENSE ADDED
@@ -0,0 +1,427 @@
1
+ CC BY-SA 4.0 DEED
2
+ Attribution-ShareAlike 4.0 International
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-ShareAlike 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-ShareAlike 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. Share means to provide material to the public by any means or
126
+ process that requires permission under the Licensed Rights, such
127
+ as reproduction, public display, public performance, distribution,
128
+ dissemination, communication, or importation, and to make material
129
+ available to the public including in ways that members of the
130
+ public may access the material from a place and at a time
131
+ individually chosen by them.
132
+
133
+ l. Sui Generis Database Rights means rights other than copyright
134
+ resulting from Directive 96/9/EC of the European Parliament and of
135
+ the Council of 11 March 1996 on the legal protection of databases,
136
+ as amended and/or succeeded, as well as other essentially
137
+ equivalent rights anywhere in the world.
138
+
139
+ m. You means the individual or entity exercising the Licensed Rights
140
+ under this Public License. Your has a corresponding meaning.
141
+
142
+
143
+ Section 2 -- Scope.
144
+
145
+ a. License grant.
146
+
147
+ 1. Subject to the terms and conditions of this Public License,
148
+ the Licensor hereby grants You a worldwide, royalty-free,
149
+ non-sublicensable, non-exclusive, irrevocable license to
150
+ exercise the Licensed Rights in the Licensed Material to:
151
+
152
+ a. reproduce and Share the Licensed Material, in whole or
153
+ in part; and
154
+
155
+ b. produce, reproduce, and Share Adapted Material.
156
+
157
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
158
+ Exceptions and Limitations apply to Your use, this Public
159
+ License does not apply, and You do not need to comply with
160
+ its terms and conditions.
161
+
162
+ 3. Term. The term of this Public License is specified in Section
163
+ 6(a).
164
+
165
+ 4. Media and formats; technical modifications allowed. The
166
+ Licensor authorizes You to exercise the Licensed Rights in
167
+ all media and formats whether now known or hereafter created,
168
+ and to make technical modifications necessary to do so. The
169
+ Licensor waives and/or agrees not to assert any right or
170
+ authority to forbid You from making technical modifications
171
+ necessary to exercise the Licensed Rights, including
172
+ technical modifications necessary to circumvent Effective
173
+ Technological Measures. For purposes of this Public License,
174
+ simply making modifications authorized by this Section 2(a)
175
+ (4) never produces Adapted Material.
176
+
177
+ 5. Downstream recipients.
178
+
179
+ a. Offer from the Licensor -- Licensed Material. Every
180
+ recipient of the Licensed Material automatically
181
+ receives an offer from the Licensor to exercise the
182
+ Licensed Rights under the terms and conditions of this
183
+ Public License.
184
+
185
+ b. Additional offer from the Licensor -- Adapted Material.
186
+ Every recipient of Adapted Material from You
187
+ automatically receives an offer from the Licensor to
188
+ exercise the Licensed Rights in the Adapted Material
189
+ under the conditions of the Adapter's License You apply.
190
+
191
+ c. No downstream restrictions. You may not offer or impose
192
+ any additional or different terms or conditions on, or
193
+ apply any Effective Technological Measures to, the
194
+ Licensed Material if doing so restricts exercise of the
195
+ Licensed Rights by any recipient of the Licensed
196
+ Material.
197
+
198
+ 6. No endorsement. Nothing in this Public License constitutes or
199
+ may be construed as permission to assert or imply that You
200
+ are, or that Your use of the Licensed Material is, connected
201
+ with, or sponsored, endorsed, or granted official status by,
202
+ the Licensor or others designated to receive attribution as
203
+ provided in Section 3(a)(1)(A)(i).
204
+
205
+ b. Other rights.
206
+
207
+ 1. Moral rights, such as the right of integrity, are not
208
+ licensed under this Public License, nor are publicity,
209
+ privacy, and/or other similar personality rights; however, to
210
+ the extent possible, the Licensor waives and/or agrees not to
211
+ assert any such rights held by the Licensor to the limited
212
+ extent necessary to allow You to exercise the Licensed
213
+ Rights, but not otherwise.
214
+
215
+ 2. Patent and trademark rights are not licensed under this
216
+ Public License.
217
+
218
+ 3. To the extent possible, the Licensor waives any right to
219
+ collect royalties from You for the exercise of the Licensed
220
+ Rights, whether directly or through a collecting society
221
+ under any voluntary or waivable statutory or compulsory
222
+ licensing scheme. In all other cases the Licensor expressly
223
+ reserves any right to collect such royalties.
224
+
225
+
226
+ Section 3 -- License Conditions.
227
+
228
+ Your exercise of the Licensed Rights is expressly made subject to the
229
+ following conditions.
230
+
231
+ a. Attribution.
232
+
233
+ 1. If You Share the Licensed Material (including in modified
234
+ form), You must:
235
+
236
+ a. retain the following if it is supplied by the Licensor
237
+ with the Licensed Material:
238
+
239
+ i. identification of the creator(s) of the Licensed
240
+ Material and any others designated to receive
241
+ attribution, in any reasonable manner requested by
242
+ the Licensor (including by pseudonym if
243
+ designated);
244
+
245
+ ii. a copyright notice;
246
+
247
+ iii. a notice that refers to this Public License;
248
+
249
+ iv. a notice that refers to the disclaimer of
250
+ warranties;
251
+
252
+ v. a URI or hyperlink to the Licensed Material to the
253
+ extent reasonably practicable;
254
+
255
+ b. indicate if You modified the Licensed Material and
256
+ retain an indication of any previous modifications; and
257
+
258
+ c. indicate the Licensed Material is licensed under this
259
+ Public License, and include the text of, or the URI or
260
+ hyperlink to, this Public License.
261
+
262
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
263
+ reasonable manner based on the medium, means, and context in
264
+ which You Share the Licensed Material. For example, it may be
265
+ reasonable to satisfy the conditions by providing a URI or
266
+ hyperlink to a resource that includes the required
267
+ information.
268
+
269
+ 3. If requested by the Licensor, You must remove any of the
270
+ information required by Section 3(a)(1)(A) to the extent
271
+ reasonably practicable.
272
+
273
+ b. ShareAlike.
274
+
275
+ In addition to the conditions in Section 3(a), if You Share
276
+ Adapted Material You produce, the following conditions also apply.
277
+
278
+ 1. The Adapter's License You apply must be a Creative Commons
279
+ license with the same License Elements, this version or
280
+ later, or a BY-SA Compatible License.
281
+
282
+ 2. You must include the text of, or the URI or hyperlink to, the
283
+ Adapter's License You apply. You may satisfy this condition
284
+ in any reasonable manner based on the medium, means, and
285
+ context in which You Share Adapted Material.
286
+
287
+ 3. You may not offer or impose any additional or different terms
288
+ or conditions on, or apply any Effective Technological
289
+ Measures to, Adapted Material that restrict exercise of the
290
+ rights granted under the Adapter's License You apply.
291
+
292
+
293
+ Section 4 -- Sui Generis Database Rights.
294
+
295
+ Where the Licensed Rights include Sui Generis Database Rights that
296
+ apply to Your use of the Licensed Material:
297
+
298
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
299
+ to extract, reuse, reproduce, and Share all or a substantial
300
+ portion of the contents of the database;
301
+
302
+ b. if You include all or a substantial portion of the database
303
+ contents in a database in which You have Sui Generis Database
304
+ Rights, then the database in which You have Sui Generis Database
305
+ Rights (but not its individual contents) is Adapted Material,
306
+ including for purposes of Section 3(b); and
307
+
308
+ c. You must comply with the conditions in Section 3(a) if You Share
309
+ all or a substantial portion of the contents of the database.
310
+
311
+ For the avoidance of doubt, this Section 4 supplements and does not
312
+ replace Your obligations under this Public License where the Licensed
313
+ Rights include other Copyright and Similar Rights.
314
+
315
+
316
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
317
+
318
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
319
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
320
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
321
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
322
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
323
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
324
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
325
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
326
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
327
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
328
+
329
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
330
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
331
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
332
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
333
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
334
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
335
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
336
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
337
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
338
+
339
+ c. The disclaimer of warranties and limitation of liability provided
340
+ above shall be interpreted in a manner that, to the extent
341
+ possible, most closely approximates an absolute disclaimer and
342
+ waiver of all liability.
343
+
344
+
345
+ Section 6 -- Term and Termination.
346
+
347
+ a. This Public License applies for the term of the Copyright and
348
+ Similar Rights licensed here. However, if You fail to comply with
349
+ this Public License, then Your rights under this Public License
350
+ terminate automatically.
351
+
352
+ b. Where Your right to use the Licensed Material has terminated under
353
+ Section 6(a), it reinstates:
354
+
355
+ 1. automatically as of the date the violation is cured, provided
356
+ it is cured within 30 days of Your discovery of the
357
+ violation; or
358
+
359
+ 2. upon express reinstatement by the Licensor.
360
+
361
+ For the avoidance of doubt, this Section 6(b) does not affect any
362
+ right the Licensor may have to seek remedies for Your violations
363
+ of this Public License.
364
+
365
+ c. For the avoidance of doubt, the Licensor may also offer the
366
+ Licensed Material under separate terms or conditions or stop
367
+ distributing the Licensed Material at any time; however, doing so
368
+ will not terminate this Public License.
369
+
370
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
371
+ License.
372
+
373
+
374
+ Section 7 -- Other Terms and Conditions.
375
+
376
+ a. The Licensor shall not be bound by any additional or different
377
+ terms or conditions communicated by You unless expressly agreed.
378
+
379
+ b. Any arrangements, understandings, or agreements regarding the
380
+ Licensed Material not stated herein are separate from and
381
+ independent of the terms and conditions of this Public License.
382
+
383
+
384
+ Section 8 -- Interpretation.
385
+
386
+ a. For the avoidance of doubt, this Public License does not, and
387
+ shall not be interpreted to, reduce, limit, restrict, or impose
388
+ conditions on any use of the Licensed Material that could lawfully
389
+ be made without permission under this Public License.
390
+
391
+ b. To the extent possible, if any provision of this Public License is
392
+ deemed unenforceable, it shall be automatically reformed to the
393
+ minimum extent necessary to make it enforceable. If the provision
394
+ cannot be reformed, it shall be severed from this Public License
395
+ without affecting the enforceability of the remaining terms and
396
+ conditions.
397
+
398
+ c. No term or condition of this Public License will be waived and no
399
+ failure to comply consented to unless expressly agreed to by the
400
+ Licensor.
401
+
402
+ d. Nothing in this Public License constitutes or may be interpreted
403
+ as a limitation upon, or waiver of, any privileges and immunities
404
+ that apply to the Licensor or You, including from the legal
405
+ processes of any jurisdiction or authority.
406
+
407
+
408
+ =======================================================================
409
+
410
+ Creative Commons is not a party to its public
411
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
412
+ its public licenses to material it publishes and in those instances
413
+ will be considered the “Licensor.” The text of the Creative Commons
414
+ public licenses is dedicated to the public domain under the CC0 Public
415
+ Domain Dedication. Except for the limited purpose of indicating that
416
+ material is shared under a Creative Commons public license or as
417
+ otherwise permitted by the Creative Commons policies published at
418
+ creativecommons.org/policies, Creative Commons does not authorize the
419
+ use of the trademark "Creative Commons" or any other trademark or logo
420
+ of Creative Commons without its prior written consent including,
421
+ without limitation, in connection with any unauthorized modifications
422
+ to any of its public licenses or any other arrangements,
423
+ understandings, or agreements concerning use of licensed material. For
424
+ the avoidance of doubt, this paragraph does not form part of the
425
+ public licenses.
426
+
427
+ Creative Commons may be contacted at creativecommons.org.
README.md ADDED
@@ -0,0 +1,38 @@
1
+ ## *ViscoNet*: Bridging and Harmonizing Visual and Textual Conditioning for ControlNet
2
+ [Soon Yau Cheong](https://scholar.google.com/citations?user=dRot7GUAAAAJ&hl=en)
3
+ [Armin Mustafa](https://scholar.google.com/citations?user=0xOHqkMAAAAJ&hl=en)
4
+ [Andrew Gilbert](https://scholar.google.com/citations?user=NNhnVwoAAAAJ&hl=en)
5
+
6
+
7
+ <a href='https://soon-yau.github.io/visconet/'><img src='https://img.shields.io/badge/Project-Page-Green'></a>
8
+ <a href='https://arxiv.org/abs/2312.03154'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
9
+ [![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://youtu.be/3_6Zq3hk86Q)
10
+
11
+ https://github.com/soon-yau/visconet/assets/19167278/ae58b7ab-fa76-4253-8a10-46656f234b20
12
+
13
+ ### Requirements
14
+ A suitable [conda](https://conda.io/) environment named `control` can be created
15
+ and activated with:
16
+ ```
17
+ conda env create -f environment.yaml
18
+ conda activate control
19
+ ```
20
+ ### Files
21
+ All model and data files are available [here](https://huggingface.co/soonyau/visconet/tree/main),
22
+ including eval.zip, which contains all the images used in the human evaluation.
23
+
24
+ ### Gradio App
25
+ [![App](./assets/app.png)](https://youtu.be/3_6Zq3hk86Q)
26
+ 1. Download *visconet_v1.pth* and *exp-schp-201908301523-atr.pth* into the `./models` directory.
27
+ 2. (Optional) Download fashion.zip and unzip it to the home directory.
28
+ 3. Run `python gradio_visconet.py`.
29
+
30
+ ### Citation
31
+ ```
32
+ @article{cheong2023visconet,
33
+ author = {Cheong, Soon Yau and Mustafa, Armin and Gilbert, Andrew},
34
+ title = {ViscoNet: Bridging and Harmonizing Visual and Textual Conditioning for ControlNet},
35
+ journal = {Arxiv Preprint 2312.03154},
36
+ month = {December},
37
+ year = {2023}}
38
+ ```
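The checkpoint download in step 1 of the Gradio App section can also be scripted. A minimal sketch (not part of this commit), assuming the files live in the `soonyau/visconet` repository linked under Files and that `huggingface_hub` (already imported by `annotator/segm/__init__.py`) is installed:

```
from huggingface_hub import hf_hub_download

# Fetch the two checkpoints named in step 1 into ./models (repo id assumed from the Files link).
for fname in ["visconet_v1.pth", "exp-schp-201908301523-atr.pth"]:
    hf_hub_download(repo_id="soonyau/visconet", filename=fname, local_dir="./models")
```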
annotator/openpose/__init__.py ADDED
@@ -0,0 +1,73 @@
1
+ # Openpose
2
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
3
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
4
+ # 3rd Edited by ControlNet
5
+
6
+ import os
7
+ os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
8
+
9
+ import torch
10
+ import numpy as np
11
+ from . import util
12
+ from .body import Body
13
+ from .hand import Hand
14
+ from annotator.util import annotator_ckpts_path
15
+
16
+
17
+ body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth"
18
+ hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth"
19
+
20
+
21
+ class OpenposeDetector:
22
+ def __init__(self):
23
+ body_modelpath = os.path.join(annotator_ckpts_path, "body_pose_model.pth")
24
+ hand_modelpath = os.path.join(annotator_ckpts_path, "hand_pose_model.pth")
25
+
26
+ if not os.path.exists(hand_modelpath):
27
+ from basicsr.utils.download_util import load_file_from_url
28
+ load_file_from_url(body_model_path, model_dir=annotator_ckpts_path)
29
+ load_file_from_url(hand_model_path, model_dir=annotator_ckpts_path)
30
+
31
+ self.body_estimation = Body(body_modelpath)
32
+ self.hand_estimation = Hand(hand_modelpath)
33
+
34
+ def __call__(self, oriImg, hand=False):
35
+ oriImg = oriImg[:, :, ::-1].copy()
36
+ with torch.no_grad():
37
+ candidate, subset = self.body_estimation(oriImg)
38
+ canvas = np.zeros_like(oriImg)
39
+ canvas = util.draw_bodypose(canvas, candidate, subset)
40
+ if hand:
41
+ hands_list = util.handDetect(candidate, subset, oriImg)
42
+ all_hand_peaks = []
43
+ for x, y, w, is_left in hands_list:
44
+ peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :])
45
+ peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
46
+ peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
47
+ all_hand_peaks.append(peaks)
48
+ canvas = util.draw_handpose(canvas, all_hand_peaks)
49
+ return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
50
+
51
+
52
+ class VisconetDetector(OpenposeDetector):
53
+ def __init__(self):
54
+ super().__init__()
55
+
56
+ def __call__(self, oriImg):
57
+ oriImg = oriImg[:, :, ::-1].copy()
58
+ with torch.no_grad():
59
+ candidate, subset = self.body_estimation(oriImg)
60
+ canvas = util.draw_bodypose(np.zeros_like(oriImg), candidate, subset, stickwidth=1, circlewidth=2)
61
+ # detect hand
62
+ hands_list = util.handDetect(candidate, subset, oriImg)
63
+
64
+ all_hand_peaks = []
65
+ for x, y, w, is_left in hands_list:
66
+
67
+ peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :])
68
+ peaks[:, 0] = np.where(peaks[:, 0]==0, peaks[:, 0], peaks[:, 0]+x)
69
+ peaks[:, 1] = np.where(peaks[:, 1]==0, peaks[:, 1], peaks[:, 1]+y)
70
+ all_hand_peaks.append(peaks)
71
+
72
+ canvas = util.draw_handpose(canvas, all_hand_peaks,stickwidth=1)
73
+ return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
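A minimal usage sketch for the detectors above (not part of the commit); it assumes an RGB input image `person.jpg` and that the body/hand checkpoints can be downloaded into `annotator/ckpts` on first use:

```
import cv2
from annotator.openpose import OpenposeDetector

detector = OpenposeDetector()                     # fetches the checkpoints if missing
img = cv2.cvtColor(cv2.imread("person.jpg"), cv2.COLOR_BGR2RGB)
canvas, pose = detector(img, hand=True)           # __call__ flips channels back to BGR internally
# canvas: black image with the skeleton drawn on it
# pose:   {'candidate': [[x, y, score, id], ...], 'subset': [...]}
cv2.imwrite("pose.png", canvas)
```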
annotator/openpose/body.py ADDED
@@ -0,0 +1,219 @@
1
+ import cv2
2
+ import numpy as np
3
+ import math
4
+ import time
5
+ from scipy.ndimage.filters import gaussian_filter
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib
8
+ import torch
9
+ from torchvision import transforms
10
+
11
+ from . import util
12
+ from .model import bodypose_model
13
+
14
+ class Body(object):
15
+ def __init__(self, model_path):
16
+ self.model = bodypose_model()
17
+ if torch.cuda.is_available():
18
+ self.model = self.model.cuda()
19
+ print('cuda')
20
+ model_dict = util.transfer(self.model, torch.load(model_path))
21
+ self.model.load_state_dict(model_dict)
22
+ self.model.eval()
23
+
24
+ def __call__(self, oriImg):
25
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
26
+ scale_search = [0.5]
27
+ boxsize = 368
28
+ stride = 8
29
+ padValue = 128
30
+ thre1 = 0.1
31
+ thre2 = 0.05
32
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
33
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
34
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
35
+
36
+ for m in range(len(multiplier)):
37
+ scale = multiplier[m]
38
+ imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
39
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
40
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
41
+ im = np.ascontiguousarray(im)
42
+
43
+ data = torch.from_numpy(im).float()
44
+ if torch.cuda.is_available():
45
+ data = data.cuda()
46
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
47
+ with torch.no_grad():
48
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
49
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
50
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
51
+
52
+ # extract outputs, resize, and remove padding
53
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
54
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
55
+ heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
56
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
57
+ heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
58
+
59
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
60
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
61
+ paf = cv2.resize(paf, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
62
+ paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
63
+ paf = cv2.resize(paf, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
64
+
65
+ heatmap_avg += heatmap_avg + heatmap / len(multiplier)
66
+ paf_avg += + paf / len(multiplier)
67
+
68
+ all_peaks = []
69
+ peak_counter = 0
70
+
71
+ for part in range(18):
72
+ map_ori = heatmap_avg[:, :, part]
73
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
74
+
75
+ map_left = np.zeros(one_heatmap.shape)
76
+ map_left[1:, :] = one_heatmap[:-1, :]
77
+ map_right = np.zeros(one_heatmap.shape)
78
+ map_right[:-1, :] = one_heatmap[1:, :]
79
+ map_up = np.zeros(one_heatmap.shape)
80
+ map_up[:, 1:] = one_heatmap[:, :-1]
81
+ map_down = np.zeros(one_heatmap.shape)
82
+ map_down[:, :-1] = one_heatmap[:, 1:]
83
+
84
+ peaks_binary = np.logical_and.reduce(
85
+ (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
86
+ peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
87
+ peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
88
+ peak_id = range(peak_counter, peak_counter + len(peaks))
89
+ peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
90
+
91
+ all_peaks.append(peaks_with_score_and_id)
92
+ peak_counter += len(peaks)
93
+
94
+ # find connection in the specified sequence, center 29 is in the position 15
95
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
96
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
97
+ [1, 16], [16, 18], [3, 17], [6, 18]]
98
+ # the middle joints heatmap correspondence
99
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
100
+ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
101
+ [55, 56], [37, 38], [45, 46]]
102
+
103
+ connection_all = []
104
+ special_k = []
105
+ mid_num = 10
106
+
107
+ for k in range(len(mapIdx)):
108
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
109
+ candA = all_peaks[limbSeq[k][0] - 1]
110
+ candB = all_peaks[limbSeq[k][1] - 1]
111
+ nA = len(candA)
112
+ nB = len(candB)
113
+ indexA, indexB = limbSeq[k]
114
+ if (nA != 0 and nB != 0):
115
+ connection_candidate = []
116
+ for i in range(nA):
117
+ for j in range(nB):
118
+ vec = np.subtract(candB[j][:2], candA[i][:2])
119
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
120
+ norm = max(0.001, norm)
121
+ vec = np.divide(vec, norm)
122
+
123
+ startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
124
+ np.linspace(candA[i][1], candB[j][1], num=mid_num)))
125
+
126
+ vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
127
+ for I in range(len(startend))])
128
+ vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
129
+ for I in range(len(startend))])
130
+
131
+ score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
132
+ score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
133
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
134
+ criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
135
+ criterion2 = score_with_dist_prior > 0
136
+ if criterion1 and criterion2:
137
+ connection_candidate.append(
138
+ [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
139
+
140
+ connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
141
+ connection = np.zeros((0, 5))
142
+ for c in range(len(connection_candidate)):
143
+ i, j, s = connection_candidate[c][0:3]
144
+ if (i not in connection[:, 3] and j not in connection[:, 4]):
145
+ connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
146
+ if (len(connection) >= min(nA, nB)):
147
+ break
148
+
149
+ connection_all.append(connection)
150
+ else:
151
+ special_k.append(k)
152
+ connection_all.append([])
153
+
154
+ # last number in each row is the total parts number of that person
155
+ # the second last number in each row is the score of the overall configuration
156
+ subset = -1 * np.ones((0, 20))
157
+ candidate = np.array([item for sublist in all_peaks for item in sublist])
158
+
159
+ for k in range(len(mapIdx)):
160
+ if k not in special_k:
161
+ partAs = connection_all[k][:, 0]
162
+ partBs = connection_all[k][:, 1]
163
+ indexA, indexB = np.array(limbSeq[k]) - 1
164
+
165
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
166
+ found = 0
167
+ subset_idx = [-1, -1]
168
+ for j in range(len(subset)): # 1:size(subset,1):
169
+ if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
170
+ subset_idx[found] = j
171
+ found += 1
172
+
173
+ if found == 1:
174
+ j = subset_idx[0]
175
+ if subset[j][indexB] != partBs[i]:
176
+ subset[j][indexB] = partBs[i]
177
+ subset[j][-1] += 1
178
+ subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
179
+ elif found == 2: # if found 2 and disjoint, merge them
180
+ j1, j2 = subset_idx
181
+ membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
182
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
183
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
184
+ subset[j1][-2:] += subset[j2][-2:]
185
+ subset[j1][-2] += connection_all[k][i][2]
186
+ subset = np.delete(subset, j2, 0)
187
+ else: # as like found == 1
188
+ subset[j1][indexB] = partBs[i]
189
+ subset[j1][-1] += 1
190
+ subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
191
+
192
+ # if find no partA in the subset, create a new subset
193
+ elif not found and k < 17:
194
+ row = -1 * np.ones(20)
195
+ row[indexA] = partAs[i]
196
+ row[indexB] = partBs[i]
197
+ row[-1] = 2
198
+ row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
199
+ subset = np.vstack([subset, row])
200
+ # delete rows of subset that have too few detected parts
201
+ deleteIdx = []
202
+ for i in range(len(subset)):
203
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
204
+ deleteIdx.append(i)
205
+ subset = np.delete(subset, deleteIdx, axis=0)
206
+
207
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
208
+ # candidate: x, y, score, id
209
+ return candidate, subset
210
+
211
+ if __name__ == "__main__":
212
+ body_estimation = Body('../model/body_pose_model.pth')
213
+
214
+ test_image = '../images/ski.jpg'
215
+ oriImg = cv2.imread(test_image) # B,G,R order
216
+ candidate, subset = body_estimation(oriImg)
217
+ canvas = util.draw_bodypose(oriImg, candidate, subset)
218
+ plt.imshow(canvas[:, :, [2, 1, 0]])
219
+ plt.show()
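As the comments at the end of `__call__` note, `subset` is an n x 20 array whose first 18 columns index into `candidate` (rows of x, y, score, id), with -1 marking missing joints. A small sketch, not part of the commit, that unpacks the pair into per-person keypoints:

```
def person_keypoints(candidate, subset):
    """Return one dict per detected person, mapping part index (0-17) to (x, y, score)."""
    people = []
    for person in subset:
        parts = {}
        for part in range(18):          # columns 0-17 hold row indices into candidate
            idx = int(person[part])
            if idx == -1:               # joint not detected for this person
                continue
            x, y, score = candidate[idx][:3]
            parts[part] = (float(x), float(y), float(score))
        people.append(parts)
    return people
```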
annotator/openpose/hand.py ADDED
@@ -0,0 +1,86 @@
1
+ import cv2
2
+ import json
3
+ import numpy as np
4
+ import math
5
+ import time
6
+ from scipy.ndimage.filters import gaussian_filter
7
+ import matplotlib.pyplot as plt
8
+ import matplotlib
9
+ import torch
10
+ from skimage.measure import label
11
+
12
+ from .model import handpose_model
13
+ from . import util
14
+
15
+ class Hand(object):
16
+ def __init__(self, model_path):
17
+ self.model = handpose_model()
18
+ if torch.cuda.is_available():
19
+ self.model = self.model.cuda()
20
+ print('cuda')
21
+ model_dict = util.transfer(self.model, torch.load(model_path))
22
+ self.model.load_state_dict(model_dict)
23
+ self.model.eval()
24
+
25
+ def __call__(self, oriImg):
26
+ scale_search = [0.5, 1.0, 1.5, 2.0]
27
+ # scale_search = [0.5]
28
+ boxsize = 368
29
+ stride = 8
30
+ padValue = 128
31
+ thre = 0.05
32
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
33
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22))
34
+ # paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
35
+
36
+ for m in range(len(multiplier)):
37
+ scale = multiplier[m]
38
+ imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
39
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
40
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
41
+ im = np.ascontiguousarray(im)
42
+
43
+ data = torch.from_numpy(im).float()
44
+ if torch.cuda.is_available():
45
+ data = data.cuda()
46
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
47
+ with torch.no_grad():
48
+ output = self.model(data).cpu().numpy()
49
+ # output = self.model(data).numpy()
50
+
51
+ # extract outputs, resize, and remove padding
52
+ heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps
53
+ heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
54
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
55
+ heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
56
+
57
+ heatmap_avg += heatmap / len(multiplier)
58
+
59
+ all_peaks = []
60
+ for part in range(21):
61
+ map_ori = heatmap_avg[:, :, part]
62
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
63
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
64
+ # all peaks are below the threshold
65
+ if np.sum(binary) == 0:
66
+ all_peaks.append([0, 0])
67
+ continue
68
+ label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
69
+ max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
70
+ label_img[label_img != max_index] = 0
71
+ map_ori[label_img == 0] = 0
72
+
73
+ y, x = util.npmax(map_ori)
74
+ all_peaks.append([x, y])
75
+ return np.array(all_peaks)
76
+
77
+ if __name__ == "__main__":
78
+ hand_estimation = Hand('../model/hand_pose_model.pth')
79
+
80
+ # test_image = '../images/hand.jpg'
81
+ test_image = '../images/hand.jpg'
82
+ oriImg = cv2.imread(test_image) # B,G,R order
83
+ peaks = hand_estimation(oriImg)
84
+ canvas = util.draw_handpose(oriImg, peaks, True)
85
+ cv2.imshow('', canvas)
86
+ cv2.waitKey(0)
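A short sketch (not part of the commit) of the output convention of `Hand.__call__`: a (21, 2) array of (x, y) peaks in the coordinates of the crop, where [0, 0] marks joints whose heatmap never cleared the threshold. The paths follow the `__main__` block above:

```
import cv2
from annotator.openpose.hand import Hand

hand_estimation = Hand('../model/hand_pose_model.pth')
crop = cv2.imread('../images/hand.jpg')        # square B,G,R patch around one hand
peaks = hand_estimation(crop)                  # shape (21, 2): one (x, y) per finger joint
detected = peaks[(peaks != 0).any(axis=1)]     # drop the [0, 0] placeholders
print(len(detected), "of 21 joints detected")
```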
annotator/openpose/model.py ADDED
@@ -0,0 +1,219 @@
1
+ import torch
2
+ from collections import OrderedDict
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ def make_layers(block, no_relu_layers):
8
+ layers = []
9
+ for layer_name, v in block.items():
10
+ if 'pool' in layer_name:
11
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
12
+ padding=v[2])
13
+ layers.append((layer_name, layer))
14
+ else:
15
+ conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
16
+ kernel_size=v[2], stride=v[3],
17
+ padding=v[4])
18
+ layers.append((layer_name, conv2d))
19
+ if layer_name not in no_relu_layers:
20
+ layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
21
+
22
+ return nn.Sequential(OrderedDict(layers))
23
+
24
+ class bodypose_model(nn.Module):
25
+ def __init__(self):
26
+ super(bodypose_model, self).__init__()
27
+
28
+ # these layers have no relu layer
29
+ no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
30
+ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
31
+ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
32
+ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1']
33
+ blocks = {}
34
+ block0 = OrderedDict([
35
+ ('conv1_1', [3, 64, 3, 1, 1]),
36
+ ('conv1_2', [64, 64, 3, 1, 1]),
37
+ ('pool1_stage1', [2, 2, 0]),
38
+ ('conv2_1', [64, 128, 3, 1, 1]),
39
+ ('conv2_2', [128, 128, 3, 1, 1]),
40
+ ('pool2_stage1', [2, 2, 0]),
41
+ ('conv3_1', [128, 256, 3, 1, 1]),
42
+ ('conv3_2', [256, 256, 3, 1, 1]),
43
+ ('conv3_3', [256, 256, 3, 1, 1]),
44
+ ('conv3_4', [256, 256, 3, 1, 1]),
45
+ ('pool3_stage1', [2, 2, 0]),
46
+ ('conv4_1', [256, 512, 3, 1, 1]),
47
+ ('conv4_2', [512, 512, 3, 1, 1]),
48
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
49
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])
50
+ ])
51
+
52
+
53
+ # Stage 1
54
+ block1_1 = OrderedDict([
55
+ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
56
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
57
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
58
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
59
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
60
+ ])
61
+
62
+ block1_2 = OrderedDict([
63
+ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
64
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
65
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
66
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
67
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
68
+ ])
69
+ blocks['block1_1'] = block1_1
70
+ blocks['block1_2'] = block1_2
71
+
72
+ self.model0 = make_layers(block0, no_relu_layers)
73
+
74
+ # Stages 2 - 6
75
+ for i in range(2, 7):
76
+ blocks['block%d_1' % i] = OrderedDict([
77
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
78
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
79
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
80
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
81
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
82
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
83
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
84
+ ])
85
+
86
+ blocks['block%d_2' % i] = OrderedDict([
87
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
88
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
89
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
90
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
91
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
92
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
93
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
94
+ ])
95
+
96
+ for k in blocks.keys():
97
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
98
+
99
+ self.model1_1 = blocks['block1_1']
100
+ self.model2_1 = blocks['block2_1']
101
+ self.model3_1 = blocks['block3_1']
102
+ self.model4_1 = blocks['block4_1']
103
+ self.model5_1 = blocks['block5_1']
104
+ self.model6_1 = blocks['block6_1']
105
+
106
+ self.model1_2 = blocks['block1_2']
107
+ self.model2_2 = blocks['block2_2']
108
+ self.model3_2 = blocks['block3_2']
109
+ self.model4_2 = blocks['block4_2']
110
+ self.model5_2 = blocks['block5_2']
111
+ self.model6_2 = blocks['block6_2']
112
+
113
+
114
+ def forward(self, x):
115
+
116
+ out1 = self.model0(x)
117
+
118
+ out1_1 = self.model1_1(out1)
119
+ out1_2 = self.model1_2(out1)
120
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
121
+
122
+ out2_1 = self.model2_1(out2)
123
+ out2_2 = self.model2_2(out2)
124
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
125
+
126
+ out3_1 = self.model3_1(out3)
127
+ out3_2 = self.model3_2(out3)
128
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
129
+
130
+ out4_1 = self.model4_1(out4)
131
+ out4_2 = self.model4_2(out4)
132
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
133
+
134
+ out5_1 = self.model5_1(out5)
135
+ out5_2 = self.model5_2(out5)
136
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
137
+
138
+ out6_1 = self.model6_1(out6)
139
+ out6_2 = self.model6_2(out6)
140
+
141
+ return out6_1, out6_2
142
+
143
+ class handpose_model(nn.Module):
144
+ def __init__(self):
145
+ super(handpose_model, self).__init__()
146
+
147
+ # these layers have no relu layer
148
+ no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
149
+ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
150
+ # stage 1
151
+ block1_0 = OrderedDict([
152
+ ('conv1_1', [3, 64, 3, 1, 1]),
153
+ ('conv1_2', [64, 64, 3, 1, 1]),
154
+ ('pool1_stage1', [2, 2, 0]),
155
+ ('conv2_1', [64, 128, 3, 1, 1]),
156
+ ('conv2_2', [128, 128, 3, 1, 1]),
157
+ ('pool2_stage1', [2, 2, 0]),
158
+ ('conv3_1', [128, 256, 3, 1, 1]),
159
+ ('conv3_2', [256, 256, 3, 1, 1]),
160
+ ('conv3_3', [256, 256, 3, 1, 1]),
161
+ ('conv3_4', [256, 256, 3, 1, 1]),
162
+ ('pool3_stage1', [2, 2, 0]),
163
+ ('conv4_1', [256, 512, 3, 1, 1]),
164
+ ('conv4_2', [512, 512, 3, 1, 1]),
165
+ ('conv4_3', [512, 512, 3, 1, 1]),
166
+ ('conv4_4', [512, 512, 3, 1, 1]),
167
+ ('conv5_1', [512, 512, 3, 1, 1]),
168
+ ('conv5_2', [512, 512, 3, 1, 1]),
169
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])
170
+ ])
171
+
172
+ block1_1 = OrderedDict([
173
+ ('conv6_1_CPM', [128, 512, 1, 1, 0]),
174
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])
175
+ ])
176
+
177
+ blocks = {}
178
+ blocks['block1_0'] = block1_0
179
+ blocks['block1_1'] = block1_1
180
+
181
+ # stage 2-6
182
+ for i in range(2, 7):
183
+ blocks['block%d' % i] = OrderedDict([
184
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
185
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
186
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
187
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
188
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
189
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
190
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
191
+ ])
192
+
193
+ for k in blocks.keys():
194
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
195
+
196
+ self.model1_0 = blocks['block1_0']
197
+ self.model1_1 = blocks['block1_1']
198
+ self.model2 = blocks['block2']
199
+ self.model3 = blocks['block3']
200
+ self.model4 = blocks['block4']
201
+ self.model5 = blocks['block5']
202
+ self.model6 = blocks['block6']
203
+
204
+ def forward(self, x):
205
+ out1_0 = self.model1_0(x)
206
+ out1_1 = self.model1_1(out1_0)
207
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
208
+ out_stage2 = self.model2(concat_stage2)
209
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
210
+ out_stage3 = self.model3(concat_stage3)
211
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
212
+ out_stage4 = self.model4(concat_stage4)
213
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
214
+ out_stage5 = self.model5(concat_stage5)
215
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
216
+ out_stage6 = self.model6(concat_stage6)
217
+ return out_stage6
218
+
219
+
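A shape-check sketch (not part of the commit) for the two branches of `bodypose_model`: with a 368x368 input and three stride-2 poolings, the L1 branch emits 38 part-affinity-field channels and the L2 branch 19 keypoint heatmaps at 1/8 resolution, which is exactly how `Body.__call__` consumes them:

```
import torch
from annotator.openpose.model import bodypose_model

body = bodypose_model().eval()          # random weights are enough for a shape check
x = torch.zeros(1, 3, 368, 368)
with torch.no_grad():
    pafs, heatmaps = body(x)
print(pafs.shape)                       # torch.Size([1, 38, 46, 46])  L1 branch (PAFs)
print(heatmaps.shape)                   # torch.Size([1, 19, 46, 46])  L2 branch (heatmaps)
```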
annotator/openpose/util.py ADDED
@@ -0,0 +1,163 @@
1
+ import math
2
+ import numpy as np
3
+ import matplotlib
4
+ import cv2
5
+
6
+
7
+ def padRightDownCorner(img, stride, padValue):
8
+ h = img.shape[0]
9
+ w = img.shape[1]
10
+
11
+ pad = 4 * [None]
12
+ pad[0] = 0 # up
13
+ pad[1] = 0 # left
14
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
15
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
16
+
17
+ img_padded = img
18
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
19
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
20
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
21
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
22
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
23
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
24
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
25
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
26
+
27
+ return img_padded, pad
28
+
29
+ # transfer the caffe weights to pytorch, matching layer names
30
+ def transfer(model, model_weights):
31
+ transfered_model_weights = {}
32
+ for weights_name in model.state_dict().keys():
33
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
34
+ return transfered_model_weights
35
+
36
+ # draw the body keypoints and limbs
37
+ def draw_bodypose(canvas, candidate, subset, stickwidth=4, circlewidth=4):
38
+
39
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
40
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
41
+ [1, 16], [16, 18], [3, 17], [6, 18]]
42
+
43
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
44
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
45
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
46
+ for i in range(18):
47
+ for n in range(len(subset)):
48
+ index = int(subset[n][i])
49
+ if index == -1:
50
+ continue
51
+ x, y = candidate[index][0:2]
52
+ cv2.circle(canvas, (int(x), int(y)), circlewidth, colors[i], thickness=-1)
53
+ for i in range(17):
54
+ for n in range(len(subset)):
55
+ index = subset[n][np.array(limbSeq[i]) - 1]
56
+ if -1 in index:
57
+ continue
58
+ cur_canvas = canvas.copy()
59
+ Y = candidate[index.astype(int), 0]
60
+ X = candidate[index.astype(int), 1]
61
+ mX = np.mean(X)
62
+ mY = np.mean(Y)
63
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
64
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
65
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
66
+ cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
67
+ canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
68
+ # plt.imsave("preview.jpg", canvas[:, :, [2, 1, 0]])
69
+ # plt.imshow(canvas[:, :, [2, 1, 0]])
70
+ return canvas
71
+
72
+ # image drawn by opencv is not good.
73
+ def draw_handpose(canvas, all_hand_peaks, show_number=False, stickwidth=2):
74
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
75
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
76
+
77
+ for peaks in all_hand_peaks:
78
+ for ie, e in enumerate(edges):
79
+ if np.sum(np.all(peaks[e], axis=1)==0)==0:
80
+ x1, y1 = peaks[e[0]]
81
+ x2, y2 = peaks[e[1]]
82
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie/float(len(edges)), 1.0, 1.0])*255, thickness=stickwidth)
83
+
84
+ for i, keypoint in enumerate(peaks):
85
+ x, y = keypoint
86
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
87
+ if show_number:
88
+ cv2.putText(canvas, str(i), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), lineType=cv2.LINE_AA)
89
+ return canvas
90
+
91
+ # detect hand according to body pose keypoints
92
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
93
+ def handDetect(candidate, subset, oriImg):
94
+ # right hand: wrist 4, elbow 3, shoulder 2
95
+ # left hand: wrist 7, elbow 6, shoulder 5
96
+ ratioWristElbow = 0.33
97
+ detect_result = []
98
+ image_height, image_width = oriImg.shape[0:2]
99
+ for person in subset.astype(int):
100
+ # if any of three not detected
101
+ has_left = np.sum(person[[5, 6, 7]] == -1) == 0
102
+ has_right = np.sum(person[[2, 3, 4]] == -1) == 0
103
+ if not (has_left or has_right):
104
+ continue
105
+ hands = []
106
+ #left hand
107
+ if has_left:
108
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
109
+ x1, y1 = candidate[left_shoulder_index][:2]
110
+ x2, y2 = candidate[left_elbow_index][:2]
111
+ x3, y3 = candidate[left_wrist_index][:2]
112
+ hands.append([x1, y1, x2, y2, x3, y3, True])
113
+ # right hand
114
+ if has_right:
115
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
116
+ x1, y1 = candidate[right_shoulder_index][:2]
117
+ x2, y2 = candidate[right_elbow_index][:2]
118
+ x3, y3 = candidate[right_wrist_index][:2]
119
+ hands.append([x1, y1, x2, y2, x3, y3, False])
120
+
121
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
122
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
123
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
124
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
125
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
126
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
127
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
128
+ x = x3 + ratioWristElbow * (x3 - x2)
129
+ y = y3 + ratioWristElbow * (y3 - y2)
130
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
131
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
132
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
133
+ # x-y refers to the center --> offset to topLeft point
134
+ # handRectangle.x -= handRectangle.width / 2.f;
135
+ # handRectangle.y -= handRectangle.height / 2.f;
136
+ x -= width / 2
137
+ y -= width / 2 # width = height
138
+ # overflow the image
139
+ if x < 0: x = 0
140
+ if y < 0: y = 0
141
+ width1 = width
142
+ width2 = width
143
+ if x + width > image_width: width1 = image_width - x
144
+ if y + width > image_height: width2 = image_height - y
145
+ width = min(width1, width2)
146
+ # keep the hand box only if its width is at least 20 pixels
147
+ if width >= 20:
148
+ detect_result.append([int(x), int(y), int(width), is_left])
149
+
150
+ '''
151
+ return value: [[x, y, w, True if left hand else False]].
152
+ width = height since the network requires square input.
153
+ x, y are the coordinates of the top-left corner
154
+ '''
155
+ return detect_result
156
+
157
+ # get the (row, column) index of the maximum value of a 2D array
158
+ def npmax(array):
159
+ arrayindex = array.argmax(1)
160
+ arrayvalue = array.max(1)
161
+ i = arrayvalue.argmax()
162
+ j = arrayindex[i]
163
+ return i, j
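A minimal usage sketch of the hand detector above. Assumptions: `candidate` and `subset` come from the body-pose estimator and `oriImg` is the original BGR image; the square crops and the left-hand flip are illustrative only, not necessarily how the rest of the repo consumes these boxes.

import cv2

boxes = handDetect(candidate, subset, oriImg)
hand_crops = []
for x, y, w, is_left in boxes:
    crop = oriImg[y:y + w, x:x + w, :]   # square crop: width == height, per the docstring above
    if is_left:
        crop = cv2.flip(crop, 1)         # illustrative: mirror left hands so one right-hand model can be reused
    hand_crops.append(crop)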
annotator/segm/__init__.py ADDED
@@ -0,0 +1,162 @@
1
+ # Self-Correction-Human-Parsing
2
+ # Original https://github.com/GoGoDuck912/Self-Correction-Human-Parsing
3
+
4
+ import os
5
+ import torch
6
+ import numpy as np
7
+ from PIL import Image
8
+ import cv2
9
+
10
+ import torchvision.transforms as T
11
+
12
+ from .transforms import transform_logits, get_affine_transform
13
+ from . import networks
14
+ from annotator.util import annotator_ckpts_path
15
+ from huggingface_hub import snapshot_download
16
+
17
+ dataset_settings = {
18
+ 'lip': {
19
+ 'input_size': [473, 473],
20
+ 'num_classes': 20,
21
+ 'label': ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat',
22
+ 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm',
23
+ 'Left-leg', 'Right-leg', 'Left-shoe', 'Right-shoe']
24
+ },
25
+ 'atr': {
26
+ 'input_size': [512, 512],
27
+ 'num_classes': 18,
28
+ 'label': ['Background', 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', 'Skirt', 'Pants', 'Dress', 'Belt',
29
+ 'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf']
30
+ },
31
+ 'pascal': {
32
+ 'input_size': [512, 512],
33
+ 'num_classes': 7,
34
+ 'label': ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs'],
35
+ }
36
+ }
37
+
38
+
39
+ def get_palette(num_cls):
40
+ """ Returns the color map for visualizing the segmentation mask.
41
+ Args:
42
+ num_cls: Number of classes
43
+ Returns:
44
+ The color map
45
+ """
46
+ n = num_cls
47
+ palette = [0] * (n * 3)
48
+ for j in range(0, n):
49
+ lab = j
50
+ palette[j * 3 + 0] = 0
51
+ palette[j * 3 + 1] = 0
52
+ palette[j * 3 + 2] = 0
53
+ i = 0
54
+ while lab:
55
+ palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
56
+ palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
57
+ palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
58
+ i += 1
59
+ lab >>= 3
60
+ return palette
61
+
62
+ class Segmentator(torch.nn.Module):
63
+ def __init__(self, dataset='lip'):
64
+ super().__init__()
65
+
66
+ num_classes = dataset_settings[dataset]['num_classes']
67
+ input_size = dataset_settings[dataset]['input_size']
68
+ label = dataset_settings[dataset]['label']
69
+
70
+ if dataset == 'atr':
71
+ model_path='exp-schp-201908301523-atr.pth'
72
+ elif dataset == 'lip':
73
+ model_path='exp-schp-201908261155-lip.pth'
74
+
75
+ model_path = os.path.join(annotator_ckpts_path, model_path)
76
+
77
+ snapshot_download(repo_id="soonyau/visconet", allow_patterns=os.path.basename(model_path), local_dir=annotator_ckpts_path)
78
+
79
+ self.model = networks.init_model('resnet101', num_classes=num_classes, pretrained=None)
80
+ state_dict = torch.load(model_path)['state_dict']
81
+ from collections import OrderedDict
82
+ new_state_dict = OrderedDict()
83
+ for k, v in state_dict.items():
84
+ name = k[7:] # remove `module.`
85
+ new_state_dict[name] = v
86
+ self.model.load_state_dict(new_state_dict)
87
+ self.model.eval()
88
+
89
+ self.palette = get_palette(num_classes)
90
+
91
+ self.transform = T.Compose([
92
+ T.ToTensor(),
93
+ T.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])
94
+ ])
95
+ self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
96
+ self.input_size = np.asarray(input_size)
97
+
98
+ def _box2cs(self, box):
99
+ x, y, w, h = box[:4]
100
+ return self._xywh2cs(x, y, w, h)
101
+
102
+ def _xywh2cs(self, x, y, w, h):
103
+ center = np.zeros((2), dtype=np.float32)
104
+ center[0] = x + w * 0.5
105
+ center[1] = y + h * 0.5
106
+ if w > self.aspect_ratio * h:
107
+ h = w * 1.0 / self.aspect_ratio
108
+ elif w < self.aspect_ratio * h:
109
+ w = h * self.aspect_ratio
110
+ scale = np.array([w, h], dtype=np.float32)
111
+ return center, scale
112
+
113
+ def preprocess(self, image: np.ndarray):
114
+ # convert RGB to BGR (OpenCV channel order)
115
+ image = image[:,:,::-1]
116
+ h, w, _ = image.shape
117
+
118
+ # Get person center and scale
119
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
120
+ r = 0
121
+ trans = get_affine_transform(person_center, s, r, self.input_size)
122
+ input = cv2.warpAffine(
123
+ image,
124
+ trans,
125
+ (int(self.input_size[1]), int(self.input_size[0])),
126
+ flags=cv2.INTER_LINEAR,
127
+ borderMode=cv2.BORDER_CONSTANT,
128
+ borderValue=(0, 0, 0))
129
+
130
+ input = self.transform(input)
131
+ meta = {
132
+ 'center': person_center,
133
+ 'height': h,
134
+ 'width': w,
135
+ 'scale': s,
136
+ 'rotation': r
137
+ }
138
+
139
+ return input, meta
140
+
141
+ @torch.no_grad()
142
+ def __call__(self, input_image):
143
+ image, meta = self.preprocess(input_image)
144
+ c = meta['center']
145
+ s = meta['scale']
146
+ w = meta['width']
147
+ h = meta['height']
148
+ input_size = list(self.input_size)
149
+ device = next(self.parameters()).device
150
+ output = self.model(image.unsqueeze(0).to(device))
151
+ upsample = torch.nn.Upsample(size=input_size, mode='bilinear', align_corners=True)
152
+ upsample_output = upsample(output[0][-1][0].unsqueeze(0))
153
+ upsample_output = upsample_output.squeeze()
154
+ upsample_output = upsample_output.permute(1, 2, 0) # CHW -> HWC
155
+ logits_result = transform_logits(upsample_output.data.cpu().numpy(), c, s, w, h, input_size=input_size)
156
+ parsing_result = np.argmax(logits_result, axis=2)
157
+ output_img = Image.fromarray(np.asarray(parsing_result, dtype=np.uint8))
158
+ #return output_img
159
+ output_img.putpalette(self.palette)
160
+ return output_img
161
+ #return np.array(output_img)
162
+
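A minimal usage sketch of the Segmentator wrapper above. Assumptions: the InPlace-ABN extension pulled in by `networks` builds on import (it is JIT-compiled and needs a CUDA toolchain), the checkpoint download in the constructor succeeds, a GPU is available, and `person.jpg` is a hypothetical RGB photo of a person.

import numpy as np
from PIL import Image
from annotator.segm import Segmentator, dataset_settings

segm = Segmentator(dataset='atr').cuda()                    # 'atr' matches the checkpoint referenced above
image = np.array(Image.open('person.jpg').convert('RGB'))   # H x W x 3, RGB
parsing = segm(image)                                       # palettised PIL image, one class index per pixel
labels = dataset_settings['atr']['label']                   # class names corresponding to the indices
parsing.save('parsing.png')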
annotator/segm/modules/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .bn import ABN, InPlaceABN, InPlaceABNSync
2
+ from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
3
+ from .misc import GlobalAvgPool2d, SingleGPU
4
+ from .residual import IdentityResidualBlock
5
+ from .dense import DenseModule
annotator/segm/modules/bn.py ADDED
@@ -0,0 +1,132 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ try:
6
+ from queue import Queue
7
+ except ImportError:
8
+ from Queue import Queue
9
+
10
+ from .functions import *
11
+
12
+
13
+ class ABN(nn.Module):
14
+ """Activated Batch Normalization
15
+
16
+ This gathers a `BatchNorm2d` and an activation function in a single module
17
+ """
18
+
19
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
20
+ """Creates an Activated Batch Normalization module
21
+
22
+ Parameters
23
+ ----------
24
+ num_features : int
25
+ Number of feature channels in the input and output.
26
+ eps : float
27
+ Small constant to prevent numerical issues.
28
+ momentum : float
29
+ Momentum factor applied when computing running statistics.
30
+ affine : bool
31
+ If `True` apply learned scale and shift transformation after normalization.
32
+ activation : str
33
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
34
+ slope : float
35
+ Negative slope for the `leaky_relu` activation.
36
+ """
37
+ super(ABN, self).__init__()
38
+ self.num_features = num_features
39
+ self.affine = affine
40
+ self.eps = eps
41
+ self.momentum = momentum
42
+ self.activation = activation
43
+ self.slope = slope
44
+ if self.affine:
45
+ self.weight = nn.Parameter(torch.ones(num_features))
46
+ self.bias = nn.Parameter(torch.zeros(num_features))
47
+ else:
48
+ self.register_parameter('weight', None)
49
+ self.register_parameter('bias', None)
50
+ self.register_buffer('running_mean', torch.zeros(num_features))
51
+ self.register_buffer('running_var', torch.ones(num_features))
52
+ self.reset_parameters()
53
+
54
+ def reset_parameters(self):
55
+ nn.init.constant_(self.running_mean, 0)
56
+ nn.init.constant_(self.running_var, 1)
57
+ if self.affine:
58
+ nn.init.constant_(self.weight, 1)
59
+ nn.init.constant_(self.bias, 0)
60
+
61
+ def forward(self, x):
62
+ x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
63
+ self.training, self.momentum, self.eps)
64
+
65
+ if self.activation == ACT_RELU:
66
+ return functional.relu(x, inplace=True)
67
+ elif self.activation == ACT_LEAKY_RELU:
68
+ return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
69
+ elif self.activation == ACT_ELU:
70
+ return functional.elu(x, inplace=True)
71
+ else:
72
+ return x
73
+
74
+ def __repr__(self):
75
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
76
+ ' affine={affine}, activation={activation}'
77
+ if self.activation == "leaky_relu":
78
+ rep += ', slope={slope})'
79
+ else:
80
+ rep += ')'
81
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
82
+
83
+
84
+ class InPlaceABN(ABN):
85
+ """InPlace Activated Batch Normalization"""
86
+
87
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
88
+ """Creates an InPlace Activated Batch Normalization module
89
+
90
+ Parameters
91
+ ----------
92
+ num_features : int
93
+ Number of feature channels in the input and output.
94
+ eps : float
95
+ Small constant to prevent numerical issues.
96
+ momentum : float
97
+ Momentum factor applied when computing running statistics.
98
+ affine : bool
99
+ If `True` apply learned scale and shift transformation after normalization.
100
+ activation : str
101
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
102
+ slope : float
103
+ Negative slope for the `leaky_relu` activation.
104
+ """
105
+ super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)
106
+
107
+ def forward(self, x):
108
+ x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
109
+ self.training, self.momentum, self.eps, self.activation, self.slope)
110
+ return x
111
+
112
+
113
+ class InPlaceABNSync(ABN):
114
+ """InPlace Activated Batch Normalization with cross-GPU synchronization
115
+ This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
116
+ """
117
+
118
+ def forward(self, x):
119
+ x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
120
+ self.training, self.momentum, self.eps, self.activation, self.slope)
121
+ return x
122
+
123
+ def __repr__(self):
124
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
125
+ ' affine={affine}, activation={activation}'
126
+ if self.activation == "leaky_relu":
127
+ rep += ', slope={slope})'
128
+ else:
129
+ rep += ')'
130
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
131
+
132
+
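For reference, a small sketch of what ABN computes. Importing `annotator.segm.modules.bn` also imports `.functions`, which JIT-compiles the C++/CUDA extension further below, so this assumes that build succeeds in your environment.

import torch
from annotator.segm.modules.bn import ABN

abn = ABN(num_features=16, activation="leaky_relu", slope=0.01)
x = torch.randn(2, 16, 8, 8)
y = abn(x)   # per forward() above: functional.batch_norm followed by leaky_relu(negative_slope=0.01)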
annotator/segm/modules/deeplab.py ADDED
@@ -0,0 +1,84 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ from models._util import try_index
6
+ from .bn import ABN
7
+
8
+
9
+ class DeeplabV3(nn.Module):
10
+ def __init__(self,
11
+ in_channels,
12
+ out_channels,
13
+ hidden_channels=256,
14
+ dilations=(12, 24, 36),
15
+ norm_act=ABN,
16
+ pooling_size=None):
17
+ super(DeeplabV3, self).__init__()
18
+ self.pooling_size = pooling_size
19
+
20
+ self.map_convs = nn.ModuleList([
21
+ nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
22
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
23
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
24
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
25
+ ])
26
+ self.map_bn = norm_act(hidden_channels * 4)
27
+
28
+ self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
29
+ self.global_pooling_bn = norm_act(hidden_channels)
30
+
31
+ self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
32
+ self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
33
+ self.red_bn = norm_act(out_channels)
34
+
35
+ self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
36
+
37
+ def reset_parameters(self, activation, slope):
38
+ gain = nn.init.calculate_gain(activation, slope)
39
+ for m in self.modules():
40
+ if isinstance(m, nn.Conv2d):
41
+ nn.init.xavier_normal_(m.weight.data, gain)
42
+ if hasattr(m, "bias") and m.bias is not None:
43
+ nn.init.constant_(m.bias, 0)
44
+ elif isinstance(m, ABN):
45
+ if hasattr(m, "weight") and m.weight is not None:
46
+ nn.init.constant_(m.weight, 1)
47
+ if hasattr(m, "bias") and m.bias is not None:
48
+ nn.init.constant_(m.bias, 0)
49
+
50
+ def forward(self, x):
51
+ # Map convolutions
52
+ out = torch.cat([m(x) for m in self.map_convs], dim=1)
53
+ out = self.map_bn(out)
54
+ out = self.red_conv(out)
55
+
56
+ # Global pooling
57
+ pool = self._global_pooling(x)
58
+ pool = self.global_pooling_conv(pool)
59
+ pool = self.global_pooling_bn(pool)
60
+ pool = self.pool_red_conv(pool)
61
+ if self.training or self.pooling_size is None:
62
+ pool = pool.repeat(1, 1, x.size(2), x.size(3))
63
+
64
+ out += pool
65
+ out = self.red_bn(out)
66
+ return out
67
+
68
+ def _global_pooling(self, x):
69
+ if self.training or self.pooling_size is None:
70
+ pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
71
+ pool = pool.view(x.size(0), x.size(1), 1, 1)
72
+ else:
73
+ pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
74
+ min(try_index(self.pooling_size, 1), x.shape[3]))
75
+ padding = (
76
+ (pooling_size[1] - 1) // 2,
77
+ (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
78
+ (pooling_size[0] - 1) // 2,
79
+ (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
80
+ )
81
+
82
+ pool = functional.avg_pool2d(x, pooling_size, stride=1)
83
+ pool = functional.pad(pool, pad=padding, mode="replicate")
84
+ return pool
annotator/segm/modules/dense.py ADDED
@@ -0,0 +1,42 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .bn import ABN
7
+
8
+
9
+ class DenseModule(nn.Module):
10
+ def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
11
+ super(DenseModule, self).__init__()
12
+ self.in_channels = in_channels
13
+ self.growth = growth
14
+ self.layers = layers
15
+
16
+ self.convs1 = nn.ModuleList()
17
+ self.convs3 = nn.ModuleList()
18
+ for i in range(self.layers):
19
+ self.convs1.append(nn.Sequential(OrderedDict([
20
+ ("bn", norm_act(in_channels)),
21
+ ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
22
+ ])))
23
+ self.convs3.append(nn.Sequential(OrderedDict([
24
+ ("bn", norm_act(self.growth * bottleneck_factor)),
25
+ ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
26
+ dilation=dilation))
27
+ ])))
28
+ in_channels += self.growth
29
+
30
+ @property
31
+ def out_channels(self):
32
+ return self.in_channels + self.growth * self.layers
33
+
34
+ def forward(self, x):
35
+ inputs = [x]
36
+ for i in range(self.layers):
37
+ x = torch.cat(inputs, dim=1)
38
+ x = self.convs1[i](x)
39
+ x = self.convs3[i](x)
40
+ inputs += [x]
41
+
42
+ return torch.cat(inputs, dim=1)
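A quick sketch of the channel bookkeeping in DenseModule (same JIT-build assumption as above, since importing the package compiles the InPlace-ABN extension).

import torch
from annotator.segm.modules.dense import DenseModule

block = DenseModule(in_channels=64, growth=32, layers=4)
out = block(torch.randn(1, 64, 16, 16))
# every layer concatenates `growth` new channels onto its input
assert out.shape[1] == block.out_channels == 64 + 32 * 4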
annotator/segm/modules/functions.py ADDED
@@ -0,0 +1,244 @@
1
+ from os import path
2
+ import torch
3
+ import torch.distributed as dist
4
+ import torch.autograd as autograd
5
+ import torch.cuda.comm as comm
6
+ from torch.autograd.function import once_differentiable
7
+ from torch.utils.cpp_extension import load
8
+
9
+ _src_path = path.join(path.dirname(path.abspath(__file__)), "src")
10
+ _backend = load(name="inplace_abn",
11
+ extra_cflags=["-O3"],
12
+ sources=[path.join(_src_path, f) for f in [
13
+ "inplace_abn.cpp",
14
+ "inplace_abn_cpu.cpp",
15
+ "inplace_abn_cuda.cu",
16
+ "inplace_abn_cuda_half.cu"
17
+ ]],
18
+ extra_cuda_cflags=["--expt-extended-lambda"])
19
+
20
+ # Activation names
21
+ ACT_RELU = "relu"
22
+ ACT_LEAKY_RELU = "leaky_relu"
23
+ ACT_ELU = "elu"
24
+ ACT_NONE = "none"
25
+
26
+
27
+ def _check(fn, *args, **kwargs):
28
+ success = fn(*args, **kwargs)
29
+ if not success:
30
+ raise RuntimeError("CUDA Error encountered in {}".format(fn))
31
+
32
+
33
+ def _broadcast_shape(x):
34
+ out_size = []
35
+ for i, s in enumerate(x.size()):
36
+ if i != 1:
37
+ out_size.append(1)
38
+ else:
39
+ out_size.append(s)
40
+ return out_size
41
+
42
+
43
+ def _reduce(x):
44
+ if len(x.size()) == 2:
45
+ return x.sum(dim=0)
46
+ else:
47
+ n, c = x.size()[0:2]
48
+ return x.contiguous().view((n, c, -1)).sum(2).sum(0)
49
+
50
+
51
+ def _count_samples(x):
52
+ count = 1
53
+ for i, s in enumerate(x.size()):
54
+ if i != 1:
55
+ count *= s
56
+ return count
57
+
58
+
59
+ def _act_forward(ctx, x):
60
+ if ctx.activation == ACT_LEAKY_RELU:
61
+ _backend.leaky_relu_forward(x, ctx.slope)
62
+ elif ctx.activation == ACT_ELU:
63
+ _backend.elu_forward(x)
64
+ elif ctx.activation == ACT_NONE:
65
+ pass
66
+
67
+
68
+ def _act_backward(ctx, x, dx):
69
+ if ctx.activation == ACT_LEAKY_RELU:
70
+ _backend.leaky_relu_backward(x, dx, ctx.slope)
71
+ elif ctx.activation == ACT_ELU:
72
+ _backend.elu_backward(x, dx)
73
+ elif ctx.activation == ACT_NONE:
74
+ pass
75
+
76
+
77
+ class InPlaceABN(autograd.Function):
78
+ @staticmethod
79
+ def forward(ctx, x, weight, bias, running_mean, running_var,
80
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
81
+ # Save context
82
+ ctx.training = training
83
+ ctx.momentum = momentum
84
+ ctx.eps = eps
85
+ ctx.activation = activation
86
+ ctx.slope = slope
87
+ ctx.affine = weight is not None and bias is not None
88
+
89
+ # Prepare inputs
90
+ count = _count_samples(x)
91
+ x = x.contiguous()
92
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
93
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
94
+
95
+ if ctx.training:
96
+ mean, var = _backend.mean_var(x)
97
+
98
+ # Update running stats
99
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
100
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
101
+
102
+ # Mark in-place modified tensors
103
+ ctx.mark_dirty(x, running_mean, running_var)
104
+ else:
105
+ mean, var = running_mean.contiguous(), running_var.contiguous()
106
+ ctx.mark_dirty(x)
107
+
108
+ # BN forward + activation
109
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
110
+ _act_forward(ctx, x)
111
+
112
+ # Output
113
+ ctx.var = var
114
+ ctx.save_for_backward(x, var, weight, bias)
115
+ ctx.mark_non_differentiable(running_mean, running_var)
116
+ return x, running_mean, running_var
117
+
118
+ @staticmethod
119
+ @once_differentiable
120
+ def backward(ctx, dz, _drunning_mean, _drunning_var):
121
+ z, var, weight, bias = ctx.saved_tensors
122
+ dz = dz.contiguous()
123
+
124
+ # Undo activation
125
+ _act_backward(ctx, z, dz)
126
+
127
+ if ctx.training:
128
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
129
+ else:
130
+ # TODO: implement simplified CUDA backward for inference mode
131
+ edz = dz.new_zeros(dz.size(1))
132
+ eydz = dz.new_zeros(dz.size(1))
133
+
134
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
135
+ # dweight = eydz * weight.sign() if ctx.affine else None
136
+ dweight = eydz if ctx.affine else None
137
+ if dweight is not None:
138
+ dweight[weight < 0] *= -1
139
+ dbias = edz if ctx.affine else None
140
+
141
+ return dx, dweight, dbias, None, None, None, None, None, None, None
142
+
143
+
144
+ class InPlaceABNSync(autograd.Function):
145
+ @classmethod
146
+ def forward(cls, ctx, x, weight, bias, running_mean, running_var,
147
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
148
+ # Save context
149
+ ctx.training = training
150
+ ctx.momentum = momentum
151
+ ctx.eps = eps
152
+ ctx.activation = activation
153
+ ctx.slope = slope
154
+ ctx.affine = weight is not None and bias is not None
155
+
156
+ # Prepare inputs
157
+ ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
158
+
159
+ # count = _count_samples(x)
160
+ batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)
161
+
162
+ x = x.contiguous()
163
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
164
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
165
+
166
+ if ctx.training:
167
+ mean, var = _backend.mean_var(x)
168
+ if ctx.world_size > 1:
169
+ # get global batch size
170
+ if equal_batches:
171
+ batch_size *= ctx.world_size
172
+ else:
173
+ dist.all_reduce(batch_size, dist.ReduceOp.SUM)
174
+
175
+ ctx.factor = x.shape[0] / float(batch_size.item())
176
+
177
+ mean_all = mean.clone() * ctx.factor
178
+ dist.all_reduce(mean_all, dist.ReduceOp.SUM)
179
+
180
+ var_all = (var + (mean - mean_all) ** 2) * ctx.factor
181
+ dist.all_reduce(var_all, dist.ReduceOp.SUM)
182
+
183
+ mean = mean_all
184
+ var = var_all
185
+
186
+ # Update running stats
187
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
188
+ count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
189
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
190
+
191
+ # Mark in-place modified tensors
192
+ ctx.mark_dirty(x, running_mean, running_var)
193
+ else:
194
+ mean, var = running_mean.contiguous(), running_var.contiguous()
195
+ ctx.mark_dirty(x)
196
+
197
+ # BN forward + activation
198
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
199
+ _act_forward(ctx, x)
200
+
201
+ # Output
202
+ ctx.var = var
203
+ ctx.save_for_backward(x, var, weight, bias)
204
+ ctx.mark_non_differentiable(running_mean, running_var)
205
+ return x, running_mean, running_var
206
+
207
+ @staticmethod
208
+ @once_differentiable
209
+ def backward(ctx, dz, _drunning_mean, _drunning_var):
210
+ z, var, weight, bias = ctx.saved_tensors
211
+ dz = dz.contiguous()
212
+
213
+ # Undo activation
214
+ _act_backward(ctx, z, dz)
215
+
216
+ if ctx.training:
217
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
218
+ edz_local = edz.clone()
219
+ eydz_local = eydz.clone()
220
+
221
+ if ctx.world_size > 1:
222
+ edz *= ctx.factor
223
+ dist.all_reduce(edz, dist.ReduceOp.SUM)
224
+
225
+ eydz *= ctx.factor
226
+ dist.all_reduce(eydz, dist.ReduceOp.SUM)
227
+ else:
228
+ edz_local = edz = dz.new_zeros(dz.size(1))
229
+ eydz_local = eydz = dz.new_zeros(dz.size(1))
230
+
231
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
232
+ # dweight = eydz_local * weight.sign() if ctx.affine else None
233
+ dweight = eydz_local if ctx.affine else None
234
+ if dweight is not None:
235
+ dweight[weight < 0] *= -1
236
+ dbias = edz_local if ctx.affine else None
237
+
238
+ return dx, dweight, dbias, None, None, None, None, None, None, None
239
+
240
+
241
+ inplace_abn = InPlaceABN.apply
242
+ inplace_abn_sync = InPlaceABNSync.apply
243
+
244
+ __all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
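Calling the autograd function directly looks roughly like the sketch below; it assumes the JIT build of the `inplace_abn` extension succeeded and a CUDA device is available. The arguments mirror InPlaceABN.forward above.

import torch
from annotator.segm.modules.functions import inplace_abn, ACT_LEAKY_RELU

x = torch.randn(2, 16, 8, 8, device="cuda")
weight = torch.rand(16, device="cuda", requires_grad=True)
bias = torch.zeros(16, device="cuda", requires_grad=True)
running_mean = torch.zeros(16, device="cuda")
running_var = torch.ones(16, device="cuda")

# note: x is overwritten in place with the normalised, activated output
y, running_mean, running_var = inplace_abn(x, weight, bias, running_mean, running_var,
                                           True, 0.1, 1e-5, ACT_LEAKY_RELU, 0.01)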
annotator/segm/modules/misc.py ADDED
@@ -0,0 +1,21 @@
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.distributed as dist
4
+
5
+ class GlobalAvgPool2d(nn.Module):
6
+ def __init__(self):
7
+ """Global average pooling over the input's spatial dimensions"""
8
+ super(GlobalAvgPool2d, self).__init__()
9
+
10
+ def forward(self, inputs):
11
+ in_size = inputs.size()
12
+ return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
13
+
14
+ class SingleGPU(nn.Module):
15
+ def __init__(self, module):
16
+ super(SingleGPU, self).__init__()
17
+ self.module=module
18
+
19
+ def forward(self, input):
20
+ return self.module(input.cuda(non_blocking=True))
21
+
annotator/segm/modules/residual.py ADDED
@@ -0,0 +1,182 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch.nn as nn
4
+
5
+ from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
6
+ import torch.nn.functional as functional
7
+
8
+
9
+ class ResidualBlock(nn.Module):
10
+ """Configurable residual block
11
+
12
+ Parameters
13
+ ----------
14
+ in_channels : int
15
+ Number of input channels.
16
+ channels : list of int
17
+ Number of channels in the internal feature maps. Can either have two or three elements: if two, construct
18
+ a residual block with two `3 x 3` convolutions; if three, construct a bottleneck block with `1 x 1`, then
19
+ `3 x 3` then `1 x 1` convolutions.
20
+ stride : int
21
+ Stride of the first `3 x 3` convolution
22
+ dilation : int
23
+ Dilation to apply to the `3 x 3` convolutions.
24
+ groups : int
25
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
26
+ bottleneck blocks.
27
+ norm_act : callable
28
+ Function to create normalization / activation Module.
29
+ dropout: callable
30
+ Function to create Dropout Module.
31
+ """
32
+
33
+ def __init__(self,
34
+ in_channels,
35
+ channels,
36
+ stride=1,
37
+ dilation=1,
38
+ groups=1,
39
+ norm_act=ABN,
40
+ dropout=None):
41
+ super(ResidualBlock, self).__init__()
42
+
43
+ # Check parameters for inconsistencies
44
+ if len(channels) != 2 and len(channels) != 3:
45
+ raise ValueError("channels must contain either two or three values")
46
+ if len(channels) == 2 and groups != 1:
47
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
48
+
49
+ is_bottleneck = len(channels) == 3
50
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
51
+
52
+ if not is_bottleneck:
53
+ bn2 = norm_act(channels[1])
54
+ bn2.activation = ACT_NONE
55
+ layers = [
56
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
57
+ dilation=dilation)),
58
+ ("bn1", norm_act(channels[0])),
59
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
60
+ dilation=dilation)),
61
+ ("bn2", bn2)
62
+ ]
63
+ if dropout is not None:
64
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
65
+ else:
66
+ bn3 = norm_act(channels[2])
67
+ bn3.activation = ACT_NONE
68
+ layers = [
69
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
70
+ ("bn1", norm_act(channels[0])),
71
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
72
+ groups=groups, dilation=dilation)),
73
+ ("bn2", norm_act(channels[1])),
74
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
75
+ ("bn3", bn3)
76
+ ]
77
+ if dropout is not None:
78
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
79
+ self.convs = nn.Sequential(OrderedDict(layers))
80
+
81
+ if need_proj_conv:
82
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
83
+ self.proj_bn = norm_act(channels[-1])
84
+ self.proj_bn.activation = ACT_NONE
85
+
86
+ def forward(self, x):
87
+ if hasattr(self, "proj_conv"):
88
+ residual = self.proj_conv(x)
89
+ residual = self.proj_bn(residual)
90
+ else:
91
+ residual = x
92
+ x = self.convs(x) + residual
93
+
94
+ if self.convs.bn1.activation == ACT_LEAKY_RELU:
95
+ return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
96
+ elif self.convs.bn1.activation == ACT_ELU:
97
+ return functional.elu(x, inplace=True)
98
+ else:
99
+ return x
100
+
101
+
102
+ class IdentityResidualBlock(nn.Module):
103
+ def __init__(self,
104
+ in_channels,
105
+ channels,
106
+ stride=1,
107
+ dilation=1,
108
+ groups=1,
109
+ norm_act=ABN,
110
+ dropout=None):
111
+ """Configurable identity-mapping residual block
112
+
113
+ Parameters
114
+ ----------
115
+ in_channels : int
116
+ Number of input channels.
117
+ channels : list of int
118
+ Number of channels in the internal feature maps. Can either have two or three elements: if three construct
119
+ a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
120
+ `3 x 3` then `1 x 1` convolutions.
121
+ stride : int
122
+ Stride of the first `3 x 3` convolution
123
+ dilation : int
124
+ Dilation to apply to the `3 x 3` convolutions.
125
+ groups : int
126
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
127
+ bottleneck blocks.
128
+ norm_act : callable
129
+ Function to create normalization / activation Module.
130
+ dropout: callable
131
+ Function to create Dropout Module.
132
+ """
133
+ super(IdentityResidualBlock, self).__init__()
134
+
135
+ # Check parameters for inconsistencies
136
+ if len(channels) != 2 and len(channels) != 3:
137
+ raise ValueError("channels must contain either two or three values")
138
+ if len(channels) == 2 and groups != 1:
139
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
140
+
141
+ is_bottleneck = len(channels) == 3
142
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
143
+
144
+ self.bn1 = norm_act(in_channels)
145
+ if not is_bottleneck:
146
+ layers = [
147
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
148
+ dilation=dilation)),
149
+ ("bn2", norm_act(channels[0])),
150
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
151
+ dilation=dilation))
152
+ ]
153
+ if dropout is not None:
154
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
155
+ else:
156
+ layers = [
157
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
158
+ ("bn2", norm_act(channels[0])),
159
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
160
+ groups=groups, dilation=dilation)),
161
+ ("bn3", norm_act(channels[1])),
162
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
163
+ ]
164
+ if dropout is not None:
165
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
166
+ self.convs = nn.Sequential(OrderedDict(layers))
167
+
168
+ if need_proj_conv:
169
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
170
+
171
+ def forward(self, x):
172
+ if hasattr(self, "proj_conv"):
173
+ bn1 = self.bn1(x)
174
+ shortcut = self.proj_conv(bn1)
175
+ else:
176
+ shortcut = x.clone()
177
+ bn1 = self.bn1(x)
178
+
179
+ out = self.convs(bn1)
180
+ out.add_(shortcut)
181
+
182
+ return out
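A minimal construction sketch for the bottleneck variant of IdentityResidualBlock (three channel values give the 1 x 1, 3 x 3, 1 x 1 stack; same JIT-build assumption as above for the package import).

import torch
from annotator.segm.modules.bn import ABN
from annotator.segm.modules.residual import IdentityResidualBlock

block = IdentityResidualBlock(in_channels=64, channels=(32, 32, 128), stride=2, norm_act=ABN)
out = block(torch.randn(1, 64, 32, 32))
# a 1x1 projection shortcut is used because stride != 1 and in_channels != channels[-1]
print(out.shape)   # torch.Size([1, 128, 16, 16])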
annotator/segm/modules/src/checks.h ADDED
@@ -0,0 +1,15 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
annotator/segm/modules/src/inplace_abn.cpp ADDED
@@ -0,0 +1,95 @@
1
+ #include <torch/extension.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "inplace_abn.h"
6
+
7
+ std::vector<at::Tensor> mean_var(at::Tensor x) {
8
+ if (x.is_cuda()) {
9
+ if (x.type().scalarType() == at::ScalarType::Half) {
10
+ return mean_var_cuda_h(x);
11
+ } else {
12
+ return mean_var_cuda(x);
13
+ }
14
+ } else {
15
+ return mean_var_cpu(x);
16
+ }
17
+ }
18
+
19
+ at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
20
+ bool affine, float eps) {
21
+ if (x.is_cuda()) {
22
+ if (x.type().scalarType() == at::ScalarType::Half) {
23
+ return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
24
+ } else {
25
+ return forward_cuda(x, mean, var, weight, bias, affine, eps);
26
+ }
27
+ } else {
28
+ return forward_cpu(x, mean, var, weight, bias, affine, eps);
29
+ }
30
+ }
31
+
32
+ std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
33
+ bool affine, float eps) {
34
+ if (z.is_cuda()) {
35
+ if (z.type().scalarType() == at::ScalarType::Half) {
36
+ return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
37
+ } else {
38
+ return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
39
+ }
40
+ } else {
41
+ return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
42
+ }
43
+ }
44
+
45
+ at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
46
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
47
+ if (z.is_cuda()) {
48
+ if (z.type().scalarType() == at::ScalarType::Half) {
49
+ return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
50
+ } else {
51
+ return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
52
+ }
53
+ } else {
54
+ return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
55
+ }
56
+ }
57
+
58
+ void leaky_relu_forward(at::Tensor z, float slope) {
59
+ at::leaky_relu_(z, slope);
60
+ }
61
+
62
+ void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
63
+ if (z.is_cuda()) {
64
+ if (z.type().scalarType() == at::ScalarType::Half) {
65
+ return leaky_relu_backward_cuda_h(z, dz, slope);
66
+ } else {
67
+ return leaky_relu_backward_cuda(z, dz, slope);
68
+ }
69
+ } else {
70
+ return leaky_relu_backward_cpu(z, dz, slope);
71
+ }
72
+ }
73
+
74
+ void elu_forward(at::Tensor z) {
75
+ at::elu_(z);
76
+ }
77
+
78
+ void elu_backward(at::Tensor z, at::Tensor dz) {
79
+ if (z.is_cuda()) {
80
+ return elu_backward_cuda(z, dz);
81
+ } else {
82
+ return elu_backward_cpu(z, dz);
83
+ }
84
+ }
85
+
86
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
87
+ m.def("mean_var", &mean_var, "Mean and variance computation");
88
+ m.def("forward", &forward, "In-place forward computation");
89
+ m.def("edz_eydz", &edz_eydz, "First part of backward computation");
90
+ m.def("backward", &backward, "Second part of backward computation");
91
+ m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
92
+ m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
93
+ m.def("elu_forward", &elu_forward, "Elu forward computation");
94
+ m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
95
+ }
annotator/segm/modules/src/inplace_abn.h ADDED
@@ -0,0 +1,88 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ #include <vector>
6
+
7
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
8
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
9
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
10
+
11
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
12
+ bool affine, float eps);
13
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
14
+ bool affine, float eps);
15
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
16
+ bool affine, float eps);
17
+
18
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
19
+ bool affine, float eps);
20
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
21
+ bool affine, float eps);
22
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
23
+ bool affine, float eps);
24
+
25
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
26
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
27
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
28
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
29
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
30
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
31
+
32
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
33
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
34
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
35
+
36
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz);
37
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz);
38
+
39
+ static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
40
+ num = x.size(0);
41
+ chn = x.size(1);
42
+ sp = 1;
43
+ for (int64_t i = 2; i < x.ndimension(); ++i)
44
+ sp *= x.size(i);
45
+ }
46
+
47
+ /*
48
+ * Specialized CUDA reduction functions for BN
49
+ */
50
+ #ifdef __CUDACC__
51
+
52
+ #include "utils/cuda.cuh"
53
+
54
+ template <typename T, typename Op>
55
+ __device__ T reduce(Op op, int plane, int N, int S) {
56
+ T sum = (T)0;
57
+ for (int batch = 0; batch < N; ++batch) {
58
+ for (int x = threadIdx.x; x < S; x += blockDim.x) {
59
+ sum += op(batch, plane, x);
60
+ }
61
+ }
62
+
63
+ // sum over NumThreads within a warp
64
+ sum = warpSum(sum);
65
+
66
+ // 'transpose', and reduce within warp again
67
+ __shared__ T shared[32];
68
+ __syncthreads();
69
+ if (threadIdx.x % WARP_SIZE == 0) {
70
+ shared[threadIdx.x / WARP_SIZE] = sum;
71
+ }
72
+ if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
73
+ // zero out the other entries in shared
74
+ shared[threadIdx.x] = (T)0;
75
+ }
76
+ __syncthreads();
77
+ if (threadIdx.x / WARP_SIZE == 0) {
78
+ sum = warpSum(shared[threadIdx.x]);
79
+ if (threadIdx.x == 0) {
80
+ shared[0] = sum;
81
+ }
82
+ }
83
+ __syncthreads();
84
+
85
+ // Everyone picks it up, should be broadcast into the whole gradInput
86
+ return shared[0];
87
+ }
88
+ #endif
annotator/segm/modules/src/inplace_abn_cpu.cpp ADDED
@@ -0,0 +1,119 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "utils/checks.h"
6
+ #include "inplace_abn.h"
7
+
8
+ at::Tensor reduce_sum(at::Tensor x) {
9
+ if (x.ndimension() == 2) {
10
+ return x.sum(0);
11
+ } else {
12
+ auto x_view = x.view({x.size(0), x.size(1), -1});
13
+ return x_view.sum(-1).sum(0);
14
+ }
15
+ }
16
+
17
+ at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
18
+ if (x.ndimension() == 2) {
19
+ return v;
20
+ } else {
21
+ std::vector<int64_t> broadcast_size = {1, -1};
22
+ for (int64_t i = 2; i < x.ndimension(); ++i)
23
+ broadcast_size.push_back(1);
24
+
25
+ return v.view(broadcast_size);
26
+ }
27
+ }
28
+
29
+ int64_t count(at::Tensor x) {
30
+ int64_t count = x.size(0);
31
+ for (int64_t i = 2; i < x.ndimension(); ++i)
32
+ count *= x.size(i);
33
+
34
+ return count;
35
+ }
36
+
37
+ at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
38
+ if (affine) {
39
+ return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
40
+ } else {
41
+ return z;
42
+ }
43
+ }
44
+
45
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
46
+ auto num = count(x);
47
+ auto mean = reduce_sum(x) / num;
48
+ auto diff = x - broadcast_to(mean, x);
49
+ auto var = reduce_sum(diff.pow(2)) / num;
50
+
51
+ return {mean, var};
52
+ }
53
+
54
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
55
+ bool affine, float eps) {
56
+ auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
57
+ auto mul = at::rsqrt(var + eps) * gamma;
58
+
59
+ x.sub_(broadcast_to(mean, x));
60
+ x.mul_(broadcast_to(mul, x));
61
+ if (affine) x.add_(broadcast_to(bias, x));
62
+
63
+ return x;
64
+ }
65
+
66
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
67
+ bool affine, float eps) {
68
+ auto edz = reduce_sum(dz);
69
+ auto y = invert_affine(z, weight, bias, affine, eps);
70
+ auto eydz = reduce_sum(y * dz);
71
+
72
+ return {edz, eydz};
73
+ }
74
+
75
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
76
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
77
+ auto y = invert_affine(z, weight, bias, affine, eps);
78
+ auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
79
+
80
+ auto num = count(z);
81
+ auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
82
+ return dx;
83
+ }
84
+
85
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
86
+ CHECK_CPU_INPUT(z);
87
+ CHECK_CPU_INPUT(dz);
88
+
89
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
90
+ int64_t count = z.numel();
91
+ auto *_z = z.data<scalar_t>();
92
+ auto *_dz = dz.data<scalar_t>();
93
+
94
+ for (int64_t i = 0; i < count; ++i) {
95
+ if (_z[i] < 0) {
96
+ _z[i] *= 1 / slope;
97
+ _dz[i] *= slope;
98
+ }
99
+ }
100
+ }));
101
+ }
102
+
103
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
104
+ CHECK_CPU_INPUT(z);
105
+ CHECK_CPU_INPUT(dz);
106
+
107
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
108
+ int64_t count = z.numel();
109
+ auto *_z = z.data<scalar_t>();
110
+ auto *_dz = dz.data<scalar_t>();
111
+
112
+ for (int64_t i = 0; i < count; ++i) {
113
+ if (_z[i] < 0) {
114
+ _z[i] = log1p(_z[i]);
115
+ _dz[i] *= (_z[i] + 1.f);
116
+ }
117
+ }
118
+ }));
119
+ }
annotator/segm/modules/src/inplace_abn_cuda.cu ADDED
@@ -0,0 +1,333 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <thrust/device_ptr.h>
4
+ #include <thrust/transform.h>
5
+
6
+ #include <vector>
7
+
8
+ #include "utils/checks.h"
9
+ #include "utils/cuda.cuh"
10
+ #include "inplace_abn.h"
11
+
12
+ #include <ATen/cuda/CUDAContext.h>
13
+
14
+ // Operations for reduce
15
+ template<typename T>
16
+ struct SumOp {
17
+ __device__ SumOp(const T *t, int c, int s)
18
+ : tensor(t), chn(c), sp(s) {}
19
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
20
+ return tensor[(batch * chn + plane) * sp + n];
21
+ }
22
+ const T *tensor;
23
+ const int chn;
24
+ const int sp;
25
+ };
26
+
27
+ template<typename T>
28
+ struct VarOp {
29
+ __device__ VarOp(T m, const T *t, int c, int s)
30
+ : mean(m), tensor(t), chn(c), sp(s) {}
31
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
32
+ T val = tensor[(batch * chn + plane) * sp + n];
33
+ return (val - mean) * (val - mean);
34
+ }
35
+ const T mean;
36
+ const T *tensor;
37
+ const int chn;
38
+ const int sp;
39
+ };
40
+
41
+ template<typename T>
42
+ struct GradOp {
43
+ __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
44
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
45
+ __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
46
+ T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
47
+ T _dz = dz[(batch * chn + plane) * sp + n];
48
+ return Pair<T>(_dz, _y * _dz);
49
+ }
50
+ const T weight;
51
+ const T bias;
52
+ const T *z;
53
+ const T *dz;
54
+ const int chn;
55
+ const int sp;
56
+ };
57
+
58
+ /***********
59
+ * mean_var
60
+ ***********/
61
+
62
+ template<typename T>
63
+ __global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
64
+ int plane = blockIdx.x;
65
+ T norm = T(1) / T(num * sp);
66
+
67
+ T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
68
+ __syncthreads();
69
+ T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
70
+
71
+ if (threadIdx.x == 0) {
72
+ mean[plane] = _mean;
73
+ var[plane] = _var;
74
+ }
75
+ }
76
+
77
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
78
+ CHECK_CUDA_INPUT(x);
79
+
80
+ // Extract dimensions
81
+ int64_t num, chn, sp;
82
+ get_dims(x, num, chn, sp);
83
+
84
+ // Prepare output tensors
85
+ auto mean = at::empty({chn}, x.options());
86
+ auto var = at::empty({chn}, x.options());
87
+
88
+ // Run kernel
89
+ dim3 blocks(chn);
90
+ dim3 threads(getNumThreads(sp));
91
+ auto stream = at::cuda::getCurrentCUDAStream();
92
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
93
+ mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
94
+ x.data<scalar_t>(),
95
+ mean.data<scalar_t>(),
96
+ var.data<scalar_t>(),
97
+ num, chn, sp);
98
+ }));
99
+
100
+ return {mean, var};
101
+ }
102
+
103
+ /**********
104
+ * forward
105
+ **********/
106
+
107
+ template<typename T>
108
+ __global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
109
+ bool affine, float eps, int num, int chn, int sp) {
110
+ int plane = blockIdx.x;
111
+
112
+ T _mean = mean[plane];
113
+ T _var = var[plane];
114
+ T _weight = affine ? abs(weight[plane]) + eps : T(1);
115
+ T _bias = affine ? bias[plane] : T(0);
116
+
117
+ T mul = rsqrt(_var + eps) * _weight;
118
+
119
+ for (int batch = 0; batch < num; ++batch) {
120
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
121
+ T _x = x[(batch * chn + plane) * sp + n];
122
+ T _y = (_x - _mean) * mul + _bias;
123
+
124
+ x[(batch * chn + plane) * sp + n] = _y;
125
+ }
126
+ }
127
+ }
128
+
129
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
130
+ bool affine, float eps) {
131
+ CHECK_CUDA_INPUT(x);
132
+ CHECK_CUDA_INPUT(mean);
133
+ CHECK_CUDA_INPUT(var);
134
+ CHECK_CUDA_INPUT(weight);
135
+ CHECK_CUDA_INPUT(bias);
136
+
137
+ // Extract dimensions
138
+ int64_t num, chn, sp;
139
+ get_dims(x, num, chn, sp);
140
+
141
+ // Run kernel
142
+ dim3 blocks(chn);
143
+ dim3 threads(getNumThreads(sp));
144
+ auto stream = at::cuda::getCurrentCUDAStream();
145
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
146
+ forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
147
+ x.data<scalar_t>(),
148
+ mean.data<scalar_t>(),
149
+ var.data<scalar_t>(),
150
+ weight.data<scalar_t>(),
151
+ bias.data<scalar_t>(),
152
+ affine, eps, num, chn, sp);
153
+ }));
154
+
155
+ return x;
156
+ }
157
+
158
+ /***********
159
+ * edz_eydz
160
+ ***********/
161
+
162
+ template<typename T>
163
+ __global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
164
+ T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
165
+ int plane = blockIdx.x;
166
+
167
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
168
+ T _bias = affine ? bias[plane] : 0.f;
169
+
170
+ Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
171
+ __syncthreads();
172
+
173
+ if (threadIdx.x == 0) {
174
+ edz[plane] = res.v1;
175
+ eydz[plane] = res.v2;
176
+ }
177
+ }
178
+
179
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
180
+ bool affine, float eps) {
181
+ CHECK_CUDA_INPUT(z);
182
+ CHECK_CUDA_INPUT(dz);
183
+ CHECK_CUDA_INPUT(weight);
184
+ CHECK_CUDA_INPUT(bias);
185
+
186
+ // Extract dimensions
187
+ int64_t num, chn, sp;
188
+ get_dims(z, num, chn, sp);
189
+
190
+ auto edz = at::empty({chn}, z.options());
191
+ auto eydz = at::empty({chn}, z.options());
192
+
193
+ // Run kernel
194
+ dim3 blocks(chn);
195
+ dim3 threads(getNumThreads(sp));
196
+ auto stream = at::cuda::getCurrentCUDAStream();
197
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
198
+ edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
199
+ z.data<scalar_t>(),
200
+ dz.data<scalar_t>(),
201
+ weight.data<scalar_t>(),
202
+ bias.data<scalar_t>(),
203
+ edz.data<scalar_t>(),
204
+ eydz.data<scalar_t>(),
205
+ affine, eps, num, chn, sp);
206
+ }));
207
+
208
+ return {edz, eydz};
209
+ }
210
+
211
+ /***********
212
+ * backward
213
+ ***********/
214
+
215
+ template<typename T>
216
+ __global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
217
+ const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
218
+ int plane = blockIdx.x;
219
+
220
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
221
+ T _bias = affine ? bias[plane] : 0.f;
222
+ T _var = var[plane];
223
+ T _edz = edz[plane];
224
+ T _eydz = eydz[plane];
225
+
226
+ T _mul = _weight * rsqrt(_var + eps);
227
+ T count = T(num * sp);
228
+
229
+ for (int batch = 0; batch < num; ++batch) {
230
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
231
+ T _dz = dz[(batch * chn + plane) * sp + n];
232
+ T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
233
+
234
+ dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
235
+ }
236
+ }
237
+ }
238
+
239
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
240
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
241
+ CHECK_CUDA_INPUT(z);
242
+ CHECK_CUDA_INPUT(dz);
243
+ CHECK_CUDA_INPUT(var);
244
+ CHECK_CUDA_INPUT(weight);
245
+ CHECK_CUDA_INPUT(bias);
246
+ CHECK_CUDA_INPUT(edz);
247
+ CHECK_CUDA_INPUT(eydz);
248
+
249
+ // Extract dimensions
250
+ int64_t num, chn, sp;
251
+ get_dims(z, num, chn, sp);
252
+
253
+ auto dx = at::zeros_like(z);
254
+
255
+ // Run kernel
256
+ dim3 blocks(chn);
257
+ dim3 threads(getNumThreads(sp));
258
+ auto stream = at::cuda::getCurrentCUDAStream();
259
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
260
+ backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
261
+ z.data<scalar_t>(),
262
+ dz.data<scalar_t>(),
263
+ var.data<scalar_t>(),
264
+ weight.data<scalar_t>(),
265
+ bias.data<scalar_t>(),
266
+ edz.data<scalar_t>(),
267
+ eydz.data<scalar_t>(),
268
+ dx.data<scalar_t>(),
269
+ affine, eps, num, chn, sp);
270
+ }));
271
+
272
+ return dx;
273
+ }
274
+
275
+ /**************
276
+ * activations
277
+ **************/
278
+
279
+ template<typename T>
280
+ inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
281
+ // Create thrust pointers
282
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
283
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
284
+
285
+ auto stream = at::cuda::getCurrentCUDAStream();
286
+ thrust::transform_if(thrust::cuda::par.on(stream),
287
+ th_dz, th_dz + count, th_z, th_dz,
288
+ [slope] __device__ (const T& dz) { return dz * slope; },
289
+ [] __device__ (const T& z) { return z < 0; });
290
+ thrust::transform_if(thrust::cuda::par.on(stream),
291
+ th_z, th_z + count, th_z,
292
+ [slope] __device__ (const T& z) { return z / slope; },
293
+ [] __device__ (const T& z) { return z < 0; });
294
+ }
295
+
296
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
297
+ CHECK_CUDA_INPUT(z);
298
+ CHECK_CUDA_INPUT(dz);
299
+
300
+ int64_t count = z.numel();
301
+
302
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
303
+ leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
304
+ }));
305
+ }
306
+
307
+ template<typename T>
308
+ inline void elu_backward_impl(T *z, T *dz, int64_t count) {
309
+ // Create thrust pointers
310
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
311
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
312
+
313
+ auto stream = at::cuda::getCurrentCUDAStream();
314
+ thrust::transform_if(thrust::cuda::par.on(stream),
315
+ th_dz, th_dz + count, th_z, th_z, th_dz,
316
+ [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
317
+ [] __device__ (const T& z) { return z < 0; });
318
+ thrust::transform_if(thrust::cuda::par.on(stream),
319
+ th_z, th_z + count, th_z,
320
+ [] __device__ (const T& z) { return log1p(z); },
321
+ [] __device__ (const T& z) { return z < 0; });
322
+ }
323
+
324
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
325
+ CHECK_CUDA_INPUT(z);
326
+ CHECK_CUDA_INPUT(dz);
327
+
328
+ int64_t count = z.numel();
329
+
330
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
331
+ elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
332
+ }));
333
+ }
annotator/segm/modules/src/inplace_abn_cuda_half.cu ADDED
@@ -0,0 +1,275 @@
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <cuda_fp16.h>
4
+
5
+ #include <vector>
6
+
7
+ #include "utils/checks.h"
8
+ #include "utils/cuda.cuh"
9
+ #include "inplace_abn.h"
10
+
11
+ #include <ATen/cuda/CUDAContext.h>
12
+
13
+ // Operations for reduce
14
+ struct SumOpH {
15
+ __device__ SumOpH(const half *t, int c, int s)
16
+ : tensor(t), chn(c), sp(s) {}
17
+ __device__ __forceinline__ float operator()(int batch, int plane, int n) {
18
+ return __half2float(tensor[(batch * chn + plane) * sp + n]);
19
+ }
20
+ const half *tensor;
21
+ const int chn;
22
+ const int sp;
23
+ };
24
+
25
+ struct VarOpH {
26
+ __device__ VarOpH(float m, const half *t, int c, int s)
27
+ : mean(m), tensor(t), chn(c), sp(s) {}
28
+ __device__ __forceinline__ float operator()(int batch, int plane, int n) {
29
+ const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
30
+ return (t - mean) * (t - mean);
31
+ }
32
+ const float mean;
33
+ const half *tensor;
34
+ const int chn;
35
+ const int sp;
36
+ };
37
+
38
+ struct GradOpH {
39
+ __device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
40
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
41
+ __device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
42
+ float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
43
+ float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
44
+ return Pair<float>(_dz, _y * _dz);
45
+ }
46
+ const float weight;
47
+ const float bias;
48
+ const half *z;
49
+ const half *dz;
50
+ const int chn;
51
+ const int sp;
52
+ };
53
+
54
+ /***********
55
+ * mean_var
56
+ ***********/
57
+
58
+ __global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
59
+ int plane = blockIdx.x;
60
+ float norm = 1.f / static_cast<float>(num * sp);
61
+
62
+ float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
63
+ __syncthreads();
64
+ float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
65
+
66
+ if (threadIdx.x == 0) {
67
+ mean[plane] = _mean;
68
+ var[plane] = _var;
69
+ }
70
+ }
71
+
72
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
73
+ CHECK_CUDA_INPUT(x);
74
+
75
+ // Extract dimensions
76
+ int64_t num, chn, sp;
77
+ get_dims(x, num, chn, sp);
78
+
79
+ // Prepare output tensors
80
+ auto mean = at::empty({chn},x.options().dtype(at::kFloat));
81
+ auto var = at::empty({chn},x.options().dtype(at::kFloat));
82
+
83
+ // Run kernel
84
+ dim3 blocks(chn);
85
+ dim3 threads(getNumThreads(sp));
86
+ auto stream = at::cuda::getCurrentCUDAStream();
87
+ mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
88
+ reinterpret_cast<half*>(x.data<at::Half>()),
89
+ mean.data<float>(),
90
+ var.data<float>(),
91
+ num, chn, sp);
92
+
93
+ return {mean, var};
94
+ }
95
+
96
+ /**********
97
+ * forward
98
+ **********/
99
+
100
+ __global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
101
+ bool affine, float eps, int num, int chn, int sp) {
102
+ int plane = blockIdx.x;
103
+
104
+ const float _mean = mean[plane];
105
+ const float _var = var[plane];
106
+ const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
107
+ const float _bias = affine ? bias[plane] : 0.f;
108
+
109
+ const float mul = rsqrt(_var + eps) * _weight;
110
+
111
+ for (int batch = 0; batch < num; ++batch) {
112
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
113
+ half *x_ptr = x + (batch * chn + plane) * sp + n;
114
+ float _x = __half2float(*x_ptr);
115
+ float _y = (_x - _mean) * mul + _bias;
116
+
117
+ *x_ptr = __float2half(_y);
118
+ }
119
+ }
120
+ }
121
+
122
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
123
+ bool affine, float eps) {
124
+ CHECK_CUDA_INPUT(x);
125
+ CHECK_CUDA_INPUT(mean);
126
+ CHECK_CUDA_INPUT(var);
127
+ CHECK_CUDA_INPUT(weight);
128
+ CHECK_CUDA_INPUT(bias);
129
+
130
+ // Extract dimensions
131
+ int64_t num, chn, sp;
132
+ get_dims(x, num, chn, sp);
133
+
134
+ // Run kernel
135
+ dim3 blocks(chn);
136
+ dim3 threads(getNumThreads(sp));
137
+ auto stream = at::cuda::getCurrentCUDAStream();
138
+ forward_kernel_h<<<blocks, threads, 0, stream>>>(
139
+ reinterpret_cast<half*>(x.data<at::Half>()),
140
+ mean.data<float>(),
141
+ var.data<float>(),
142
+ weight.data<float>(),
143
+ bias.data<float>(),
144
+ affine, eps, num, chn, sp);
145
+
146
+ return x;
147
+ }
148
+
149
+ __global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
150
+ float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
151
+ int plane = blockIdx.x;
152
+
153
+ float _weight = affine ? abs(weight[plane]) + eps : 1.f;
154
+ float _bias = affine ? bias[plane] : 0.f;
155
+
156
+ Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
157
+ __syncthreads();
158
+
159
+ if (threadIdx.x == 0) {
160
+ edz[plane] = res.v1;
161
+ eydz[plane] = res.v2;
162
+ }
163
+ }
164
+
165
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
166
+ bool affine, float eps) {
167
+ CHECK_CUDA_INPUT(z);
168
+ CHECK_CUDA_INPUT(dz);
169
+ CHECK_CUDA_INPUT(weight);
170
+ CHECK_CUDA_INPUT(bias);
171
+
172
+ // Extract dimensions
173
+ int64_t num, chn, sp;
174
+ get_dims(z, num, chn, sp);
175
+
176
+ auto edz = at::empty({chn},z.options().dtype(at::kFloat));
177
+ auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
178
+
179
+ // Run kernel
180
+ dim3 blocks(chn);
181
+ dim3 threads(getNumThreads(sp));
182
+ auto stream = at::cuda::getCurrentCUDAStream();
183
+ edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
184
+ reinterpret_cast<half*>(z.data<at::Half>()),
185
+ reinterpret_cast<half*>(dz.data<at::Half>()),
186
+ weight.data<float>(),
187
+ bias.data<float>(),
188
+ edz.data<float>(),
189
+ eydz.data<float>(),
190
+ affine, eps, num, chn, sp);
191
+
192
+ return {edz, eydz};
193
+ }
194
+
195
+ __global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
196
+ const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
197
+ int plane = blockIdx.x;
198
+
199
+ float _weight = affine ? abs(weight[plane]) + eps : 1.f;
200
+ float _bias = affine ? bias[plane] : 0.f;
201
+ float _var = var[plane];
202
+ float _edz = edz[plane];
203
+ float _eydz = eydz[plane];
204
+
205
+ float _mul = _weight * rsqrt(_var + eps);
206
+ float count = float(num * sp);
207
+
208
+ for (int batch = 0; batch < num; ++batch) {
209
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
210
+ float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
211
+ float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
212
+
213
+ dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
214
+ }
215
+ }
216
+ }
217
+
218
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
219
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
220
+ CHECK_CUDA_INPUT(z);
221
+ CHECK_CUDA_INPUT(dz);
222
+ CHECK_CUDA_INPUT(var);
223
+ CHECK_CUDA_INPUT(weight);
224
+ CHECK_CUDA_INPUT(bias);
225
+ CHECK_CUDA_INPUT(edz);
226
+ CHECK_CUDA_INPUT(eydz);
227
+
228
+ // Extract dimensions
229
+ int64_t num, chn, sp;
230
+ get_dims(z, num, chn, sp);
231
+
232
+ auto dx = at::zeros_like(z);
233
+
234
+ // Run kernel
235
+ dim3 blocks(chn);
236
+ dim3 threads(getNumThreads(sp));
237
+ auto stream = at::cuda::getCurrentCUDAStream();
238
+ backward_kernel_h<<<blocks, threads, 0, stream>>>(
239
+ reinterpret_cast<half*>(z.data<at::Half>()),
240
+ reinterpret_cast<half*>(dz.data<at::Half>()),
241
+ var.data<float>(),
242
+ weight.data<float>(),
243
+ bias.data<float>(),
244
+ edz.data<float>(),
245
+ eydz.data<float>(),
246
+ reinterpret_cast<half*>(dx.data<at::Half>()),
247
+ affine, eps, num, chn, sp);
248
+
249
+ return dx;
250
+ }
251
+
252
+ __global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
253
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
254
+ float _z = __half2float(z[i]);
255
+ if (_z < 0) {
256
+ dz[i] = __float2half(__half2float(dz[i]) * slope);
257
+ z[i] = __float2half(_z / slope);
258
+ }
259
+ }
260
+ }
261
+
262
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
263
+ CHECK_CUDA_INPUT(z);
264
+ CHECK_CUDA_INPUT(dz);
265
+
266
+ int64_t count = z.numel();
267
+ dim3 threads(getNumThreads(count));
268
+ dim3 blocks = (count + threads.x - 1) / threads.x;
269
+ auto stream = at::cuda::getCurrentCUDAStream();
270
+ leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
271
+ reinterpret_cast<half*>(z.data<at::Half>()),
272
+ reinterpret_cast<half*>(dz.data<at::Half>()),
273
+ slope, count);
274
+ }
275
+
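The half-precision variant keeps activations in fp16 but accumulates every per-channel statistic in float32 (all loads go through __half2float, all stores through __float2half). A rough PyTorch equivalent of mean_var_cuda_h for an NCHW half tensor, assuming biased variance to match the kernel's 1/(num*sp) normalisation:

import torch

def mean_var_fp16(x):
    # x: (N, C, H, W) in torch.half; statistics returned in float32, as in mean_var_cuda_h.
    c = x.shape[1]
    xf = x.float().transpose(0, 1).reshape(c, -1)    # (C, N*H*W)
    return xf.mean(dim=1), xf.var(dim=1, unbiased=False)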
annotator/segm/modules/src/utils/checks.h ADDED
@@ -0,0 +1,15 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
annotator/segm/modules/src/utils/common.h ADDED
@@ -0,0 +1,49 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ /*
6
+ * Functions to share code between CPU and GPU
7
+ */
8
+
9
+ #ifdef __CUDACC__
10
+ // CUDA versions
11
+
12
+ #define HOST_DEVICE __host__ __device__
13
+ #define INLINE_HOST_DEVICE __host__ __device__ inline
14
+ #define FLOOR(x) floor(x)
15
+
16
+ #if __CUDA_ARCH__ >= 600
17
+ // Recent compute capabilities have block-level atomicAdd for all data types, so we use that
18
+ #define ACCUM(x,y) atomicAdd_block(&(x),(y))
19
+ #else
20
+ // Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
21
+ // and use the known atomicCAS-based implementation for double
22
+ template<typename data_t>
23
+ __device__ inline data_t atomic_add(data_t *address, data_t val) {
24
+ return atomicAdd(address, val);
25
+ }
26
+
27
+ template<>
28
+ __device__ inline double atomic_add(double *address, double val) {
29
+ unsigned long long int* address_as_ull = (unsigned long long int*)address;
30
+ unsigned long long int old = *address_as_ull, assumed;
31
+ do {
32
+ assumed = old;
33
+ old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
34
+ } while (assumed != old);
35
+ return __longlong_as_double(old);
36
+ }
37
+
38
+ #define ACCUM(x,y) atomic_add(&(x),(y))
39
+ #endif // #if __CUDA_ARCH__ >= 600
40
+
41
+ #else
42
+ // CPU versions
43
+
44
+ #define HOST_DEVICE
45
+ #define INLINE_HOST_DEVICE inline
46
+ #define FLOOR(x) std::floor(x)
47
+ #define ACCUM(x,y) (x) += (y)
48
+
49
+ #endif // #ifdef __CUDACC__
annotator/segm/modules/src/utils/cuda.cuh ADDED
@@ -0,0 +1,71 @@
1
+ #pragma once
2
+
3
+ /*
4
+ * General settings and functions
5
+ */
6
+ const int WARP_SIZE = 32;
7
+ const int MAX_BLOCK_SIZE = 1024;
8
+
9
+ static int getNumThreads(int nElem) {
10
+ int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
11
+ for (int i = 0; i < 6; ++i) {
12
+ if (nElem <= threadSizes[i]) {
13
+ return threadSizes[i];
14
+ }
15
+ }
16
+ return MAX_BLOCK_SIZE;
17
+ }
18
+
19
+ /*
20
+ * Reduction utilities
21
+ */
22
+ template <typename T>
23
+ __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
24
+ unsigned int mask = 0xffffffff) {
25
+ #if CUDART_VERSION >= 9000
26
+ return __shfl_xor_sync(mask, value, laneMask, width);
27
+ #else
28
+ return __shfl_xor(value, laneMask, width);
29
+ #endif
30
+ }
31
+
32
+ __device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
33
+
34
+ template<typename T>
35
+ struct Pair {
36
+ T v1, v2;
37
+ __device__ Pair() {}
38
+ __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
39
+ __device__ Pair(T v) : v1(v), v2(v) {}
40
+ __device__ Pair(int v) : v1(v), v2(v) {}
41
+ __device__ Pair &operator+=(const Pair<T> &a) {
42
+ v1 += a.v1;
43
+ v2 += a.v2;
44
+ return *this;
45
+ }
46
+ };
47
+
48
+ template<typename T>
49
+ static __device__ __forceinline__ T warpSum(T val) {
50
+ #if __CUDA_ARCH__ >= 300
51
+ for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
52
+ val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
53
+ }
54
+ #else
55
+ __shared__ T values[MAX_BLOCK_SIZE];
56
+ values[threadIdx.x] = val;
57
+ __threadfence_block();
58
+ const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
59
+ for (int i = 1; i < WARP_SIZE; i++) {
60
+ val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
61
+ }
62
+ #endif
63
+ return val;
64
+ }
65
+
66
+ template<typename T>
67
+ static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
68
+ value.v1 = warpSum(value.v1);
69
+ value.v2 = warpSum(value.v2);
70
+ return value;
71
+ }
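getNumThreads above rounds the per-channel spatial size up to the next block size in {32, 64, 128, 256, 512, 1024}; the launchers then pair it with one block per channel. The same selection written out in Python, purely for illustration:

def get_num_threads(n_elem, max_block_size=1024):
    # Smallest candidate block size that covers n_elem, capped at MAX_BLOCK_SIZE.
    for size in (32, 64, 128, 256, 512, max_block_size):
        if n_elem <= size:
            return size
    return max_block_size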
annotator/segm/networks/AugmentCE2P.py ADDED
@@ -0,0 +1,337 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : AugmentCE2P.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.nn import functional as F
19
+ # Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn
20
+ # By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer
21
+ from ..modules import InPlaceABNSync
22
+
23
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
24
+
25
+ affine_par = True
26
+
27
+ pretrained_settings = {
28
+ 'resnet101': {
29
+ 'imagenet': {
30
+ 'input_space': 'BGR',
31
+ 'input_size': [3, 224, 224],
32
+ 'input_range': [0, 1],
33
+ 'mean': [0.406, 0.456, 0.485],
34
+ 'std': [0.225, 0.224, 0.229],
35
+ 'num_classes': 1000
36
+ }
37
+ },
38
+ }
39
+
40
+
41
+ def conv3x3(in_planes, out_planes, stride=1):
42
+ "3x3 convolution with padding"
43
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
44
+ padding=1, bias=False)
45
+
46
+
47
+ class Bottleneck(nn.Module):
48
+ expansion = 4
49
+
50
+ def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
51
+ super(Bottleneck, self).__init__()
52
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
53
+ self.bn1 = BatchNorm2d(planes)
54
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
55
+ padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False)
56
+ self.bn2 = BatchNorm2d(planes)
57
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
58
+ self.bn3 = BatchNorm2d(planes * 4)
59
+ self.relu = nn.ReLU(inplace=False)
60
+ self.relu_inplace = nn.ReLU(inplace=True)
61
+ self.downsample = downsample
62
+ self.dilation = dilation
63
+ self.stride = stride
64
+
65
+ def forward(self, x):
66
+ residual = x
67
+
68
+ out = self.conv1(x)
69
+ out = self.bn1(out)
70
+ out = self.relu(out)
71
+
72
+ out = self.conv2(out)
73
+ out = self.bn2(out)
74
+ out = self.relu(out)
75
+
76
+ out = self.conv3(out)
77
+ out = self.bn3(out)
78
+
79
+ if self.downsample is not None:
80
+ residual = self.downsample(x)
81
+
82
+ out = out + residual
83
+ out = self.relu_inplace(out)
84
+
85
+ return out
86
+
87
+
88
+ class PSPModule(nn.Module):
89
+ """
90
+ Reference:
91
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
92
+ """
93
+
94
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
95
+ super(PSPModule, self).__init__()
96
+
97
+ self.stages = []
98
+ self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
99
+ self.bottleneck = nn.Sequential(
100
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
101
+ bias=False),
102
+ InPlaceABNSync(out_features),
103
+ )
104
+
105
+ def _make_stage(self, features, out_features, size):
106
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
107
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
108
+ bn = InPlaceABNSync(out_features)
109
+ return nn.Sequential(prior, conv, bn)
110
+
111
+ def forward(self, feats):
112
+ h, w = feats.size(2), feats.size(3)
113
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
114
+ self.stages] + [feats]
115
+ bottle = self.bottleneck(torch.cat(priors, 1))
116
+ return bottle
117
+
118
+
119
+ class ASPPModule(nn.Module):
120
+ """
121
+ Reference:
122
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
123
+ """
124
+
125
+ def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
126
+ super(ASPPModule, self).__init__()
127
+
128
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
129
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
130
+ bias=False),
131
+ InPlaceABNSync(inner_features))
132
+ self.conv2 = nn.Sequential(
133
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
134
+ InPlaceABNSync(inner_features))
135
+ self.conv3 = nn.Sequential(
136
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
137
+ InPlaceABNSync(inner_features))
138
+ self.conv4 = nn.Sequential(
139
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
140
+ InPlaceABNSync(inner_features))
141
+ self.conv5 = nn.Sequential(
142
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
143
+ InPlaceABNSync(inner_features))
144
+
145
+ self.bottleneck = nn.Sequential(
146
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
147
+ InPlaceABNSync(out_features),
148
+ nn.Dropout2d(0.1)
149
+ )
150
+
151
+ def forward(self, x):
152
+ _, _, h, w = x.size()
153
+
154
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
155
+
156
+ feat2 = self.conv2(x)
157
+ feat3 = self.conv3(x)
158
+ feat4 = self.conv4(x)
159
+ feat5 = self.conv5(x)
160
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
161
+
162
+ bottle = self.bottleneck(out)
163
+ return bottle
164
+
165
+
166
+ class Edge_Module(nn.Module):
167
+ """
168
+ Edge Learning Branch
169
+ """
170
+
171
+ def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
172
+ super(Edge_Module, self).__init__()
173
+
174
+ self.conv1 = nn.Sequential(
175
+ nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
176
+ InPlaceABNSync(mid_fea)
177
+ )
178
+ self.conv2 = nn.Sequential(
179
+ nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
180
+ InPlaceABNSync(mid_fea)
181
+ )
182
+ self.conv3 = nn.Sequential(
183
+ nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
184
+ InPlaceABNSync(mid_fea)
185
+ )
186
+ self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
187
+ self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
188
+
189
+ def forward(self, x1, x2, x3):
190
+ _, _, h, w = x1.size()
191
+
192
+ edge1_fea = self.conv1(x1)
193
+ edge1 = self.conv4(edge1_fea)
194
+ edge2_fea = self.conv2(x2)
195
+ edge2 = self.conv4(edge2_fea)
196
+ edge3_fea = self.conv3(x3)
197
+ edge3 = self.conv4(edge3_fea)
198
+
199
+ edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
200
+ edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
201
+ edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
202
+ edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)
203
+
204
+ edge = torch.cat([edge1, edge2, edge3], dim=1)
205
+ edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
206
+ edge = self.conv5(edge)
207
+
208
+ return edge, edge_fea
209
+
210
+
211
+ class Decoder_Module(nn.Module):
212
+ """
213
+ Parsing Branch Decoder Module.
214
+ """
215
+
216
+ def __init__(self, num_classes):
217
+ super(Decoder_Module, self).__init__()
218
+ self.conv1 = nn.Sequential(
219
+ nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
220
+ InPlaceABNSync(256)
221
+ )
222
+ self.conv2 = nn.Sequential(
223
+ nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
224
+ InPlaceABNSync(48)
225
+ )
226
+ self.conv3 = nn.Sequential(
227
+ nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
228
+ InPlaceABNSync(256),
229
+ nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
230
+ InPlaceABNSync(256)
231
+ )
232
+
233
+ self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
234
+
235
+ def forward(self, xt, xl):
236
+ _, _, h, w = xl.size()
237
+ xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
238
+ xl = self.conv2(xl)
239
+ x = torch.cat([xt, xl], dim=1)
240
+ x = self.conv3(x)
241
+ seg = self.conv4(x)
242
+ return seg, x
243
+
244
+
245
+ class ResNet(nn.Module):
246
+ def __init__(self, block, layers, num_classes):
247
+ self.inplanes = 128
248
+ super(ResNet, self).__init__()
249
+ self.conv1 = conv3x3(3, 64, stride=2)
250
+ self.bn1 = BatchNorm2d(64)
251
+ self.relu1 = nn.ReLU(inplace=False)
252
+ self.conv2 = conv3x3(64, 64)
253
+ self.bn2 = BatchNorm2d(64)
254
+ self.relu2 = nn.ReLU(inplace=False)
255
+ self.conv3 = conv3x3(64, 128)
256
+ self.bn3 = BatchNorm2d(128)
257
+ self.relu3 = nn.ReLU(inplace=False)
258
+
259
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
260
+
261
+ self.layer1 = self._make_layer(block, 64, layers[0])
262
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
263
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
264
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))
265
+
266
+ self.context_encoding = PSPModule(2048, 512)
267
+
268
+ self.edge = Edge_Module()
269
+ self.decoder = Decoder_Module(num_classes)
270
+
271
+ self.fushion = nn.Sequential(
272
+ nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
273
+ InPlaceABNSync(256),
274
+ nn.Dropout2d(0.1),
275
+ nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
276
+ )
277
+
278
+ def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
279
+ downsample = None
280
+ if stride != 1 or self.inplanes != planes * block.expansion:
281
+ downsample = nn.Sequential(
282
+ nn.Conv2d(self.inplanes, planes * block.expansion,
283
+ kernel_size=1, stride=stride, bias=False),
284
+ BatchNorm2d(planes * block.expansion, affine=affine_par))
285
+
286
+ layers = []
287
+ generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
288
+ layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
289
+ multi_grid=generate_multi_grid(0, multi_grid)))
290
+ self.inplanes = planes * block.expansion
291
+ for i in range(1, blocks):
292
+ layers.append(
293
+ block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
294
+
295
+ return nn.Sequential(*layers)
296
+
297
+ def forward(self, x):
298
+ x = self.relu1(self.bn1(self.conv1(x)))
299
+ x = self.relu2(self.bn2(self.conv2(x)))
300
+ x = self.relu3(self.bn3(self.conv3(x)))
301
+ x = self.maxpool(x)
302
+ x2 = self.layer1(x)
303
+ x3 = self.layer2(x2)
304
+ x4 = self.layer3(x3)
305
+ x5 = self.layer4(x4)
306
+ x = self.context_encoding(x5)
307
+ parsing_result, parsing_fea = self.decoder(x, x2)
308
+ # Edge Branch
309
+ edge_result, edge_fea = self.edge(x2, x3, x4)
310
+ # Fusion Branch
311
+ x = torch.cat([parsing_fea, edge_fea], dim=1)
312
+ fusion_result = self.fushion(x)
313
+ return [[parsing_result, fusion_result], [edge_result]]
314
+
315
+
316
+ def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
317
+ model.input_space = settings['input_space']
318
+ model.input_size = settings['input_size']
319
+ model.input_range = settings['input_range']
320
+ model.mean = settings['mean']
321
+ model.std = settings['std']
322
+
323
+ if pretrained is not None:
324
+ saved_state_dict = torch.load(pretrained)
325
+ new_params = model.state_dict().copy()
326
+ for i in saved_state_dict:
327
+ i_parts = i.split('.')
328
+ if not i_parts[0] == 'fc':
329
+ new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
330
+ model.load_state_dict(new_params)
331
+
332
+
333
+ def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
334
+ model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
335
+ settings = pretrained_settings['resnet101']['imagenet']
336
+ initialize_pretrained_model(model, settings, pretrained)
337
+ return model
annotator/segm/networks/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ from __future__ import absolute_import
2
+
3
+ from ..networks.AugmentCE2P import resnet101
4
+
5
+ __factory = {
6
+ 'resnet101': resnet101,
7
+ }
8
+
9
+
10
+ def init_model(name, *args, **kwargs):
11
+ if name not in __factory.keys():
12
+ raise KeyError("Unknown model arch: {}".format(name))
13
+ return __factory[name](*args, **kwargs)
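A hedged usage sketch of this factory, assuming the InPlaceABNSync extension compiles on the target machine and that a 473x473 input is acceptable for the parser (an assumption, not something fixed by this file); pretrained=None skips checkpoint loading in initialize_pretrained_model:

import torch
from annotator.segm.networks import init_model

model = init_model('resnet101', num_classes=20, pretrained=None).cuda().eval()
x = torch.randn(1, 3, 473, 473, device='cuda')
with torch.no_grad():
    (parsing, fusion), (edge,) = model(x)       # structure returned by ResNet.forward in AugmentCE2P.py
print(parsing.shape, fusion.shape, edge.shape)  # per-pixel logits at reduced spatial resolution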
annotator/segm/networks/backbone/mobilenetv2.py ADDED
@@ -0,0 +1,156 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : mobilenetv2.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch.nn as nn
15
+ import math
16
+ import functools
17
+
18
+ from torch.utils.model_zoo import load_url
+
+ from ...modules import InPlaceABN, InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['mobilenetv2']
23
+
24
+
25
+ def conv_bn(inp, oup, stride):
26
+ return nn.Sequential(
27
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
28
+ BatchNorm2d(oup),
29
+ nn.ReLU6(inplace=True)
30
+ )
31
+
32
+
33
+ def conv_1x1_bn(inp, oup):
34
+ return nn.Sequential(
35
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
36
+ BatchNorm2d(oup),
37
+ nn.ReLU6(inplace=True)
38
+ )
39
+
40
+
41
+ class InvertedResidual(nn.Module):
42
+ def __init__(self, inp, oup, stride, expand_ratio):
43
+ super(InvertedResidual, self).__init__()
44
+ self.stride = stride
45
+ assert stride in [1, 2]
46
+
47
+ hidden_dim = round(inp * expand_ratio)
48
+ self.use_res_connect = self.stride == 1 and inp == oup
49
+
50
+ if expand_ratio == 1:
51
+ self.conv = nn.Sequential(
52
+ # dw
53
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
54
+ BatchNorm2d(hidden_dim),
55
+ nn.ReLU6(inplace=True),
56
+ # pw-linear
57
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
58
+ BatchNorm2d(oup),
59
+ )
60
+ else:
61
+ self.conv = nn.Sequential(
62
+ # pw
63
+ nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
64
+ BatchNorm2d(hidden_dim),
65
+ nn.ReLU6(inplace=True),
66
+ # dw
67
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
68
+ BatchNorm2d(hidden_dim),
69
+ nn.ReLU6(inplace=True),
70
+ # pw-linear
71
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
72
+ BatchNorm2d(oup),
73
+ )
74
+
75
+ def forward(self, x):
76
+ if self.use_res_connect:
77
+ return x + self.conv(x)
78
+ else:
79
+ return self.conv(x)
80
+
81
+
82
+ class MobileNetV2(nn.Module):
83
+ def __init__(self, n_class=1000, input_size=224, width_mult=1.):
84
+ super(MobileNetV2, self).__init__()
85
+ block = InvertedResidual
86
+ input_channel = 32
87
+ last_channel = 1280
88
+ interverted_residual_setting = [
89
+ # t, c, n, s
90
+ [1, 16, 1, 1],
91
+ [6, 24, 2, 2], # layer 2
92
+ [6, 32, 3, 2], # layer 3
93
+ [6, 64, 4, 2],
94
+ [6, 96, 3, 1], # layer 4
95
+ [6, 160, 3, 2],
96
+ [6, 320, 1, 1], # layer 5
97
+ ]
98
+
99
+ # building first layer
100
+ assert input_size % 32 == 0
101
+ input_channel = int(input_channel * width_mult)
102
+ self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
103
+ self.features = [conv_bn(3, input_channel, 2)]
104
+ # building inverted residual blocks
105
+ for t, c, n, s in interverted_residual_setting:
106
+ output_channel = int(c * width_mult)
107
+ for i in range(n):
108
+ if i == 0:
109
+ self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
110
+ else:
111
+ self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
112
+ input_channel = output_channel
113
+ # building last several layers
114
+ self.features.append(conv_1x1_bn(input_channel, self.last_channel))
115
+ # make it nn.Sequential
116
+ self.features = nn.Sequential(*self.features)
117
+
118
+ # building classifier
119
+ self.classifier = nn.Sequential(
120
+ nn.Dropout(0.2),
121
+ nn.Linear(self.last_channel, n_class),
122
+ )
123
+
124
+ self._initialize_weights()
125
+
126
+ def forward(self, x):
127
+ x = self.features(x)
128
+ x = x.mean(3).mean(2)
129
+ x = self.classifier(x)
130
+ return x
131
+
132
+ def _initialize_weights(self):
133
+ for m in self.modules():
134
+ if isinstance(m, nn.Conv2d):
135
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
136
+ m.weight.data.normal_(0, math.sqrt(2. / n))
137
+ if m.bias is not None:
138
+ m.bias.data.zero_()
139
+ elif isinstance(m, BatchNorm2d):
140
+ m.weight.data.fill_(1)
141
+ m.bias.data.zero_()
142
+ elif isinstance(m, nn.Linear):
143
+ n = m.weight.size(1)
144
+ m.weight.data.normal_(0, 0.01)
145
+ m.bias.data.zero_()
146
+
147
+
148
+ def mobilenetv2(pretrained=False, **kwargs):
149
+ """Constructs a MobileNet_V2 model.
150
+ Args:
151
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
152
+ """
153
+ model = MobileNetV2(n_class=1000, **kwargs)
154
+ if pretrained:
155
+ # Note: model_urls is not defined in this file, so pretrained=True is currently unsupported.
+ model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False)
156
+ return model
annotator/segm/networks/backbone/resnet.py ADDED
@@ -0,0 +1,205 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnet.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+ import torch.nn as nn
16
+ import math
17
+ from torch.utils.model_zoo import load_url
18
+
19
+ from ...modules import InPlaceABNSync
20
+
21
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
22
+
23
+ __all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101']
24
+
25
+ model_urls = {
26
+ 'resnet18': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet18-imagenet.pth',
27
+ 'resnet50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet50-imagenet.pth',
28
+ 'resnet101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet101-imagenet.pth'
29
+ }
30
+
31
+
32
+ def conv3x3(in_planes, out_planes, stride=1):
33
+ "3x3 convolution with padding"
34
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
35
+ padding=1, bias=False)
36
+
37
+
38
+ class BasicBlock(nn.Module):
39
+ expansion = 1
40
+
41
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
42
+ super(BasicBlock, self).__init__()
43
+ self.conv1 = conv3x3(inplanes, planes, stride)
44
+ self.bn1 = BatchNorm2d(planes)
45
+ self.relu = nn.ReLU(inplace=True)
46
+ self.conv2 = conv3x3(planes, planes)
47
+ self.bn2 = BatchNorm2d(planes)
48
+ self.downsample = downsample
49
+ self.stride = stride
50
+
51
+ def forward(self, x):
52
+ residual = x
53
+
54
+ out = self.conv1(x)
55
+ out = self.bn1(out)
56
+ out = self.relu(out)
57
+
58
+ out = self.conv2(out)
59
+ out = self.bn2(out)
60
+
61
+ if self.downsample is not None:
62
+ residual = self.downsample(x)
63
+
64
+ out += residual
65
+ out = self.relu(out)
66
+
67
+ return out
68
+
69
+
70
+ class Bottleneck(nn.Module):
71
+ expansion = 4
72
+
73
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
74
+ super(Bottleneck, self).__init__()
75
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
76
+ self.bn1 = BatchNorm2d(planes)
77
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
78
+ padding=1, bias=False)
79
+ self.bn2 = BatchNorm2d(planes)
80
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
81
+ self.bn3 = BatchNorm2d(planes * 4)
82
+ self.relu = nn.ReLU(inplace=True)
83
+ self.downsample = downsample
84
+ self.stride = stride
85
+
86
+ def forward(self, x):
87
+ residual = x
88
+
89
+ out = self.conv1(x)
90
+ out = self.bn1(out)
91
+ out = self.relu(out)
92
+
93
+ out = self.conv2(out)
94
+ out = self.bn2(out)
95
+ out = self.relu(out)
96
+
97
+ out = self.conv3(out)
98
+ out = self.bn3(out)
99
+
100
+ if self.downsample is not None:
101
+ residual = self.downsample(x)
102
+
103
+ out += residual
104
+ out = self.relu(out)
105
+
106
+ return out
107
+
108
+
109
+ class ResNet(nn.Module):
110
+
111
+ def __init__(self, block, layers, num_classes=1000):
112
+ self.inplanes = 128
113
+ super(ResNet, self).__init__()
114
+ self.conv1 = conv3x3(3, 64, stride=2)
115
+ self.bn1 = BatchNorm2d(64)
116
+ self.relu1 = nn.ReLU(inplace=True)
117
+ self.conv2 = conv3x3(64, 64)
118
+ self.bn2 = BatchNorm2d(64)
119
+ self.relu2 = nn.ReLU(inplace=True)
120
+ self.conv3 = conv3x3(64, 128)
121
+ self.bn3 = BatchNorm2d(128)
122
+ self.relu3 = nn.ReLU(inplace=True)
123
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
124
+
125
+ self.layer1 = self._make_layer(block, 64, layers[0])
126
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
127
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
128
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
129
+ self.avgpool = nn.AvgPool2d(7, stride=1)
130
+ self.fc = nn.Linear(512 * block.expansion, num_classes)
131
+
132
+ for m in self.modules():
133
+ if isinstance(m, nn.Conv2d):
134
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
135
+ m.weight.data.normal_(0, math.sqrt(2. / n))
136
+ elif isinstance(m, BatchNorm2d):
137
+ m.weight.data.fill_(1)
138
+ m.bias.data.zero_()
139
+
140
+ def _make_layer(self, block, planes, blocks, stride=1):
141
+ downsample = None
142
+ if stride != 1 or self.inplanes != planes * block.expansion:
143
+ downsample = nn.Sequential(
144
+ nn.Conv2d(self.inplanes, planes * block.expansion,
145
+ kernel_size=1, stride=stride, bias=False),
146
+ BatchNorm2d(planes * block.expansion),
147
+ )
148
+
149
+ layers = []
150
+ layers.append(block(self.inplanes, planes, stride, downsample))
151
+ self.inplanes = planes * block.expansion
152
+ for i in range(1, blocks):
153
+ layers.append(block(self.inplanes, planes))
154
+
155
+ return nn.Sequential(*layers)
156
+
157
+ def forward(self, x):
158
+ x = self.relu1(self.bn1(self.conv1(x)))
159
+ x = self.relu2(self.bn2(self.conv2(x)))
160
+ x = self.relu3(self.bn3(self.conv3(x)))
161
+ x = self.maxpool(x)
162
+
163
+ x = self.layer1(x)
164
+ x = self.layer2(x)
165
+ x = self.layer3(x)
166
+ x = self.layer4(x)
167
+
168
+ x = self.avgpool(x)
169
+ x = x.view(x.size(0), -1)
170
+ x = self.fc(x)
171
+
172
+ return x
173
+
174
+
175
+ def resnet18(pretrained=False, **kwargs):
176
+ """Constructs a ResNet-18 model.
177
+ Args:
178
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
179
+ """
180
+ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
181
+ if pretrained:
182
+ model.load_state_dict(load_url(model_urls['resnet18']))
183
+ return model
184
+
185
+
186
+ def resnet50(pretrained=False, **kwargs):
187
+ """Constructs a ResNet-50 model.
188
+ Args:
189
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
190
+ """
191
+ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
192
+ if pretrained:
193
+ model.load_state_dict(load_url(model_urls['resnet50']), strict=False)
194
+ return model
195
+
196
+
197
+ def resnet101(pretrained=False, **kwargs):
198
+ """Constructs a ResNet-101 model.
199
+ Args:
200
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
201
+ """
202
+ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
203
+ if pretrained:
204
+ model.load_state_dict(load_url(model_urls['resnet101']), strict=False)
205
+ return model
annotator/segm/networks/backbone/resnext.py ADDED
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnext.py
8
+ @Time : 8/11/19 8:58 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import functools
14
+ import torch.nn as nn
15
+ import math
16
+ from torch.utils.model_zoo import load_url
17
+
18
+ from ...modules import InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['ResNeXt', 'resnext101'] # support resnext 101
23
+
24
+ model_urls = {
25
+ 'resnext50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext50-imagenet.pth',
26
+ 'resnext101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext101-imagenet.pth'
27
+ }
28
+
29
+
30
+ def conv3x3(in_planes, out_planes, stride=1):
31
+ "3x3 convolution with padding"
32
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
33
+ padding=1, bias=False)
34
+
35
+
36
+ class GroupBottleneck(nn.Module):
37
+ expansion = 2
38
+
39
+ def __init__(self, inplanes, planes, stride=1, groups=1, downsample=None):
40
+ super(GroupBottleneck, self).__init__()
41
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
42
+ self.bn1 = BatchNorm2d(planes)
43
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
44
+ padding=1, groups=groups, bias=False)
45
+ self.bn2 = BatchNorm2d(planes)
46
+ self.conv3 = nn.Conv2d(planes, planes * 2, kernel_size=1, bias=False)
47
+ self.bn3 = BatchNorm2d(planes * 2)
48
+ self.relu = nn.ReLU(inplace=True)
49
+ self.downsample = downsample
50
+ self.stride = stride
51
+
52
+ def forward(self, x):
53
+ residual = x
54
+
55
+ out = self.conv1(x)
56
+ out = self.bn1(out)
57
+ out = self.relu(out)
58
+
59
+ out = self.conv2(out)
60
+ out = self.bn2(out)
61
+ out = self.relu(out)
62
+
63
+ out = self.conv3(out)
64
+ out = self.bn3(out)
65
+
66
+ if self.downsample is not None:
67
+ residual = self.downsample(x)
68
+
69
+ out += residual
70
+ out = self.relu(out)
71
+
72
+ return out
73
+
74
+
75
+ class ResNeXt(nn.Module):
76
+
77
+ def __init__(self, block, layers, groups=32, num_classes=1000):
78
+ self.inplanes = 128
79
+ super(ResNeXt, self).__init__()
80
+ self.conv1 = conv3x3(3, 64, stride=2)
81
+ self.bn1 = BatchNorm2d(64)
82
+ self.relu1 = nn.ReLU(inplace=True)
83
+ self.conv2 = conv3x3(64, 64)
84
+ self.bn2 = BatchNorm2d(64)
85
+ self.relu2 = nn.ReLU(inplace=True)
86
+ self.conv3 = conv3x3(64, 128)
87
+ self.bn3 = BatchNorm2d(128)
88
+ self.relu3 = nn.ReLU(inplace=True)
89
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
90
+
91
+ self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
92
+ self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
93
+ self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
94
+ self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
95
+ self.avgpool = nn.AvgPool2d(7, stride=1)
96
+ self.fc = nn.Linear(1024 * block.expansion, num_classes)
97
+
98
+ for m in self.modules():
99
+ if isinstance(m, nn.Conv2d):
100
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
101
+ m.weight.data.normal_(0, math.sqrt(2. / n))
102
+ elif isinstance(m, BatchNorm2d):
103
+ m.weight.data.fill_(1)
104
+ m.bias.data.zero_()
105
+
106
+ def _make_layer(self, block, planes, blocks, stride=1, groups=1):
107
+ downsample = None
108
+ if stride != 1 or self.inplanes != planes * block.expansion:
109
+ downsample = nn.Sequential(
110
+ nn.Conv2d(self.inplanes, planes * block.expansion,
111
+ kernel_size=1, stride=stride, bias=False),
112
+ BatchNorm2d(planes * block.expansion),
113
+ )
114
+
115
+ layers = []
116
+ layers.append(block(self.inplanes, planes, stride, groups, downsample))
117
+ self.inplanes = planes * block.expansion
118
+ for i in range(1, blocks):
119
+ layers.append(block(self.inplanes, planes, groups=groups))
120
+
121
+ return nn.Sequential(*layers)
122
+
123
+ def forward(self, x):
124
+ x = self.relu1(self.bn1(self.conv1(x)))
125
+ x = self.relu2(self.bn2(self.conv2(x)))
126
+ x = self.relu3(self.bn3(self.conv3(x)))
127
+ x = self.maxpool(x)
128
+
129
+ x = self.layer1(x)
130
+ x = self.layer2(x)
131
+ x = self.layer3(x)
132
+ x = self.layer4(x)
133
+
134
+ x = self.avgpool(x)
135
+ x = x.view(x.size(0), -1)
136
+ x = self.fc(x)
137
+
138
+ return x
139
+
140
+
141
+ def resnext101(pretrained=False, **kwargs):
142
+ """Constructs a ResNet-101 model.
143
+ Args:
144
+ pretrained (bool): If True, returns a model pre-trained on Places
145
+ """
146
+ model = ResNeXt(GroupBottleneck, [3, 4, 23, 3], **kwargs)
147
+ if pretrained:
148
+ model.load_state_dict(load_url(model_urls['resnext101']), strict=False)
149
+ return model
annotator/segm/networks/context_encoding/aspp.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : aspp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
+ class ASPPModule(nn.Module):
22
+ """
23
+ Reference:
24
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
25
+ """
26
+ def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
27
+ super(ASPPModule, self).__init__()
28
+
29
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
30
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
31
+ bias=False),
32
+ InPlaceABNSync(inner_features))
33
+ self.conv2 = nn.Sequential(
34
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
35
+ InPlaceABNSync(inner_features))
36
+ self.conv3 = nn.Sequential(
37
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
38
+ InPlaceABNSync(inner_features))
39
+ self.conv4 = nn.Sequential(
40
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
41
+ InPlaceABNSync(inner_features))
42
+ self.conv5 = nn.Sequential(
43
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
44
+ InPlaceABNSync(inner_features))
45
+
46
+ self.bottleneck = nn.Sequential(
47
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
48
+ InPlaceABNSync(out_features),
49
+ nn.Dropout2d(0.1)
50
+ )
51
+
52
+ def forward(self, x):
53
+ _, _, h, w = x.size()
54
+
55
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
56
+
57
+ feat2 = self.conv2(x)
58
+ feat3 = self.conv3(x)
59
+ feat4 = self.conv4(x)
60
+ feat5 = self.conv5(x)
61
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
62
+
63
+ bottle = self.bottleneck(out)
64
+ return bottle
annotator/segm/networks/context_encoding/ocnet.py ADDED
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : ocnet.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.autograd import Variable
19
+ from torch.nn import functional as F
20
+
21
+ from ...modules import InPlaceABNSync
22
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
23
+
24
+
25
+ class _SelfAttentionBlock(nn.Module):
26
+ '''
27
+ The basic implementation for self-attention block/non-local block
28
+ Input:
29
+ N X C X H X W
30
+ Parameters:
31
+ in_channels : the dimension of the input feature map
32
+ key_channels : the dimension after the key/query transform
33
+ value_channels : the dimension after the value transform
34
+ scale : choose the scale to downsample the input feature maps (save memory cost)
35
+ Return:
36
+ N X C X H X W
37
+ position-aware context features (without concatenating or adding the input).
38
+ '''
39
+
40
+ def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
41
+ super(_SelfAttentionBlock, self).__init__()
42
+ self.scale = scale
43
+ self.in_channels = in_channels
44
+ self.out_channels = out_channels
45
+ self.key_channels = key_channels
46
+ self.value_channels = value_channels
47
+ if out_channels is None:
48
+ self.out_channels = in_channels
49
+ self.pool = nn.MaxPool2d(kernel_size=(scale, scale))
50
+ self.f_key = nn.Sequential(
51
+ nn.Conv2d(in_channels=self.in_channels, out_channels=self.key_channels,
52
+ kernel_size=1, stride=1, padding=0),
53
+ InPlaceABNSync(self.key_channels),
54
+ )
55
+ self.f_query = self.f_key
56
+ self.f_value = nn.Conv2d(in_channels=self.in_channels, out_channels=self.value_channels,
57
+ kernel_size=1, stride=1, padding=0)
58
+ self.W = nn.Conv2d(in_channels=self.value_channels, out_channels=self.out_channels,
59
+ kernel_size=1, stride=1, padding=0)
60
+ nn.init.constant_(self.W.weight, 0)
61
+ nn.init.constant_(self.W.bias, 0)
62
+
63
+ def forward(self, x):
64
+ batch_size, h, w = x.size(0), x.size(2), x.size(3)
65
+ if self.scale > 1:
66
+ x = self.pool(x)
67
+
68
+ value = self.f_value(x).view(batch_size, self.value_channels, -1)
69
+ value = value.permute(0, 2, 1)
70
+ query = self.f_query(x).view(batch_size, self.key_channels, -1)
71
+ query = query.permute(0, 2, 1)
72
+ key = self.f_key(x).view(batch_size, self.key_channels, -1)
73
+
74
+ sim_map = torch.matmul(query, key)
75
+ sim_map = (self.key_channels ** -.5) * sim_map
76
+ sim_map = F.softmax(sim_map, dim=-1)
77
+
78
+ context = torch.matmul(sim_map, value)
79
+ context = context.permute(0, 2, 1).contiguous()
80
+ context = context.view(batch_size, self.value_channels, *x.size()[2:])
81
+ context = self.W(context)
82
+ if self.scale > 1:
83
+ context = F.interpolate(input=context, size=(h, w), mode='bilinear', align_corners=True)
84
+ return context
85
+
86
+
87
+ class SelfAttentionBlock2D(_SelfAttentionBlock):
88
+ def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
89
+ super(SelfAttentionBlock2D, self).__init__(in_channels,
90
+ key_channels,
91
+ value_channels,
92
+ out_channels,
93
+ scale)
94
+
95
+
96
+ class BaseOC_Module(nn.Module):
97
+ """
98
+ Implementation of the BaseOC module
99
+ Parameters:
100
+ in_features / out_features: the channels of the input / output feature maps.
101
+ dropout: we choose 0.05 as the default value.
102
+ size: you can apply multiple sizes. Here we only use one size.
103
+ Return:
104
+ features fused with Object context information.
105
+ """
106
+
107
+ def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
108
+ super(BaseOC_Module, self).__init__()
109
+ self.stages = []
110
+ self.stages = nn.ModuleList(
111
+ [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
112
+ self.conv_bn_dropout = nn.Sequential(
113
+ nn.Conv2d(2 * in_channels, out_channels, kernel_size=1, padding=0),
114
+ InPlaceABNSync(out_channels),
115
+ nn.Dropout2d(dropout)
116
+ )
117
+
118
+ def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
119
+ return SelfAttentionBlock2D(in_channels,
120
+ key_channels,
121
+ value_channels,
122
+ output_channels,
123
+ size)
124
+
125
+ def forward(self, feats):
126
+ priors = [stage(feats) for stage in self.stages]
127
+ context = priors[0]
128
+ for i in range(1, len(priors)):
129
+ context += priors[i]
130
+ output = self.conv_bn_dropout(torch.cat([context, feats], 1))
131
+ return output
132
+
133
+
134
+ class BaseOC_Context_Module(nn.Module):
135
+ """
136
+ Output only the context features.
137
+ Parameters:
138
+ in_features / out_features: the channels of the input / output feature maps.
139
+ dropout: specify the dropout ratio
140
+ fusion: we provide two different fusion methods, "concat" or "add"
141
+ size: we find that directly learning the attention weights even on 1/8-resolution feature maps is hard.
142
+ Return:
143
+ features after "concat" or "add"
144
+ """
145
+
146
+ def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
147
+ super(BaseOC_Context_Module, self).__init__()
148
+ self.stages = []
149
+ self.stages = nn.ModuleList(
150
+ [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
151
+ self.conv_bn_dropout = nn.Sequential(
152
+ nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0),
153
+ InPlaceABNSync(out_channels),
154
+ )
155
+
156
+ def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
157
+ return SelfAttentionBlock2D(in_channels,
158
+ key_channels,
159
+ value_channels,
160
+ output_channels,
161
+ size)
162
+
163
+ def forward(self, feats):
164
+ priors = [stage(feats) for stage in self.stages]
165
+ context = priors[0]
166
+ for i in range(1, len(priors)):
167
+ context += priors[i]
168
+ output = self.conv_bn_dropout(context)
169
+ return output
170
+
171
+
172
+ class ASP_OC_Module(nn.Module):
173
+ def __init__(self, features, out_features=256, dilations=(12, 24, 36)):
174
+ super(ASP_OC_Module, self).__init__()
175
+ self.context = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=3, padding=1, dilation=1, bias=True),
176
+ InPlaceABNSync(out_features),
177
+ BaseOC_Context_Module(in_channels=out_features, out_channels=out_features,
178
+ key_channels=out_features // 2, value_channels=out_features,
179
+ dropout=0, sizes=([2])))
180
+ self.conv2 = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
181
+ InPlaceABNSync(out_features))
182
+ self.conv3 = nn.Sequential(
183
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
184
+ InPlaceABNSync(out_features))
185
+ self.conv4 = nn.Sequential(
186
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
187
+ InPlaceABNSync(out_features))
188
+ self.conv5 = nn.Sequential(
189
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
190
+ InPlaceABNSync(out_features))
191
+
192
+ self.conv_bn_dropout = nn.Sequential(
193
+ nn.Conv2d(out_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
194
+ InPlaceABNSync(out_features),
195
+ nn.Dropout2d(0.1)
196
+ )
197
+
198
+ def _cat_each(self, feat1, feat2, feat3, feat4, feat5):
199
+ assert (len(feat1) == len(feat2))
200
+ z = []
201
+ for i in range(len(feat1)):
202
+ z.append(torch.cat((feat1[i], feat2[i], feat3[i], feat4[i], feat5[i]), 1))
203
+ return z
204
+
205
+ def forward(self, x):
206
+ if isinstance(x, Variable):
207
+ _, _, h, w = x.size()
208
+ elif isinstance(x, tuple) or isinstance(x, list):
209
+ _, _, h, w = x[0].size()
210
+ else:
211
+ raise RuntimeError('unknown input type')
212
+
213
+ feat1 = self.context(x)
214
+ feat2 = self.conv2(x)
215
+ feat3 = self.conv3(x)
216
+ feat4 = self.conv4(x)
217
+ feat5 = self.conv5(x)
218
+
219
+ if isinstance(x, Variable):
220
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
221
+ elif isinstance(x, tuple) or isinstance(x, list):
222
+ out = self._cat_each(feat1, feat2, feat3, feat4, feat5)
223
+ else:
224
+ raise RuntimeError('unknown input type')
225
+ output = self.conv_bn_dropout(out)
226
+ return output
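Stripped of the conv + InPlaceABNSync projections, _SelfAttentionBlock is ordinary scaled dot-product attention over all spatial positions. A self-contained sketch of that core computation (hypothetical helper, not defined in this file):

import torch
import torch.nn.functional as F

def spatial_self_attention(query, key, value, key_channels):
    # query/key: (N, key_channels, H, W); value: (N, value_channels, H, W)
    n, _, h, w = query.shape
    q = query.view(n, key_channels, -1).permute(0, 2, 1)     # N x HW x Ck
    k = key.view(n, key_channels, -1)                        # N x Ck x HW
    v = value.view(n, value.size(1), -1).permute(0, 2, 1)    # N x HW x Cv
    sim = torch.matmul(q, k) * key_channels ** -0.5          # N x HW x HW similarity map
    ctx = torch.matmul(F.softmax(sim, dim=-1), v)            # N x HW x Cv context
    return ctx.permute(0, 2, 1).reshape(n, -1, h, w)         # back to N x Cv x H x W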
annotator/segm/networks/context_encoding/psp.py ADDED
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : psp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
+ class PSPModule(nn.Module):
22
+ """
23
+ Reference:
24
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
25
+ """
26
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
27
+ super(PSPModule, self).__init__()
28
+
29
+ self.stages = []
30
+ self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
31
+ self.bottleneck = nn.Sequential(
32
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
33
+ bias=False),
34
+ InPlaceABNSync(out_features),
35
+ )
36
+
37
+ def _make_stage(self, features, out_features, size):
38
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
39
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
40
+ bn = InPlaceABNSync(out_features)
41
+ return nn.Sequential(prior, conv, bn)
42
+
43
+ def forward(self, feats):
44
+ h, w = feats.size(2), feats.size(3)
45
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
46
+ self.stages] + [feats]
47
+ bottle = self.bottleneck(torch.cat(priors, 1))
48
+ return bottle
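PSPModule follows the same recipe at multiple pooling scales: adaptive-average-pool the features to a small grid, project with a 1x1 convolution, upsample back, and concatenate with the input before a bottleneck. A runnable sketch of that idea, again with BatchNorm2d + ReLU standing in for InPlaceABNSync so it works without the inplace_abn extension:

import torch
import torch.nn as nn
import torch.nn.functional as F

class PSPSketch(nn.Module):
    """Illustrative pyramid pooling: pool to several grid sizes, project,
    upsample back, concatenate with the input, then bottleneck."""
    def __init__(self, in_ch, out_ch=512, sizes=(1, 2, 3, 6)):
        super().__init__()
        self.stages = nn.ModuleList([
            nn.Sequential(nn.AdaptiveAvgPool2d(s),
                          nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False),
                          nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True))
            for s in sizes])
        self.bottleneck = nn.Sequential(
            nn.Conv2d(in_ch + len(sizes) * out_ch, out_ch, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True))

    def forward(self, feats):
        h, w = feats.shape[2:]
        priors = [F.interpolate(stage(feats), size=(h, w), mode='bilinear',
                                align_corners=True) for stage in self.stages]
        return self.bottleneck(torch.cat(priors + [feats], dim=1))

m = PSPSketch(2048).eval()   # eval() so BatchNorm accepts the 1x1 pooled maps
with torch.no_grad():
    print(m(torch.randn(1, 2048, 24, 16)).shape)  # torch.Size([1, 512, 24, 16])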
annotator/segm/transforms.py ADDED
@@ -0,0 +1,167 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft
3
+ # Licensed under the MIT License.
4
+ # Written by Bin Xiao (Bin.Xiao@microsoft.com)
5
+ # ------------------------------------------------------------------------------
6
+
7
+ from __future__ import absolute_import
8
+ from __future__ import division
9
+ from __future__ import print_function
10
+
11
+ import numpy as np
12
+ import cv2
13
+ import torch
14
+
15
+ class BRG2Tensor_transform(object):
16
+ def __call__(self, pic):
17
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
18
+ if isinstance(img, torch.ByteTensor):
19
+ return img.float()
20
+ else:
21
+ return img
22
+
23
+ class BGR2RGB_transform(object):
24
+ def __call__(self, tensor):
25
+ return tensor[[2,1,0],:,:]
26
+
27
+ def flip_back(output_flipped, matched_parts):
28
+ '''
29
+ output_flipped: numpy.ndarray(batch_size, num_joints, height, width)
30
+ '''
31
+ assert output_flipped.ndim == 4,\
32
+ 'output_flipped should be [batch_size, num_joints, height, width]'
33
+
34
+ output_flipped = output_flipped[:, :, :, ::-1]
35
+
36
+ for pair in matched_parts:
37
+ tmp = output_flipped[:, pair[0], :, :].copy()
38
+ output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
39
+ output_flipped[:, pair[1], :, :] = tmp
40
+
41
+ return output_flipped
42
+
43
+
44
+ def fliplr_joints(joints, joints_vis, width, matched_parts):
45
+ """
46
+ flip coords
47
+ """
48
+ # Flip horizontal
49
+ joints[:, 0] = width - joints[:, 0] - 1
50
+
51
+ # Change left-right parts
52
+ for pair in matched_parts:
53
+ joints[pair[0], :], joints[pair[1], :] = \
54
+ joints[pair[1], :], joints[pair[0], :].copy()
55
+ joints_vis[pair[0], :], joints_vis[pair[1], :] = \
56
+ joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
57
+
58
+ return joints*joints_vis, joints_vis
59
+
60
+
61
+ def transform_preds(coords, center, scale, input_size):
62
+ target_coords = np.zeros(coords.shape)
63
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
64
+ for p in range(coords.shape[0]):
65
+ target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
66
+ return target_coords
67
+
68
+ def transform_parsing(pred, center, scale, width, height, input_size):
69
+
70
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
71
+ target_pred = cv2.warpAffine(
72
+ pred,
73
+ trans,
74
+ (int(width), int(height)), #(int(width), int(height)),
75
+ flags=cv2.INTER_NEAREST,
76
+ borderMode=cv2.BORDER_CONSTANT,
77
+ borderValue=(0))
78
+
79
+ return target_pred
80
+
81
+ def transform_logits(logits, center, scale, width, height, input_size):
82
+
83
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
84
+ channel = logits.shape[2]
85
+ target_logits = []
86
+ for i in range(channel):
87
+ target_logit = cv2.warpAffine(
88
+ logits[:,:,i],
89
+ trans,
90
+ (int(width), int(height)), #(int(width), int(height)),
91
+ flags=cv2.INTER_LINEAR,
92
+ borderMode=cv2.BORDER_CONSTANT,
93
+ borderValue=(0))
94
+ target_logits.append(target_logit)
95
+ target_logits = np.stack(target_logits,axis=2)
96
+
97
+ return target_logits
98
+
99
+
100
+ def get_affine_transform(center,
101
+ scale,
102
+ rot,
103
+ output_size,
104
+ shift=np.array([0, 0], dtype=np.float32),
105
+ inv=0):
106
+ if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
107
+ print(scale)
108
+ scale = np.array([scale, scale])
109
+
110
+ scale_tmp = scale
111
+
112
+ src_w = scale_tmp[0]
113
+ dst_w = output_size[1]
114
+ dst_h = output_size[0]
115
+
116
+ rot_rad = np.pi * rot / 180
117
+ src_dir = get_dir([0, src_w * -0.5], rot_rad)
118
+ dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
119
+
120
+ src = np.zeros((3, 2), dtype=np.float32)
121
+ dst = np.zeros((3, 2), dtype=np.float32)
122
+ src[0, :] = center + scale_tmp * shift
123
+ src[1, :] = center + src_dir + scale_tmp * shift
124
+ dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
125
+ dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir
126
+
127
+ src[2:, :] = get_3rd_point(src[0, :], src[1, :])
128
+ dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
129
+
130
+ if inv:
131
+ trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
132
+ else:
133
+ trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
134
+
135
+ return trans
136
+
137
+
138
+ def affine_transform(pt, t):
139
+ new_pt = np.array([pt[0], pt[1], 1.]).T
140
+ new_pt = np.dot(t, new_pt)
141
+ return new_pt[:2]
142
+
143
+
144
+ def get_3rd_point(a, b):
145
+ direct = a - b
146
+ return b + np.array([-direct[1], direct[0]], dtype=np.float32)
147
+
148
+
149
+ def get_dir(src_point, rot_rad):
150
+ sn, cs = np.sin(rot_rad), np.cos(rot_rad)
151
+
152
+ src_result = [0, 0]
153
+ src_result[0] = src_point[0] * cs - src_point[1] * sn
154
+ src_result[1] = src_point[0] * sn + src_point[1] * cs
155
+
156
+ return src_result
157
+
158
+
159
+ def crop(img, center, scale, output_size, rot=0):
160
+ trans = get_affine_transform(center, scale, rot, output_size)
161
+
162
+ dst_img = cv2.warpAffine(img,
163
+ trans,
164
+ (int(output_size[1]), int(output_size[0])),
165
+ flags=cv2.INTER_LINEAR)
166
+
167
+ return dst_img
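All of the helpers above revolve around get_affine_transform: crop() warps a person box, given as (center, scale), to the fixed network input size, while transform_parsing() and transform_logits() apply the inverse warp to map predictions back onto the original image grid. A round-trip sketch with made-up center/scale values; the import path is an assumption and requires the repository root to be on PYTHONPATH:

import numpy as np
from annotator.segm.transforms import crop, transform_parsing

img = np.zeros((1024, 768, 3), dtype=np.uint8)          # stand-in photo
center = np.array([384.0, 512.0], dtype=np.float32)     # person box centre (x, y)
scale = np.array([768.0, 1024.0], dtype=np.float32)     # person box size (w, h)
input_size = (473, 473)                                  # (height, width) fed to the parser

# original image -> fixed-size network input
patch = crop(img, center, scale, input_size)             # (473, 473, 3)

# pretend the parser produced a label map at network resolution,
# then warp it back onto the original image grid
pred = np.random.randint(0, 20, size=input_size, dtype=np.uint8)
restored = transform_parsing(pred, center, scale, width=768, height=1024,
                             input_size=input_size)
print(patch.shape, restored.shape)                       # (473, 473, 3) (1024, 768)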
annotator/util.py ADDED
@@ -0,0 +1,49 @@
1
+ import numpy as np
2
+ import cv2
3
+ import os
4
+
5
+
6
+ annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
7
+
8
+
9
+ def HWC3(x):
10
+ assert x.dtype == np.uint8
11
+ if x.ndim == 2:
12
+ x = x[:, :, None]
13
+ assert x.ndim == 3
14
+ H, W, C = x.shape
15
+ assert C == 1 or C == 3 or C == 4
16
+ if C == 3:
17
+ return x
18
+ if C == 1:
19
+ return np.concatenate([x, x, x], axis=2)
20
+ if C == 4:
21
+ color = x[:, :, 0:3].astype(np.float32)
22
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
23
+ y = color * alpha + 255.0 * (1.0 - alpha)
24
+ y = y.clip(0, 255).astype(np.uint8)
25
+ return y
26
+
27
+
28
+ def resize_image(input_image, resolution):
29
+ H, W, C = input_image.shape
30
+ H = float(H)
31
+ W = float(W)
32
+ k = float(resolution) / min(H, W)
33
+ H *= k
34
+ W *= k
35
+ H = int(np.round(H / 64.0)) * 64
36
+ W = int(np.round(W / 64.0)) * 64
37
+ img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
38
+ return img
39
+
40
+ def pad_image(img, min_aspect_ratio=0.625):
41
+ H, W, C = img.shape
42
+ if W/H < min_aspect_ratio:
43
+ NEW_W = int(min_aspect_ratio * H)
44
+ width_padding = (NEW_W-W)//2
45
+ black_bg = np.zeros((H, NEW_W, 3), dtype=img.dtype)
46
+ black_bg[:, width_padding:width_padding+W,:] = img
47
+ return black_bg
48
+ else:
49
+ return img
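These utilities are chained before an image reaches the pose detector and segmenter: HWC3 normalises any uint8 input to 3-channel RGB, resize_image scales the short side to the requested resolution and snaps both sides to multiples of 64, and pad_image widens unusually narrow images to a minimum aspect ratio. A small sketch; the import path again assumes the repository root is on PYTHONPATH:

import numpy as np
from annotator.util import HWC3, resize_image, pad_image

# a fake 600x300 grayscale image (H, W), dtype uint8
gray = np.random.randint(0, 255, size=(600, 300), dtype=np.uint8)

rgb = HWC3(gray)                    # (600, 300, 3): grayscale replicated to 3 channels
padded = pad_image(rgb)             # 300/600 = 0.5 < 0.625, so padded to (600, 375, 3)
resized = resize_image(padded, 512) # short side ~512, both sides rounded to multiples of 64

print(rgb.shape, padded.shape, resized.shape)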
app.py ADDED
@@ -0,0 +1,475 @@
1
+ from share import *
2
+ import config
3
+ import os
4
+ import cv2
5
+ import einops
6
+ import gradio as gr
7
+ import numpy as np
8
+ import torch
9
+ import random
10
+ import re
11
+ from datetime import datetime
12
+ from glob import glob
13
+ import argparse
14
+
15
+ from pytorch_lightning import seed_everything
16
+ from torchvision.transforms import ToPILImage
17
+ from annotator.util import pad_image, resize_image, HWC3
18
+ from annotator.openpose import OpenposeDetector
19
+ from cldm.model import create_model, load_state_dict
20
+ from cldm.ddim_hacked import DDIMSampler
21
+ from pathlib import Path
22
+ from PIL import Image
23
+ from omegaconf import OmegaConf
24
+ from ldm.util import instantiate_from_config, log_txt_as_img
25
+ from visconet.segm import ATRSegmentCropper as SegmentCropper
26
+ from huggingface_hub import snapshot_download
27
+
28
+ # supply directory of visual prompt images
29
+ HF_REPO = 'soonyau/visconet'
30
+ GALLERY_PATH = Path('./fashion/')
31
+ WOMEN_GALLERY_PATH = GALLERY_PATH/'WOMEN'
32
+ MEN_GALLERY_PATH = GALLERY_PATH/'MEN'
33
+
34
+ DEMO = True
35
+ LOG_SAMPLES = False
36
+ APP_FILES_PATH = Path('./app_files')
37
+ VISCON_IMAGE_PATH = APP_FILES_PATH/'default_images'
38
+ LOG_PATH = APP_FILES_PATH/'logs'
39
+ SAMPLE_IMAGE_PATH = APP_FILES_PATH/'samples'
40
+
41
+ DEFAULT_CONTROL_SCALE = 1.0
42
+ SCALE_CONFIG = {
43
+ 'Default': [DEFAULT_CONTROL_SCALE]*13,
44
+ 'DeepFakes':[1.0, 1.0, 1.0,
45
+ 1.0, 1.0, 1.0,
46
+ 0.5, 0.5, 0.5,
47
+ 0.0, 0.0, 0.0, 0.0,],
48
+ 'Faithful':[1,1,1,
49
+ 1,1,1,
50
+ 1,1,0.5,
51
+ 0.5,0.5,0,0],
52
+ 'Painting':[0.0,0.0,0.0,
53
+ 0.5,0.5,0.5,
54
+ 0.5,0.5,0.5,
55
+ 0.5,0,0,0],
56
+ 'Pose': [0.0,0.0,0.0,
57
+ 0.0,0.0,0.0,
58
+ 0.0,0.0,0.5,
59
+ 0.0,0.0,0,0],
60
+ 'Texture Transfer': [1.0,1.0,1.0,
61
+ 1.0,1.0,1.0,
62
+ 0.5,0.0,0.5,
63
+ 0.0,0.0,0,0]
64
+ }
65
+ DEFAULT_SCALE_CONFIG = 'Default'
66
+ ignore_style_list = ['headwear', 'accesories', 'shoes']
67
+
68
+ global device
69
+ global segmentor
70
+ global apply_openpose
71
+ global style_encoder
72
+ global model
73
+ global ddim_sampler
74
+
75
+ def convert_fname(long_name):
76
+ gender = 'MEN' if long_name[7:10] == 'MEN' else 'WOMEN'
77
+
78
+ input_list = long_name.replace('fashion','').split('___')
79
+
80
+ # Define a regular expression pattern to match the relevant parts of each input string
81
+ if gender == 'MEN':
82
+ pattern = r'MEN(\w+)id(\d+)_(\d)(\w+)'
83
+ else:
84
+ pattern = r'WOMEN(\w+)id(\d+)_(\d)(\w+)'
85
+ # Use a list comprehension to extract the matching substrings from each input string, and format them into the desired output format
86
+ output_list = [f'{gender}/{category}/id_{id_num[:8]}/{id_num[8:]}_{view_num}_{view_desc}' for (category, id_num, view_num, view_desc) in re.findall(pattern, ' '.join(input_list))]
87
+
88
+ # Return the resulting list of formatted filenames
89
+ return [f +'.jpg' for f in output_list]
90
+
91
+ def fetch_deepfashion(deepfashion_names):
92
+ src_name, dst_name = convert_fname(deepfashion_names)
93
+ input_image = np.array(Image.open(image_root/src_name))
94
+ pose_image = np.array(Image.open(str(pose_root/dst_name)))
95
+ mask_image = Image.open(str(mask_root/dst_name).replace('.jpg','_mask.png'))
96
+
97
+ temp = src_name.replace('.jpg','').split('/')
98
+ lastfolder = temp.pop(-1).replace('_','/', 1)
99
+ style_folder = style_root/('/'.join(temp+[lastfolder]))
100
+ viscon_images = []
101
+ for style_name in style_names:
102
+ f_path = style_folder/f'{style_name}.jpg'
103
+ if os.path.exists(str(f_path)):
104
+ viscon_images.append(np.array(Image.open(f_path)))
105
+ else:
106
+ viscon_images.append(None)
107
+ return [input_image, pose_image, mask_image, *viscon_images]
108
+
109
+ def select_gallery_image(evt: gr.SelectData):
110
+ return evt.target.value[evt.index]['name']
111
+
112
+ def select_default_strength(strength_config):
113
+ return SCALE_CONFIG[strength_config]
114
+
115
+ def change_all_scales(scale):
116
+ return [float(scale)]*13
117
+
118
+ def encode_style_images(style_images):
119
+ style_embeddings = []
120
+
121
+ for style_name, style_image in zip(style_names, style_images):
122
+ if style_image is None:
123
+ style_image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
124
+
125
+ #style_image = style_image.resize((224,224))
126
+ style_image = style_encoder.preprocess(style_image).to(device)
127
+ style_emb = style_encoder.postprocess(style_encoder(style_image)[0])
128
+ style_embeddings.append(style_emb)
129
+
130
+ styles = torch.tensor(np.array(style_embeddings)).squeeze(-2).unsqueeze(0).float().to(device)
131
+ return styles
132
+
133
+ def save_viscon_images(*viscon_images):
134
+ ret_images = []
135
+ for image, name in zip(viscon_images, style_names):
136
+ fname = str(VISCON_IMAGE_PATH/name)+'.jpg'
137
+ if image is not None:
138
+ image = image.resize((224,224))
139
+ if os.path.exists(fname):
140
+ os.remove(fname)
141
+ image.save(fname)
142
+ ret_images.append(image)
143
+ return ret_images
144
+
145
+
146
+ def extract_pose_mask(input_image, detect_resolution,
147
+ ignore_head=True, ignore_hair=False):
148
+ # skeleton
149
+ input_image = pad_image(input_image, min_aspect_ratio=0.625)
150
+ detected_map, _ = apply_openpose(resize_image(input_image, detect_resolution), hand=True)
151
+ detected_map = HWC3(detected_map)
152
+
153
+ # human mask
154
+ cropped = segmentor(input_image, ignore_head=ignore_head, ignore_hair=ignore_hair)
155
+ mask = cropped['human_mask']
156
+ mask = Image.fromarray(np.array(mask*255, dtype=np.uint8), mode='L')
157
+
158
+ return [detected_map, mask]
159
+
160
+ def extract_fashion(input_image):
161
+
162
+ # style images
163
+ cropped = segmentor(input_image)
164
+ cropped_images = []
165
+ for style_name in style_names:
166
+ if style_name in cropped and style_name not in ignore_style_list:
167
+ cropped_images.append(cropped[style_name])
168
+ else:
169
+ cropped_images.append(None)
170
+
171
+ return [*cropped_images]
172
+
173
+ def get_image_files(image_path, ret_image=True, exts=['.jpg','.jpeg','.png']):
174
+ images = []
175
+ for ext in exts:
176
+ images += [x for x in glob(str(Path(image_path)/f'*{ext}'))]
177
+ if ret_image:
178
+ images = [Image.open(x) for x in images]
179
+ return images
180
+
181
+ def log_sample(seed, results, prompt, skeleton_image, mask_image, control_scales, *viscon_images):
182
+ time_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
183
+
184
+ log_dir = LOG_PATH/time_str
185
+ os.makedirs(str(log_dir), exist_ok=True)
186
+
187
+ # save result
188
+ concat = np.hstack((skeleton_image, *results))
189
+ Image.fromarray(skeleton_image).save(str(log_dir/'skeleton.jpg'))
190
+ Image.fromarray(mask_image).save(str(log_dir/'mask.png'))
191
+ for i, result in enumerate(results):
192
+ Image.fromarray(result).save(str(log_dir/f'result_{i}.jpg'))
193
+
194
+ # save text
195
+ with open(str(log_dir/'info.txt'),'w') as f:
196
+ f.write(f'prompt: {prompt} \n')
197
+ f.write(f'seed: {seed}\n')
198
+ control_str = [str(x) for x in control_scales]
199
+ f.write(','.join(control_str) + '\n')
200
+ # save vison images
201
+ for style_name, style_image in zip(style_names, viscon_images):
202
+ if style_image is not None:
203
+ style_image.save(str(log_dir/f'{style_name}.jpg'))
204
+
205
+
206
+ def process(prompt, a_prompt, n_prompt, num_samples,
207
+ ddim_steps, scale, seed, eta, mask_image, pose_image,
208
+ c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0,
209
+ *viscon_images):
210
+
211
+ with torch.no_grad():
212
+ control_scales = [c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0]
213
+ mask = torch.tensor(mask_image.mean(-1)/255.,dtype=torch.float) #(512,512), [0,1]
214
+ mask = mask.unsqueeze(0).to(device) # (1, 512, 512)
215
+ style_emb = encode_style_images(viscon_images)
216
+
217
+ # fix me
218
+ detected_map = HWC3(pose_image)
219
+ #detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
220
+ H, W, C = detected_map.shape
221
+ control = torch.from_numpy(detected_map.copy()).float().to(device) / 255.0
222
+ control = torch.stack([control for _ in range(num_samples)], dim=0)
223
+ control = einops.rearrange(control, 'b h w c -> b c h w').clone()
224
+
225
+ if seed == -1:
226
+ seed = random.randint(0, 65535)
227
+ seed_everything(seed)
228
+
229
+ if config.save_memory:
230
+ model.low_vram_shift(is_diffusing=False)
231
+ new_style_shape = [num_samples] + [1] * (len(style_emb.shape)-1)
232
+
233
+ cond = {"c_concat": [control],
234
+ "c_crossattn": [style_emb.repeat(new_style_shape)],
235
+ "c_text": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)],
236
+ 'c_concat_mask': [mask.repeat(num_samples, 1, 1, 1)]}
237
+
238
+ un_cond = {"c_concat": [control],
239
+ "c_crossattn": [torch.zeros_like(style_emb).repeat(new_style_shape)],
240
+ "c_text":[model.get_learned_conditioning([n_prompt] * num_samples)],
241
+ 'c_concat_mask': [torch.zeros_like(mask).repeat(num_samples, 1, 1, 1)]}
242
+
243
+ shape = (4, H // 8, W // 8)
244
+
245
+ if config.save_memory:
246
+ model.low_vram_shift(is_diffusing=True)
247
+
248
+ model.control_scales = control_scales
249
+
250
+ samples, _ = ddim_sampler.sample(ddim_steps, num_samples,
251
+ shape, cond, verbose=False, eta=eta,
252
+ unconditional_guidance_scale=scale,
253
+ unconditional_conditioning=un_cond)
254
+
255
+ if config.save_memory:
256
+ model.low_vram_shift(is_diffusing=False)
257
+
258
+ x_samples = model.decode_first_stage(samples)
259
+ x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
260
+
261
+ results = [x_samples[i] for i in range(num_samples)]
262
+
263
+ if LOG_SAMPLES:
264
+ log_sample(seed, results, prompt, detected_map, mask_image, control_scales, *viscon_images)
265
+ return results
266
+
267
+ def get_image(name, file_ext='.jpg'):
268
+ fname = str(VISCON_IMAGE_PATH/name)+file_ext
269
+ if not os.path.exists(fname):
270
+ return None
271
+ return Image.open(fname)
272
+
273
+ def get_image_numpy(name, file_ext='.png'):
274
+ fname = str(VISCON_IMAGE_PATH/name)+file_ext
275
+ if not os.path.exists(fname):
276
+ return None
277
+ return np.array(Image.open(fname))
278
+
279
+ def create_app():
280
+ block = gr.Blocks().queue()
281
+ with block:
282
+ with gr.Row():
283
+ gr.Markdown("## ViscoNet: Visual ControlNet with Human Pose and Fashion <br> [Video tutorial](https://youtu.be/85NyIuLeV00)")
284
+ with gr.Row():
285
+ with gr.Column():
286
+ with gr.Accordion("Get pose and mask", open=False):
287
+ with gr.Row():
288
+ input_image = gr.Image(source='upload', type="numpy", label='input image', value=np.array(get_image_numpy('ref')))
289
+ pose_image = gr.Image(source='upload', type="numpy", label='pose', value=np.array(get_image_numpy('pose')))
290
+ mask_image = gr.Image(source='upload', type="numpy", label='mask', value=np.array(get_image_numpy('mask')))
291
+ with gr.Accordion("Samples", open=False):
292
+ with gr.Tab('Female'):
293
+ samples = get_image_files(str(SAMPLE_IMAGE_PATH/'pose/WOMEN/'))
294
+ female_pose_gallery = gr.Gallery(label='pose', show_label=False, value=samples).style(grid=3, height='auto')
295
+ with gr.Tab('Male'):
296
+ samples = get_image_files(str(SAMPLE_IMAGE_PATH/'pose/MEN/'))
297
+ male_pose_gallery = gr.Gallery(label='pose', show_label=False, value=samples).style(grid=3, height='auto')
298
+ with gr.Row():
299
+ #pad_checkbox = gr.Checkbox(label='Pad pose to square', value=True)
300
+ ignorehead_checkbox = gr.Checkbox(label='Ignore face in masking (for DeepFake)', value=True)
301
+ ignorehair_checkbox = gr.Checkbox(label='Ignore hair in masking', value=False, visible=True)
302
+ with gr.Row():
303
+ #ignore_head_checkbox = gr.Checkbox(label='Ignore head', value=False)
304
+ get_pose_button = gr.Button(label="Get pose", value='Get pose')
305
+ get_fashion_button = gr.Button(label="Get visual", value='Get visual prompt')
306
+
307
+
308
+ with gr.Accordion("Visual Conditions", open=False):
309
+ gr.Markdown('Drag and drop images, or pick from the sample galleries below.')
310
+ with gr.Column():
311
+ viscon_images = []
312
+ viscon_images_names2index = {}
313
+ viscon_len = len(style_names)
314
+ v_idx = 0
315
+
316
+ with gr.Row():
317
+ for _ in range(8):
318
+ viscon_name = style_names[v_idx]
319
+ vis = False if viscon_name in ignore_style_list else True
320
+ viscon_images.append(gr.Image(source='upload', type="pil", min_height=112, min_width=112, label=viscon_name, value=get_image(viscon_name), visible=vis))
321
+ viscon_images_names2index[viscon_name] = v_idx
322
+ v_idx += 1
323
+
324
+ viscon_button = gr.Button(value='Save as Default',visible=False if DEMO else True)
325
+
326
+ viscon_galleries = []
327
+
328
+ with gr.Column():
329
+ with gr.Accordion("Female", open=False):
330
+ for garment, number in zip(['hair', 'top', 'bottom', 'outer'], [150, 500, 500, 250]):
331
+ with gr.Tab(garment):
332
+ samples = []
333
+ if WOMEN_GALLERY_PATH and os.path.exists(WOMEN_GALLERY_PATH):
334
+ samples = glob(os.path.join(WOMEN_GALLERY_PATH, f'**/{garment}.jpg'), recursive=True)
335
+ #samples = glob(f'/home/soon/datasets/deepfashion_inshop/styles_default/WOMEN/**/{garment}.jpg', recursive=True)
336
+ samples = random.choices(samples, k=number)
337
+ viscon_gallery = gr.Gallery(label=garment, allow_preview=False, show_label=False, value=samples).style(grid=4, height='auto')
338
+ viscon_galleries.append({'component':viscon_gallery, 'inputs':[garment]})
339
+ with gr.Accordion("Male", open=False):
340
+ for garment, number in zip(['hair', 'top', 'bottom', 'outer'], [150, 500, 500, 250]):
341
+ with gr.Tab(garment):
342
+ samples = []
343
+ if MEN_GALLERY_PATH and os.path.exists(MEN_GALLERY_PATH):
344
+ samples = glob(os.path.join(MEN_GALLERY_PATH, f'**/{garment}.jpg'), recursive=True)
345
+ samples = random.choices(samples, k=number)
346
+ viscon_gallery = gr.Gallery(label=garment, allow_preview=False, show_label=False, value=samples).style(grid=4, height='auto')
347
+ viscon_galleries.append({'component':viscon_gallery, 'inputs':[garment]})
348
+
349
+ with gr.Accordion("Control Strength Scaling", open=False):
350
+ gr.Markdown("smaller value for stronger textual influence. c12 is highest spatial resolution controlling textures")
351
+ with gr.Row():
352
+ strength_select = gr.Dropdown(list(SCALE_CONFIG.keys()), label='strength settings', value=DEFAULT_SCALE_CONFIG)
353
+ scale_all = gr.Slider(label=f'set all scales', minimum=0, maximum=1, value=DEFAULT_CONTROL_SCALE, step=0.05)
354
+ scale_values = SCALE_CONFIG[DEFAULT_SCALE_CONFIG]
355
+ control_scales = []
356
+ c_idx = 12
357
+ with gr.Accordion("Advanced settings", open=False):
358
+ with gr.Row():
359
+ for _ in range(3):
360
+ control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
361
+ c_idx -= 1
362
+ with gr.Row():
363
+ for _ in range(3):
364
+ control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
365
+ c_idx -= 1
366
+ with gr.Row():
367
+ for _ in range(3):
368
+ control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
369
+ c_idx -= 1
370
+ with gr.Row():
371
+ for _ in range(4):
372
+ control_scales.append(gr.Slider(label=f'c{c_idx}', minimum=0, maximum=1, value=scale_values[12-c_idx], step=0.05))
373
+ c_idx -= 1
374
+
375
+ with gr.Accordion("Advanced options", open=False):
376
+ with gr.Row():
377
+ detect_resolution = gr.Slider(label="OpenPose Resolution", minimum=128, maximum=512, value=512, step=1)
378
+ ddim_steps = gr.Slider(label="Steps", minimum=1, maximum=50, value=20, step=1)
379
+ scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=12.0, step=0.1)
380
+
381
+ eta = gr.Number(label="eta (DDIM)", value=0.0, visible=False)
382
+ a_prompt = gr.Textbox(label="Added Prompt", value='best quality, extremely detailed')
383
+ n_prompt = gr.Textbox(label="Negative Prompt",
384
+ value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, sunglasses, hat')
385
+ with gr.Column():
386
+ result_gallery = gr.Gallery(label='Output', show_label=False, show_download_button=True, elem_id="gallery").style(grid=1, height='auto')
387
+ with gr.Row():
388
+ max_samples = 8 if not DEMO else 4
389
+ num_samples = gr.Slider(label="Images", minimum=1, maximum=max_samples, value=1, step=1)
390
+ seed = gr.Slider(label="Seed (-1 for random)", minimum=-1, maximum=2147483647, step=1, value=1561194236)#randomize=True) #value=1561194234)
391
+ if not DEMO:
392
+ DF_DEMO = 'fashionWOMENTees_Tanksid0000762403_1front___fashionWOMENTees_Tanksid0000762403_1front'
393
+ DF_EVAL = 'fashionWOMENBlouses_Shirtsid0000035501_1front___fashionWOMENBlouses_Shirtsid0000035501_1front'
394
+ DF_RESULT ="fashionWOMENTees_Tanksid0000796209_1front___fashionWOMENTees_Tanksid0000796209_2side"
395
+ deepfashion_names = gr.Textbox(label='Deepfashion name', value=DF_EVAL)
396
+ gr.Markdown("Default config reconstruct image faithful to pose, mask and visual condition. Reduce control strength to tip balance towards text prompt for more creativity.")
397
+ prompt = gr.Textbox(label="Text Prompt", value="")
398
+
399
+ run_button = gr.Button(label="Run")
400
+
401
+
402
+ female_pose_gallery.select(fn=select_gallery_image, inputs=None, outputs=input_image)
403
+ male_pose_gallery.select(fn=select_gallery_image, inputs=None, outputs=input_image)
404
+ for vision_gallery in viscon_galleries:
405
+ viscon_idx = viscon_images_names2index[vision_gallery['inputs'][0]]
406
+ vision_gallery['component'].select(fn=select_gallery_image, inputs=None,
407
+ outputs=viscon_images[viscon_idx])
408
+ ips = [prompt, a_prompt, n_prompt, num_samples, ddim_steps, scale, seed, eta, mask_image, pose_image,
409
+ *control_scales, *viscon_images]
410
+ run_button.click(fn=process, inputs=ips, outputs=[result_gallery])
411
+ prompt.submit(fn=process, inputs=ips, outputs=[result_gallery])
412
+ get_pose_button.click(fn=extract_pose_mask, inputs=[input_image, detect_resolution,
413
+ ignorehead_checkbox, ignorehair_checkbox],
414
+ outputs=[pose_image, mask_image])
415
+ get_fashion_button.click(fn=extract_fashion, inputs=input_image, outputs=[*viscon_images])
416
+ viscon_button.click(fn=save_viscon_images, inputs=[*viscon_images], outputs=[*viscon_images])
417
+ strength_select.select(fn=select_default_strength, inputs=[strength_select], outputs=[*control_scales])
418
+ scale_all.release(fn=change_all_scales, inputs=[scale_all], outputs=[*control_scales])
419
+ if not DEMO:
420
+ deepfashion_names.submit(fn=fetch_deepfashion, inputs=[deepfashion_names], outputs=[input_image, pose_image, mask_image, *viscon_images])
421
+ return block
422
+
423
+ if __name__ == "__main__":
424
+ parser = argparse.ArgumentParser(description='ViscoNet Gradio demo.')
425
+
426
+ parser.add_argument('--gpu', type=int, default=0, help='GPU id')
427
+ parser.add_argument('--config', type=str, default='./configs/visconet_v1.yaml')
428
+ parser.add_argument('--ckpt', type=str, default='./models/visconet_v1.pth')
429
+ parser.add_argument('--public_link', action='store_true', default=False, help='Create a public Gradio share link')
430
+ args = parser.parse_args()
431
+
432
+ global device
433
+ global segmentor
434
+ global apply_openpose
435
+ global style_encoder
436
+ global model
437
+ global ddim_sampler
438
+
439
+ device = f'cuda:{args.gpu}' if torch.cuda.is_available() else 'cpu'
440
+ config_file = args.config
441
+ model_ckpt = args.ckpt
442
+
443
+ proj_config = OmegaConf.load(config_file)
444
+ style_names = proj_config.dataset.train.params.style_names
445
+ data_root = Path(proj_config.dataset.train.params.image_root)
446
+ image_root = data_root/proj_config.dataset.train.params.image_dir
447
+ style_root = data_root/proj_config.dataset.train.params.style_dir
448
+ pose_root = data_root/proj_config.dataset.train.params.pose_dir
449
+ mask_root = data_root/proj_config.dataset.train.params.mask_dir
450
+
451
+ segmentor = SegmentCropper()
452
+ apply_openpose = OpenposeDetector()
453
+
454
+ snapshot_download(repo_id=HF_REPO, local_dir='./models',
455
+ allow_patterns=os.path.basename(model_ckpt))
456
+
457
+ style_encoder = instantiate_from_config(proj_config.model.style_embedding_config).to(device)
458
+ model = create_model(config_file).cpu()
459
+ model.load_state_dict(load_state_dict(model_ckpt, location=device))
460
+
461
+ model = model.to(device)
462
+ model.cond_stage_model.device = device
463
+ ddim_sampler = DDIMSampler(model)
464
+
465
+ if not GALLERY_PATH.exists():
466
+ zip_name = 'fashion.zip'
467
+ snapshot_download(repo_id=HF_REPO, allow_patterns=zip_name, local_dir='.')
468
+ from zipfile import ZipFile
469
+ with ZipFile(zip_name, 'r') as zip_ref:
470
+ zip_ref.extractall('.')
471
+ os.remove(zip_name)
472
+
473
+ # Build the Gradio app and launch it
474
+ block = create_app()
475
+ block.launch(server_name='0.0.0.0', share=args.public_link)
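In non-demo mode the DeepFashion textbox expects the dataset's concatenated source___target naming, which convert_fname() splits and rebuilds into relative image paths. The sketch below restates that mapping as a standalone function for illustration (it is not the app's own helper) and works through the DF_EVAL default above:

import re

def deepfashion_paths(long_name):
    """Standalone restatement of convert_fname() for illustration only."""
    gender = 'MEN' if long_name[7:10] == 'MEN' else 'WOMEN'
    parts = long_name.replace('fashion', '').split('___')
    pattern = rf'{gender}(\w+)id(\d+)_(\d)(\w+)'
    matches = re.findall(pattern, ' '.join(parts))
    return [f'{gender}/{cat}/id_{idn[:8]}/{idn[8:]}_{view}_{desc}.jpg'
            for cat, idn, view, desc in matches]

name = ('fashionWOMENBlouses_Shirtsid0000035501_1front___'
        'fashionWOMENBlouses_Shirtsid0000035501_1front')
print(deepfashion_paths(name))
# ['WOMEN/Blouses_Shirts/id_00000355/01_1_front.jpg',
#  'WOMEN/Blouses_Shirts/id_00000355/01_1_front.jpg']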
app_files/default_images/mask.png ADDED
app_files/default_images/pose.png ADDED
app_files/default_images/ref.png ADDED
app_files/samples/pose/MEN/full_1.png ADDED
app_files/samples/pose/MEN/full_2.png ADDED
app_files/samples/pose/MEN/half_back.png ADDED
app_files/samples/pose/MEN/half_front.png ADDED
app_files/samples/pose/MEN/half_left.png ADDED
app_files/samples/pose/WOMEN/pose_0.png ADDED
app_files/samples/pose/WOMEN/pose_1.png ADDED
app_files/samples/pose/WOMEN/pose_2.png ADDED
app_files/samples/pose/WOMEN/pose_3.png ADDED
app_files/samples/pose/WOMEN/pose_4.png ADDED
app_files/samples/pose/WOMEN/pose_5.png ADDED
app_files/samples/pose/WOMEN/pose_6.png ADDED