toto10 committed
Commit 605f6d5
1 parent: dfc0d4a

676e2c59aa5f2e27514c3795349ecbd4b2cb266b3b631e4a273b82cd4c9536a9

Files changed (50)
  1. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/position_encoding.py +67 -0
  2. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/text_transformer.py +257 -0
  3. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/transformer.py +376 -0
  4. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/oneformer_model.py +470 -0
  5. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/__init__.py +2 -0
  6. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/box_ops.py +133 -0
  7. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/events.py +120 -0
  8. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/misc.py +197 -0
  9. extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/pos_embed.py +122 -0
  10. extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/__init__.py +1 -0
  11. extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/coco.py +444 -0
  12. extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/cocoeval.py +534 -0
  13. extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/mask.py +107 -0
  14. extensions/microsoftexcel-controlnet/annotator/openpose/LICENSE +108 -0
  15. extensions/microsoftexcel-controlnet/annotator/openpose/__init__.py +262 -0
  16. extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/__init__.cpython-310.pyc +0 -0
  17. extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/body.cpython-310.pyc +0 -0
  18. extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/face.cpython-310.pyc +0 -0
  19. extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/hand.cpython-310.pyc +0 -0
  20. extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/model.cpython-310.pyc +0 -0
  21. extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/util.cpython-310.pyc +0 -0
  22. extensions/microsoftexcel-controlnet/annotator/openpose/body.py +278 -0
  23. extensions/microsoftexcel-controlnet/annotator/openpose/face.py +362 -0
  24. extensions/microsoftexcel-controlnet/annotator/openpose/hand.py +94 -0
  25. extensions/microsoftexcel-controlnet/annotator/openpose/model.py +218 -0
  26. extensions/microsoftexcel-controlnet/annotator/openpose/util.py +383 -0
  27. extensions/microsoftexcel-controlnet/annotator/pidinet/LICENSE +21 -0
  28. extensions/microsoftexcel-controlnet/annotator/pidinet/__init__.py +51 -0
  29. extensions/microsoftexcel-controlnet/annotator/pidinet/model.py +653 -0
  30. extensions/microsoftexcel-controlnet/annotator/shuffle/__init__.py +74 -0
  31. extensions/microsoftexcel-controlnet/annotator/uniformer/LICENSE +203 -0
  32. extensions/microsoftexcel-controlnet/annotator/uniformer/__init__.py +56 -0
  33. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/ade20k.py +54 -0
  34. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/chase_db1.py +59 -0
  35. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes.py +54 -0
  36. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes_769x769.py +35 -0
  37. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/drive.py +59 -0
  38. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/hrf.py +59 -0
  39. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context.py +60 -0
  40. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context_59.py +60 -0
  41. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12.py +57 -0
  42. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py +9 -0
  43. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/stare.py +59 -0
  44. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/default_runtime.py +14 -0
  45. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py +46 -0
  46. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py +44 -0
  47. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py +44 -0
  48. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/cgnet.py +35 -0
  49. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py +44 -0
  50. extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py +44 -0
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/position_encoding.py ADDED
@@ -0,0 +1,67 @@
+ # ------------------------------------------------------------------------------
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/position_encoding.py
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+ # ------------------------------------------------------------------------------
+
+ """
+ Various positional encodings for the transformer.
+ """
+ import math
+
+ import torch
+ from torch import nn
+
+
+ class PositionEmbeddingSine(nn.Module):
+     """
+     This is a more standard version of the position embedding, very similar to the one
+     used by the Attention is all you need paper, generalized to work on images.
+     """
+
+     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+         super().__init__()
+         self.num_pos_feats = num_pos_feats
+         self.temperature = temperature
+         self.normalize = normalize
+         if scale is not None and normalize is False:
+             raise ValueError("normalize should be True if scale is passed")
+         if scale is None:
+             scale = 2 * math.pi
+         self.scale = scale
+
+     def forward(self, x, mask=None):
+         if mask is None:
+             mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
+         not_mask = ~mask
+         y_embed = not_mask.cumsum(1, dtype=torch.float32)
+         x_embed = not_mask.cumsum(2, dtype=torch.float32)
+         if self.normalize:
+             eps = 1e-6
+             y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
+             x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
+
+         dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+         dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+         pos_x = x_embed[:, :, :, None] / dim_t
+         pos_y = y_embed[:, :, :, None] / dim_t
+         pos_x = torch.stack(
+             (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
+         ).flatten(3)
+         pos_y = torch.stack(
+             (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
+         ).flatten(3)
+         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+         return pos
+
+     def __repr__(self, _repr_indent=4):
+         head = "Positional encoding " + self.__class__.__name__
+         body = [
+             "num_pos_feats: {}".format(self.num_pos_feats),
+             "temperature: {}".format(self.temperature),
+             "normalize: {}".format(self.normalize),
+             "scale: {}".format(self.scale),
+         ]
+         # _repr_indent = 4
+         lines = [head] + [" " * _repr_indent + line for line in body]
+         return "\n".join(lines)
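A minimal usage sketch of the PositionEmbeddingSine module added above; the feature-map shape and num_pos_feats value are illustrative assumptions, not values fixed by this commit, and the import path assumes the extension layout shown in the file list.

import torch
from annotator.oneformer.oneformer.modeling.transformer_decoder.position_encoding import PositionEmbeddingSine

# Hypothetical backbone feature map: batch 2, 256 channels, 32x32 spatial grid.
feats = torch.randn(2, 256, 32, 32)
# num_pos_feats=128 yields a 256-dim embedding (128 for y plus 128 for x).
pos_enc = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
pos = pos_enc(feats)   # mask=None means no padding is assumed
print(pos.shape)       # torch.Size([2, 256, 32, 32]), same spatial size as the features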
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/text_transformer.py ADDED
@@ -0,0 +1,257 @@
+ # -------------------------------------------------------------------------
+ # MIT License
+ #
+ # Copyright (c) 2021 OpenAI
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in all
+ # copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
+ #
+ # -------------------------------------------------------------------------
+
+ import torch
+ import torch.utils.checkpoint as checkpoint
+ from torch import nn
+ from collections import OrderedDict
+ from timm.models.layers import trunc_normal_
+
+ class Attention(nn.Module):
+     def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+         super().__init__()
+         self.num_heads = num_heads
+         head_dim = dim // num_heads
+         # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
+         self.scale = qk_scale or head_dim ** -0.5
+
+         self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
+         self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
+         self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)
+
+
+         self.attn_drop = nn.Dropout(attn_drop)
+         self.proj = nn.Linear(dim, dim)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+     def forward(self, q, k, v):
+         B, N, C = q.shape
+         assert k.shape == v.shape
+         B, M, C = k.shape
+         q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads)
+         k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads)
+         v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads)
+
+         attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale
+
+         attn = attn.softmax(dim=-1)
+
+         x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C)
+
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+ class TransformerDecoderLayer(nn.Module):
+     def __init__(
+         self,
+         d_model,
+         nhead,
+         dropout=0.1,
+     ):
+         super().__init__()
+         self.self_attn = Attention(d_model, nhead, proj_drop=dropout)
+         self.cross_attn = Attention(d_model, nhead, proj_drop=dropout)
+
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.norm3 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+
+         self.mlp = nn.Sequential(
+             nn.Linear(d_model, d_model * 4),
+             nn.GELU(),
+             nn.Dropout(dropout),
+             nn.Linear(d_model * 4, d_model)
+         )
+
+     def forward(self, x, mem):
+         q = k = v = self.norm1(x)
+         x = x + self.self_attn(q, k, v)
+         q = self.norm2(x)
+         x = x + self.cross_attn(q, mem, mem)
+         x = x + self.dropout(self.mlp(self.norm3(x)))
+         return x
+
+
+ class ContextDecoder(nn.Module):
+     def __init__(self,
+                  transformer_width=256,
+                  transformer_heads=4,
+                  transformer_layers=6,
+                  visual_dim=1024,
+                  dropout=0.1,
+                  **kwargs):
+         super().__init__()
+
+         self.memory_proj = nn.Sequential(
+             nn.LayerNorm(visual_dim),
+             nn.Linear(visual_dim, transformer_width),
+             nn.LayerNorm(transformer_width),
+         )
+
+         self.text_proj = nn.Sequential(
+             nn.LayerNorm(visual_dim),
+             nn.Linear(visual_dim, transformer_width),
+         )
+
+         self.decoder = nn.ModuleList([
+             TransformerDecoderLayer(transformer_width, transformer_heads, dropout) for _ in range(transformer_layers)
+         ])
+
+         self.out_proj = nn.Sequential(
+             nn.LayerNorm(transformer_width),
+             nn.Linear(transformer_width, visual_dim)
+         )
+
+         self.apply(self._init_weights)
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             trunc_normal_(m.weight, std=.02)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.LayerNorm):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+
+
+     def forward(self, text, visual):
+         B, N, C = visual.shape
+         visual = self.memory_proj(visual)
+         x = self.text_proj(text)
+
+         for layer in self.decoder:
+             x = layer(x, visual)
+
+         return self.out_proj(x)
+
+
+ class QuickGELU(nn.Module):
+
+     def forward(self, x: torch.Tensor):
+         return x * torch.sigmoid(1.702 * x)
+
+
+ class ResidualAttentionBlock(nn.Module):
+
+     def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
+         super().__init__()
+
+         self.attn = nn.MultiheadAttention(d_model, n_head)
+         self.ln_1 = nn.LayerNorm(d_model)
+         self.mlp = nn.Sequential(
+             OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), ('gelu', QuickGELU()),
+                          ('c_proj', nn.Linear(d_model * 4, d_model))]))
+         self.ln_2 = nn.LayerNorm(d_model)
+         self.attn_mask = attn_mask
+
+     def attention(self, x: torch.Tensor, key_padding_mask: torch.Tensor):
+         self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
+         return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask, key_padding_mask=key_padding_mask)[0]
+
+     def forward(self, x: torch.Tensor, key_padding_mask=None):
+         x = x + self.attention(self.ln_1(x), key_padding_mask=key_padding_mask)
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+ class Transformer(nn.Module):
+
+     def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False):
+         super().__init__()
+         self.width = width
+         self.layers = layers
+         self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
+         proj_std = (self.width**-0.5) * ((2 * self.layers)**-0.5)
+         attn_std = self.width**-0.5
+         fc_std = (2 * self.width)**-0.5
+         for block in self.resblocks:
+             nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
+             nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
+             nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
+             nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
+
+         self.use_checkpoint = use_checkpoint
+
+     def forward(self, x: torch.Tensor):
+         for resblock in self.resblocks:
+             if self.use_checkpoint:
+                 x = checkpoint.checkpoint(resblock, x)
+             else:
+                 x = resblock(x)
+         return x
+
+
+ class TextTransformer(nn.Module):
+
+     def __init__(
+         self,
+         context_length: int,
+         width: int,
+         layers: int,
+         vocab_size,
+         use_checkpoint=False,
+     ):
+
+         super().__init__()
+         heads = width // 64
+         self.context_length = context_length
+         self.width = width
+         self.transformer = Transformer(
+             width=width,
+             layers=layers,
+             heads=heads,
+             attn_mask=self.build_attention_mask(),
+             use_checkpoint=use_checkpoint)
+
+         self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
+         self.ln_final = nn.LayerNorm(width)
+         self.token_embedding = nn.Embedding(vocab_size, width)
+         nn.init.normal_(self.token_embedding.weight, std=0.02)
+
+         # initialization
+         nn.init.normal_(self.positional_embedding, std=0.01)
+
+     def build_attention_mask(self):
+         # lazily create causal attention mask, with full attention between the vision tokens
+         # pytorch uses additive attention mask; fill with -inf
+         mask = torch.empty(self.context_length, self.context_length)
+         mask.fill_(float('-inf'))
+         mask.triu_(1)  # zero out the lower diagonal
+         return mask
+
+     def forward(self, text):
+         x = self.token_embedding(text)
+         x = x + self.positional_embedding
+         x = x.permute(1, 0, 2)  # NLD -> LND
+         x = self.transformer(x)
+         x = x.permute(1, 0, 2)  # LND -> NLD
+         x = self.ln_final(x)
+
+         # x.shape = [batch_size, n_ctx, transformer.width]
+         # take features from the eot embedding (eot_token is the highest number in each sequence)
+         x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)]
+
+         return x
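A small sketch of driving the TextTransformer added above; the vocabulary size, context length, and token ids are made-up values for illustration, since in OneFormer the text tensor normally comes from its own tokenizer.

import torch
from annotator.oneformer.oneformer.modeling.transformer_decoder.text_transformer import TextTransformer

# Assumed hyper-parameters; the real values come from the OneFormer config.
encoder = TextTransformer(context_length=77, width=512, layers=6, vocab_size=49408)
# Fake batch of 2 tokenized captions; the EOT token is assumed to be the largest id.
text = torch.randint(0, 49407, (2, 77))
text[:, -1] = 49407  # put the EOT token last so argmax(dim=-1) selects that position
features = encoder(text)
print(features.shape)  # torch.Size([2, 512]), one embedding per caption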
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/transformer.py ADDED
@@ -0,0 +1,376 @@
+ # ------------------------------------------------------------------------------
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/transformer.py
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+ # ------------------------------------------------------------------------------
+
+ """
+ Transformer class.
+
+ Copy-paste from torch.nn.Transformer with modifications:
+     * positional encodings are passed in MHattention
+     * extra LN at the end of encoder is removed
+     * decoder returns a stack of activations from all decoding layers
+ """
+ import copy
+ from typing import List, Optional
+
+ import torch
+ import torch.nn.functional as F
+ from torch import Tensor, nn
+
+
+ class Transformer(nn.Module):
+     def __init__(
+         self,
+         d_model=512,
+         nhead=8,
+         num_encoder_layers=6,
+         num_decoder_layers=6,
+         dim_feedforward=2048,
+         dropout=0.1,
+         activation="relu",
+         normalize_before=False,
+         return_intermediate_dec=False,
+     ):
+         super().__init__()
+
+         encoder_layer = TransformerEncoderLayer(
+             d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+         )
+         encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
+         self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
+
+         decoder_layer = TransformerDecoderLayer(
+             d_model, nhead, dim_feedforward, dropout, activation, normalize_before
+         )
+         decoder_norm = nn.LayerNorm(d_model)
+         self.decoder = TransformerDecoder(
+             decoder_layer,
+             num_decoder_layers,
+             decoder_norm,
+             return_intermediate=return_intermediate_dec,
+         )
+
+         self._reset_parameters()
+
+         self.d_model = d_model
+         self.nhead = nhead
+
+     def _reset_parameters(self):
+         for p in self.parameters():
+             if p.dim() > 1:
+                 nn.init.xavier_uniform_(p)
+
+     def forward(self, src, mask, query_embed, pos_embed, task_token=None):
+         # flatten NxCxHxW to HWxNxC
+         bs, c, h, w = src.shape
+         src = src.flatten(2).permute(2, 0, 1)
+         pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
+         query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
+         if mask is not None:
+             mask = mask.flatten(1)
+
+         if task_token is None:
+             tgt = torch.zeros_like(query_embed)
+         else:
+             tgt = task_token.repeat(query_embed.shape[0], 1, 1)
+
+         memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
+         hs = self.decoder(
+             tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed
+         )
+         return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)
+
+
+ class TransformerEncoder(nn.Module):
+     def __init__(self, encoder_layer, num_layers, norm=None):
+         super().__init__()
+         self.layers = _get_clones(encoder_layer, num_layers)
+         self.num_layers = num_layers
+         self.norm = norm
+
+     def forward(
+         self,
+         src,
+         mask: Optional[Tensor] = None,
+         src_key_padding_mask: Optional[Tensor] = None,
+         pos: Optional[Tensor] = None,
+     ):
+         output = src
+
+         for layer in self.layers:
+             output = layer(
+                 output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos
+             )
+
+         if self.norm is not None:
+             output = self.norm(output)
+
+         return output
+
+
+ class TransformerDecoder(nn.Module):
+     def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
+         super().__init__()
+         self.layers = _get_clones(decoder_layer, num_layers)
+         self.num_layers = num_layers
+         self.norm = norm
+         self.return_intermediate = return_intermediate
+
+     def forward(
+         self,
+         tgt,
+         memory,
+         tgt_mask: Optional[Tensor] = None,
+         memory_mask: Optional[Tensor] = None,
+         tgt_key_padding_mask: Optional[Tensor] = None,
+         memory_key_padding_mask: Optional[Tensor] = None,
+         pos: Optional[Tensor] = None,
+         query_pos: Optional[Tensor] = None,
+     ):
+         output = tgt
+
+         intermediate = []
+
+         for layer in self.layers:
+             output = layer(
+                 output,
+                 memory,
+                 tgt_mask=tgt_mask,
+                 memory_mask=memory_mask,
+                 tgt_key_padding_mask=tgt_key_padding_mask,
+                 memory_key_padding_mask=memory_key_padding_mask,
+                 pos=pos,
+                 query_pos=query_pos,
+             )
+             if self.return_intermediate:
+                 intermediate.append(self.norm(output))
+
+         if self.norm is not None:
+             output = self.norm(output)
+             if self.return_intermediate:
+                 intermediate.pop()
+                 intermediate.append(output)
+
+         if self.return_intermediate:
+             return torch.stack(intermediate)
+
+         return output.unsqueeze(0)
+
+
+ class TransformerEncoderLayer(nn.Module):
+     def __init__(
+         self,
+         d_model,
+         nhead,
+         dim_feedforward=2048,
+         dropout=0.1,
+         activation="relu",
+         normalize_before=False,
+     ):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         # Implementation of Feedforward model
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+         self.activation = _get_activation_fn(activation)
+         self.normalize_before = normalize_before
+
+     def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+         return tensor if pos is None else tensor + pos
+
+     def forward_post(
+         self,
+         src,
+         src_mask: Optional[Tensor] = None,
+         src_key_padding_mask: Optional[Tensor] = None,
+         pos: Optional[Tensor] = None,
+     ):
+         q = k = self.with_pos_embed(src, pos)
+         src2 = self.self_attn(
+             q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+         )[0]
+         src = src + self.dropout1(src2)
+         src = self.norm1(src)
+         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+         src = src + self.dropout2(src2)
+         src = self.norm2(src)
+         return src
+
+     def forward_pre(
+         self,
+         src,
+         src_mask: Optional[Tensor] = None,
+         src_key_padding_mask: Optional[Tensor] = None,
+         pos: Optional[Tensor] = None,
+     ):
+         src2 = self.norm1(src)
+         q = k = self.with_pos_embed(src2, pos)
+         src2 = self.self_attn(
+             q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+         )[0]
+         src = src + self.dropout1(src2)
+         src2 = self.norm2(src)
+         src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
+         src = src + self.dropout2(src2)
+         return src
+
+     def forward(
+         self,
+         src,
+         src_mask: Optional[Tensor] = None,
+         src_key_padding_mask: Optional[Tensor] = None,
+         pos: Optional[Tensor] = None,
+     ):
+         if self.normalize_before:
+             return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
+         return self.forward_post(src, src_mask, src_key_padding_mask, pos)
+
+
+ class TransformerDecoderLayer(nn.Module):
+     def __init__(
+         self,
+         d_model,
+         nhead,
+         dim_feedforward=2048,
+         dropout=0.1,
+         activation="relu",
+         normalize_before=False,
+     ):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         # Implementation of Feedforward model
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.norm3 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+         self.dropout3 = nn.Dropout(dropout)
+
+         self.activation = _get_activation_fn(activation)
+         self.normalize_before = normalize_before
+
+     def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+         return tensor if pos is None else tensor + pos
+
+     def forward_post(
+         self,
+         tgt,
+         memory,
+         tgt_mask: Optional[Tensor] = None,
+         memory_mask: Optional[Tensor] = None,
+         tgt_key_padding_mask: Optional[Tensor] = None,
+         memory_key_padding_mask: Optional[Tensor] = None,
+         pos: Optional[Tensor] = None,
+         query_pos: Optional[Tensor] = None,
+     ):
+         q = k = self.with_pos_embed(tgt, query_pos)
+         tgt2 = self.self_attn(
+             q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
+         )[0]
+         tgt = tgt + self.dropout1(tgt2)
+         tgt = self.norm1(tgt)
+         tgt2 = self.multihead_attn(
+             query=self.with_pos_embed(tgt, query_pos),
+             key=self.with_pos_embed(memory, pos),
+             value=memory,
+             attn_mask=memory_mask,
+             key_padding_mask=memory_key_padding_mask,
+         )[0]
+         tgt = tgt + self.dropout2(tgt2)
+         tgt = self.norm2(tgt)
+         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+         tgt = tgt + self.dropout3(tgt2)
+         tgt = self.norm3(tgt)
+         return tgt
+
+     def forward_pre(
+         self,
+         tgt,
+         memory,
+         tgt_mask: Optional[Tensor] = None,
+         memory_mask: Optional[Tensor] = None,
+         tgt_key_padding_mask: Optional[Tensor] = None,
+         memory_key_padding_mask: Optional[Tensor] = None,
+         pos: Optional[Tensor] = None,
+         query_pos: Optional[Tensor] = None,
+     ):
+         tgt2 = self.norm1(tgt)
+         q = k = self.with_pos_embed(tgt2, query_pos)
+         tgt2 = self.self_attn(
+             q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
+         )[0]
+         tgt = tgt + self.dropout1(tgt2)
+         tgt2 = self.norm2(tgt)
+         tgt2 = self.multihead_attn(
+             query=self.with_pos_embed(tgt2, query_pos),
+             key=self.with_pos_embed(memory, pos),
+             value=memory,
+             attn_mask=memory_mask,
+             key_padding_mask=memory_key_padding_mask,
+         )[0]
+         tgt = tgt + self.dropout2(tgt2)
+         tgt2 = self.norm3(tgt)
+         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
+         tgt = tgt + self.dropout3(tgt2)
+         return tgt
+
+     def forward(
+         self,
+         tgt,
+         memory,
+         tgt_mask: Optional[Tensor] = None,
+         memory_mask: Optional[Tensor] = None,
+         tgt_key_padding_mask: Optional[Tensor] = None,
+         memory_key_padding_mask: Optional[Tensor] = None,
+         pos: Optional[Tensor] = None,
+         query_pos: Optional[Tensor] = None,
+     ):
+         if self.normalize_before:
+             return self.forward_pre(
+                 tgt,
+                 memory,
+                 tgt_mask,
+                 memory_mask,
+                 tgt_key_padding_mask,
+                 memory_key_padding_mask,
+                 pos,
+                 query_pos,
+             )
+         return self.forward_post(
+             tgt,
+             memory,
+             tgt_mask,
+             memory_mask,
+             tgt_key_padding_mask,
+             memory_key_padding_mask,
+             pos,
+             query_pos,
+         )
+
+
+ def _get_clones(module, N):
+     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+ def _get_activation_fn(activation):
+     """Return an activation function given a string"""
+     if activation == "relu":
+         return F.relu
+     if activation == "gelu":
+         return F.gelu
+     if activation == "glu":
+         return F.glu
+     raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
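A rough sketch of calling the DETR-style Transformer above together with PositionEmbeddingSine; the feature-map shape, hidden size, and number of queries are illustrative assumptions rather than values set by this commit.

import torch
from annotator.oneformer.oneformer.modeling.transformer_decoder.position_encoding import PositionEmbeddingSine
from annotator.oneformer.oneformer.modeling.transformer_decoder.transformer import Transformer

d_model, num_queries = 256, 100            # assumed sizes for the sketch
feats = torch.randn(2, d_model, 16, 16)    # stand-in for a backbone feature map
pos = PositionEmbeddingSine(d_model // 2, normalize=True)(feats)
query_embed = torch.randn(num_queries, d_model)

model = Transformer(d_model=d_model, nhead=8, return_intermediate_dec=True)
hs, memory = model(feats, mask=None, query_embed=query_embed, pos_embed=pos)
print(hs.shape)      # (num_decoder_layers, 2, num_queries, d_model)
print(memory.shape)  # (2, d_model, 16, 16)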
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/oneformer_model.py ADDED
@@ -0,0 +1,470 @@
+ # ------------------------------------------------------------------------------
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/maskformer_model.py
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+ # ------------------------------------------------------------------------------
+
+ from typing import Tuple
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ from annotator.oneformer.detectron2.config import configurable
+ from annotator.oneformer.detectron2.data import MetadataCatalog
+ from annotator.oneformer.detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
+ from annotator.oneformer.detectron2.modeling.backbone import Backbone
+ from annotator.oneformer.detectron2.modeling.postprocessing import sem_seg_postprocess
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, BitMasks
+ from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom
+
+ from .modeling.matcher import HungarianMatcher
+ from einops import rearrange
+ from .modeling.transformer_decoder.text_transformer import TextTransformer
+ from .modeling.transformer_decoder.oneformer_transformer_decoder import MLP
+ from annotator.oneformer.oneformer.data.tokenizer import SimpleTokenizer, Tokenize
+
+ @META_ARCH_REGISTRY.register()
+ class OneFormer(nn.Module):
+     """
+     Main class for mask classification semantic segmentation architectures.
+     """
+
+     @configurable
+     def __init__(
+         self,
+         *,
+         backbone: Backbone,
+         sem_seg_head: nn.Module,
+         task_mlp: nn.Module,
+         text_encoder: nn.Module,
+         text_projector: nn.Module,
+         prompt_ctx: nn.Embedding,
+         num_queries: int,
+         object_mask_threshold: float,
+         overlap_threshold: float,
+         metadata,
+         size_divisibility: int,
+         sem_seg_postprocess_before_inference: bool,
+         pixel_mean: Tuple[float],
+         pixel_std: Tuple[float],
+         # inference
+         semantic_on: bool,
+         panoptic_on: bool,
+         instance_on: bool,
+         detection_on: bool,
+         test_topk_per_image: int,
+         task_seq_len: int,
+         max_seq_len: int,
+         is_demo: bool,
+     ):
+         """
+         Args:
+             backbone: a backbone module, must follow detectron2's backbone interface
+             sem_seg_head: a module that predicts semantic segmentation from backbone features
+             criterion: a module that defines the loss
+             num_queries: int, number of queries
+             object_mask_threshold: float, threshold to filter query based on classification score
+                 for panoptic segmentation inference
+             overlap_threshold: overlap threshold used in general inference for panoptic segmentation
+             metadata: dataset meta, get `thing` and `stuff` category names for panoptic
+                 segmentation inference
+             size_divisibility: Some backbones require the input height and width to be divisible by a
+                 specific integer. We can use this to override such requirement.
+             sem_seg_postprocess_before_inference: whether to resize the prediction back
+                 to original input size before semantic segmentation inference or after.
+                 For high-resolution dataset like Mapillary, resizing predictions before
+                 inference will cause OOM error.
+             pixel_mean, pixel_std: list or tuple with #channels element, representing
+                 the per-channel mean and std to be used to normalize the input image
+             semantic_on: bool, whether to output semantic segmentation prediction
+             instance_on: bool, whether to output instance segmentation prediction
+             panoptic_on: bool, whether to output panoptic segmentation prediction
+             test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
+         """
+         super().__init__()
+         self.backbone = backbone
+         self.sem_seg_head = sem_seg_head
+         self.task_mlp = task_mlp
+         self.text_encoder = text_encoder
+         self.text_projector = text_projector
+         self.prompt_ctx = prompt_ctx
+         self.num_queries = num_queries
+         self.overlap_threshold = overlap_threshold
+         self.object_mask_threshold = object_mask_threshold
+         self.metadata = metadata
+         if size_divisibility < 0:
+             # use backbone size_divisibility if not set
+             size_divisibility = self.backbone.size_divisibility
+         self.size_divisibility = size_divisibility
+         self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
+         self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
+         self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
+
+         # additional args
+         self.semantic_on = semantic_on
+         self.instance_on = instance_on
+         self.panoptic_on = panoptic_on
+         self.detection_on = detection_on
+         self.test_topk_per_image = test_topk_per_image
+
+         self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
+         self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
+         self.is_demo = is_demo
+
+         self.thing_indices = [k for k in self.metadata.thing_dataset_id_to_contiguous_id.keys()]
+
+         if not self.semantic_on:
+             assert self.sem_seg_postprocess_before_inference
+
+     @classmethod
+     def from_config(cls, cfg):
+         backbone = build_backbone(cfg)
+         sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
+
+         if cfg.MODEL.IS_TRAIN:
+             text_encoder = TextTransformer(context_length=cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH,
+                                            width=cfg.MODEL.TEXT_ENCODER.WIDTH,
+                                            layers=cfg.MODEL.TEXT_ENCODER.NUM_LAYERS,
+                                            vocab_size=cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE)
+             text_projector = MLP(text_encoder.width, cfg.MODEL.ONE_FORMER.HIDDEN_DIM,
+                                  cfg.MODEL.ONE_FORMER.HIDDEN_DIM, cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS)
+             if cfg.MODEL.TEXT_ENCODER.N_CTX > 0:
+                 prompt_ctx = nn.Embedding(cfg.MODEL.TEXT_ENCODER.N_CTX, cfg.MODEL.TEXT_ENCODER.WIDTH)
+             else:
+                 prompt_ctx = None
+         else:
+             text_encoder = None
+             text_projector = None
+             prompt_ctx = None
+
+         task_mlp = MLP(cfg.INPUT.TASK_SEQ_LEN, cfg.MODEL.ONE_FORMER.HIDDEN_DIM,
+                        cfg.MODEL.ONE_FORMER.HIDDEN_DIM, 2)
+
+         # Loss parameters:
+         deep_supervision = cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION
+         no_object_weight = cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT
+
+         # loss weights
+         class_weight = cfg.MODEL.ONE_FORMER.CLASS_WEIGHT
+         dice_weight = cfg.MODEL.ONE_FORMER.DICE_WEIGHT
+         mask_weight = cfg.MODEL.ONE_FORMER.MASK_WEIGHT
+         contrastive_weight = cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT
+
+         # building criterion
+         matcher = HungarianMatcher(
+             cost_class=class_weight,
+             cost_mask=mask_weight,
+             cost_dice=dice_weight,
+             num_points=cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS,
+         )
+
+         weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight,
+                        "loss_dice": dice_weight, "loss_contrastive": contrastive_weight}
+
+
+         if deep_supervision:
+             dec_layers = cfg.MODEL.ONE_FORMER.DEC_LAYERS
+             aux_weight_dict = {}
+             for i in range(dec_layers - 1):
+                 aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+             weight_dict.update(aux_weight_dict)
+
+         losses = ["labels", "masks", "contrastive"]
+
+         return {
+             "backbone": backbone,
+             "sem_seg_head": sem_seg_head,
+             "task_mlp": task_mlp,
+             "prompt_ctx": prompt_ctx,
+             "text_encoder": text_encoder,
+             "text_projector": text_projector,
+             "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES,
+             "object_mask_threshold": cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD,
+             "overlap_threshold": cfg.MODEL.TEST.OVERLAP_THRESHOLD,
+             "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
+             "size_divisibility": cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY,
+             "sem_seg_postprocess_before_inference": (
+                 cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
+                 or cfg.MODEL.TEST.PANOPTIC_ON
+                 or cfg.MODEL.TEST.INSTANCE_ON
+             ),
+             "pixel_mean": cfg.MODEL.PIXEL_MEAN,
+             "pixel_std": cfg.MODEL.PIXEL_STD,
+             # inference
+             "semantic_on": cfg.MODEL.TEST.SEMANTIC_ON,
+             "instance_on": cfg.MODEL.TEST.INSTANCE_ON,
+             "panoptic_on": cfg.MODEL.TEST.PANOPTIC_ON,
+             "detection_on": cfg.MODEL.TEST.DETECTION_ON,
+             "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
+             "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
+             "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
+             "is_demo": cfg.MODEL.IS_DEMO,
+         }
+
+     @property
+     def device(self):
+         return self.pixel_mean.device
+
+     def encode_text(self, text):
+         assert text.ndim in [2, 3], text.ndim
+         b = text.shape[0]
+         squeeze_dim = False
+         num_text = 1
+         if text.ndim == 3:
+             num_text = text.shape[1]
+             text = rearrange(text, 'b n l -> (b n) l', n=num_text)
+             squeeze_dim = True
+
+         # [B, C]
+         x = self.text_encoder(text)
+
+         text_x = self.text_projector(x)
+
+         if squeeze_dim:
+             text_x = rearrange(text_x, '(b n) c -> b n c', n=num_text)
+             if self.prompt_ctx is not None:
+                 text_ctx = self.prompt_ctx.weight.unsqueeze(0).repeat(text_x.shape[0], 1, 1)
+                 text_x = torch.cat([text_x, text_ctx], dim=1)
+
+         return {"texts": text_x}
+
+     def forward(self, batched_inputs):
+         """
+         Args:
+             batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                 Each item in the list contains the inputs for one image.
+                 For now, each item in the list is a dict that contains:
+                    * "image": Tensor, image in (C, H, W) format.
+                    * "instances": per-region ground truth
+                    * Other information that's included in the original dicts, such as:
+                      "height", "width" (int): the output resolution of the model (may be different
+                      from input resolution), used in inference.
+         Returns:
+             list[dict]:
+                 each dict has the results for one image. The dict contains the following keys:
+                 * "sem_seg":
+                     A Tensor that represents the
+                     per-pixel segmentation prediced by the head.
+                     The prediction has shape KxHxW that represents the logits of
+                     each class for each pixel.
+                 * "panoptic_seg":
+                     A tuple that represent panoptic output
+                     panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
+                     segments_info (list[dict]): Describe each segment in `panoptic_seg`.
+                         Each dict contains keys "id", "category_id", "isthing".
+         """
+         images = [x["image"].to(self.device) for x in batched_inputs]
+         images = [(x - self.pixel_mean) / self.pixel_std for x in images]
+         images = ImageList.from_tensors(images, self.size_divisibility)
+
+         tasks = torch.cat([self.task_tokenizer(x["task"]).to(self.device).unsqueeze(0) for x in batched_inputs], dim=0)
+         tasks = self.task_mlp(tasks.float())
+
+         features = self.backbone(images.tensor)
+         outputs = self.sem_seg_head(features, tasks)
+
+         if self.training:
+             texts = torch.cat([self.text_tokenizer(x["text"]).to(self.device).unsqueeze(0) for x in batched_inputs], dim=0)
+             texts_x = self.encode_text(texts)
+
+             outputs = {**outputs, **texts_x}
+
+             # mask classification target
+             if "instances" in batched_inputs[0]:
+                 gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+                 targets = self.prepare_targets(gt_instances, images)
+             else:
+                 targets = None
+
+             # bipartite matching-based loss
+             losses = self.criterion(outputs, targets)
+
+             for k in list(losses.keys()):
+                 if k in self.criterion.weight_dict:
+                     losses[k] *= self.criterion.weight_dict[k]
+                 else:
+                     # remove this loss if not specified in `weight_dict`
+                     losses.pop(k)
+             return losses
+         else:
+             mask_cls_results = outputs["pred_logits"]
+             mask_pred_results = outputs["pred_masks"]
+             # upsample masks
+             mask_pred_results = F.interpolate(
+                 mask_pred_results,
+                 size=(images.tensor.shape[-2], images.tensor.shape[-1]),
+                 mode="bilinear",
+                 align_corners=False,
+             )
+
+             del outputs
+
+             processed_results = []
+             for i, data in enumerate(zip(
+                 mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
+             )):
+                 mask_cls_result, mask_pred_result, input_per_image, image_size = data
+                 height = input_per_image.get("height", image_size[0])
+                 width = input_per_image.get("width", image_size[1])
+                 processed_results.append({})
+
+                 if self.sem_seg_postprocess_before_inference:
+                     mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
+                         mask_pred_result, image_size, height, width
+                     )
+                     mask_cls_result = mask_cls_result.to(mask_pred_result)
+
+                 # semantic segmentation inference
+                 if self.semantic_on:
+                     r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result)
+                     if not self.sem_seg_postprocess_before_inference:
+                         r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
+                     processed_results[-1]["sem_seg"] = r
+
+                 # panoptic segmentation inference
+                 if self.panoptic_on:
+                     panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result)
+                     processed_results[-1]["panoptic_seg"] = panoptic_r
+
+                 # instance segmentation inference
+                 if self.instance_on:
+                     instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result)
+                     processed_results[-1]["instances"] = instance_r
+
+                 if self.detection_on:
+                     bbox_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result)
+                     processed_results[-1]["box_instances"] = bbox_r
+
+             return processed_results
+
+     def prepare_targets(self, targets, images):
+         h_pad, w_pad = images.tensor.shape[-2:]
+         new_targets = []
+         for targets_per_image in targets:
+             # pad gt
+             gt_masks = targets_per_image.gt_masks
+             padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
+             padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
+             new_targets.append(
+                 {
+                     "labels": targets_per_image.gt_classes,
+                     "masks": padded_masks,
+                 }
+             )
+         return new_targets
+
+     def semantic_inference(self, mask_cls, mask_pred):
+         mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
+         mask_pred = mask_pred.sigmoid()
+         semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
+         return semseg
+
+     def panoptic_inference(self, mask_cls, mask_pred):
+         scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
+         mask_pred = mask_pred.sigmoid()
+
+         keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
+         cur_scores = scores[keep]
+         cur_classes = labels[keep]
+         cur_masks = mask_pred[keep]
+         cur_mask_cls = mask_cls[keep]
+         cur_mask_cls = cur_mask_cls[:, :-1]
+
+         cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
+
+         h, w = cur_masks.shape[-2:]
+         panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
+         segments_info = []
+
+         current_segment_id = 0
+
+         if cur_masks.shape[0] == 0:
+             # We didn't detect any mask :(
+             return panoptic_seg, segments_info
+         else:
+             # take argmax
+             cur_mask_ids = cur_prob_masks.argmax(0)
+             stuff_memory_list = {}
+             for k in range(cur_classes.shape[0]):
+                 pred_class = cur_classes[k].item()
+                 isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values()
+                 mask_area = (cur_mask_ids == k).sum().item()
+                 original_area = (cur_masks[k] >= 0.5).sum().item()
+                 mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)
+
+                 if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
+                     if mask_area / original_area < self.overlap_threshold:
+                         continue
+
+                     # merge stuff regions
+                     if not isthing:
+                         if int(pred_class) in stuff_memory_list.keys():
+                             panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
+                             continue
+                         else:
+                             stuff_memory_list[int(pred_class)] = current_segment_id + 1
+
+                     current_segment_id += 1
+                     panoptic_seg[mask] = current_segment_id
+
+                     segments_info.append(
+                         {
+                             "id": current_segment_id,
+                             "isthing": bool(isthing),
+                             "category_id": int(pred_class),
+                         }
+                     )
+
+             return panoptic_seg, segments_info
+
+     def instance_inference(self, mask_cls, mask_pred):
+         # mask_pred is already processed to have the same shape as original input
+         image_size = mask_pred.shape[-2:]
+
+         # [Q, K]
+         scores = F.softmax(mask_cls, dim=-1)[:, :-1]
+         labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)
+
+         # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False)
+         scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False)
+         labels_per_image = labels[topk_indices]
+
+         topk_indices = topk_indices // self.sem_seg_head.num_classes
+         # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1)
+         mask_pred = mask_pred[topk_indices]
+
+         # Only consider scores with confidence over [self.object_mask_threshold] for demo
+         if self.is_demo:
+             keep = scores_per_image > self.object_mask_threshold
+             scores_per_image = scores_per_image[keep]
+             labels_per_image = labels_per_image[keep]
+             mask_pred = mask_pred[keep]
+
+         # if this is panoptic segmentation, we only keep the "thing" classes
+         if self.panoptic_on:
+             keep = torch.zeros_like(scores_per_image).bool()
+             for i, lab in enumerate(labels_per_image):
+                 keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values()
+
+             scores_per_image = scores_per_image[keep]
+             labels_per_image = labels_per_image[keep]
+             mask_pred = mask_pred[keep]
+
+         if 'ade20k' in self.metadata.name:
+             for i in range(labels_per_image.shape[0]):
+                 labels_per_image[i] = self.thing_indices.index(labels_per_image[i].item())
+
+         result = Instances(image_size)
+         # mask (before sigmoid)
+         result.pred_masks = (mask_pred > 0).float()
+         if self.detection_on:
+             # Uncomment the following to get boxes from masks (this is slow)
+             result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()
+         else:
+             result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))
+
+         # calculate average mask prob
+         mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
+         result.scores = scores_per_image * mask_scores_per_image
+         result.pred_classes = labels_per_image
+         return result
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ from .events import setup_wandb, WandbWriter
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/box_ops.py ADDED
@@ -0,0 +1,133 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+ """
+ Utilities for bounding box manipulation and GIoU.
+ """
+ import torch, os
+ from torchvision.ops.boxes import box_area
+
+
+ def box_cxcywh_to_xyxy(x):
+     x_c, y_c, w, h = x.unbind(-1)
+     b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
+          (x_c + 0.5 * w), (y_c + 0.5 * h)]
+     return torch.stack(b, dim=-1)
+
+
+ def box_xyxy_to_cxcywh(x):
+     x0, y0, x1, y1 = x.unbind(-1)
+     b = [(x0 + x1) / 2, (y0 + y1) / 2,
+          (x1 - x0), (y1 - y0)]
+     return torch.stack(b, dim=-1)
+
+
+ # modified from torchvision to also return the union
+ def box_iou(boxes1, boxes2):
+     area1 = box_area(boxes1)
+     area2 = box_area(boxes2)
+
+     # import ipdb; ipdb.set_trace()
+     lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+     rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+     wh = (rb - lt).clamp(min=0)  # [N,M,2]
+     inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
+
+     union = area1[:, None] + area2 - inter
+
+     iou = inter / (union + 1e-6)
+     return iou, union
+
+
+ def generalized_box_iou(boxes1, boxes2):
+     """
+     Generalized IoU from https://giou.stanford.edu/
+     The boxes should be in [x0, y0, x1, y1] format
+     Returns a [N, M] pairwise matrix, where N = len(boxes1)
+     and M = len(boxes2)
+     """
+     # degenerate boxes gives inf / nan results
+     # so do an early check
+     assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+     assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+     # except:
+     #     import ipdb; ipdb.set_trace()
+     iou, union = box_iou(boxes1, boxes2)
+
+     lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+     rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+     wh = (rb - lt).clamp(min=0)  # [N,M,2]
+     area = wh[:, :, 0] * wh[:, :, 1]
+
+     return iou - (area - union) / (area + 1e-6)
+
+
+
+ # modified from torchvision to also return the union
+ def box_iou_pairwise(boxes1, boxes2):
+     area1 = box_area(boxes1)
+     area2 = box_area(boxes2)
+
+     lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N,2]
+     rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N,2]
+
+     wh = (rb - lt).clamp(min=0)  # [N,2]
+     inter = wh[:, 0] * wh[:, 1]  # [N]
+
+     union = area1 + area2 - inter
+
+     iou = inter / union
+     return iou, union
+
+
+ def generalized_box_iou_pairwise(boxes1, boxes2):
+     """
+     Generalized IoU from https://giou.stanford.edu/
+     Input:
+         - boxes1, boxes2: N,4
+     Output:
+         - giou: N, 4
+     """
+     # degenerate boxes gives inf / nan results
+     # so do an early check
+     assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
+     assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
+     assert boxes1.shape == boxes2.shape
+     iou, union = box_iou_pairwise(boxes1, boxes2)  # N, 4
+
+     lt = torch.min(boxes1[:, :2], boxes2[:, :2])
+     rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])
+
+     wh = (rb - lt).clamp(min=0)  # [N,2]
+     area = wh[:, 0] * wh[:, 1]
+
+     return iou - (area - union) / area
+
+ def masks_to_boxes(masks):
+     """Compute the bounding boxes around the provided masks
+     The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
+     Returns a [N, 4] tensors, with the boxes in xyxy format
+     """
+     if masks.numel() == 0:
+         return torch.zeros((0, 4), device=masks.device)
+
+     h, w = masks.shape[-2:]
+
+     y = torch.arange(0, h, dtype=torch.float)
+     x = torch.arange(0, w, dtype=torch.float)
+     y, x = torch.meshgrid(y, x)
+
+     x_mask = (masks * x.unsqueeze(0))
+     x_max = x_mask.flatten(1).max(-1)[0]
+     x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+     y_mask = (masks * y.unsqueeze(0))
+     y_max = y_mask.flatten(1).max(-1)[0]
+     y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
+
+     return torch.stack([x_min, y_min, x_max, y_max], 1)
+
+ if __name__ == '__main__':
+     x = torch.rand(5, 4)
+     y = torch.rand(3, 4)
+     iou, union = box_iou(x, y)
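For reference, a tiny check of the pairwise GIoU helper added above, using hand-picked boxes (values rounded); the import path assumes the extension layout shown in the file list.

import torch
from annotator.oneformer.oneformer.utils.box_ops import generalized_box_iou

boxes1 = torch.tensor([[0., 0., 2., 2.]])   # 2x2 box
boxes2 = torch.tensor([[1., 1., 3., 3.]])   # overlapping 2x2 box
# intersection = 1, union = 7, enclosing box area = 9
# GIoU = IoU - (enclosure - union) / enclosure = 1/7 - 2/9, roughly -0.079
print(generalized_box_iou(boxes1, boxes2))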
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/events.py ADDED
@@ -0,0 +1,120 @@
1
+ import os
2
+ import wandb
3
+ from annotator.oneformer.detectron2.utils import comm
4
+ from annotator.oneformer.detectron2.utils.events import EventWriter, get_event_storage
5
+
6
+
7
+ def setup_wandb(cfg, args):
8
+ if comm.is_main_process():
9
+ init_args = {
10
+ k.lower(): v
11
+ for k, v in cfg.WANDB.items()
12
+ if isinstance(k, str) and k not in ["config"]
13
+ }
14
+ # only include most related part to avoid too big table
15
+ # TODO: add configurable params to select which part of `cfg` should be saved in config
16
+ if "config_exclude_keys" in init_args:
17
+ init_args["config"] = cfg
18
+ init_args["config"]["cfg_file"] = args.config_file
19
+ else:
20
+ init_args["config"] = {
21
+ "model": cfg.MODEL,
22
+ "solver": cfg.SOLVER,
23
+ "cfg_file": args.config_file,
24
+ }
25
+ if ("name" not in init_args) or (init_args["name"] is None):
26
+ init_args["name"] = os.path.basename(args.config_file)
27
+ else:
28
+ init_args["name"] = init_args["name"] + '_' + os.path.basename(args.config_file)
29
+ wandb.init(**init_args)
30
+
31
+
32
+ class BaseRule(object):
33
+ def __call__(self, target):
34
+ return target
35
+
36
+
37
+ class IsIn(BaseRule):
38
+ def __init__(self, keyword: str):
39
+ self.keyword = keyword
40
+
41
+ def __call__(self, target):
42
+ return self.keyword in target
43
+
44
+
45
+ class Prefix(BaseRule):
46
+ def __init__(self, keyword: str):
47
+ self.keyword = keyword
48
+
49
+ def __call__(self, target):
50
+ return "/".join([self.keyword, target])
51
+
52
+
53
+ class WandbWriter(EventWriter):
54
+ """
55
+ Write all scalars to wandb.
56
+ """
57
+
58
+ def __init__(self):
59
+ """
60
+ Takes no arguments. Sets up the grouping rules that map
61
+ scalar names to wandb sections (e.g. loss scalars are logged
62
+ under the "train/" prefix).
63
+ """
64
+ self._last_write = -1
65
+ self._group_rules = [
66
+ (IsIn("/"), BaseRule()),
67
+ (IsIn("loss"), Prefix("train")),
68
+ ]
69
+
70
+ def write(self):
71
+
72
+ storage = get_event_storage()
73
+
74
+ def _group_name(scalar_name):
75
+ for (rule, op) in self._group_rules:
76
+ if rule(scalar_name):
77
+ return op(scalar_name)
78
+ return scalar_name
79
+
80
+ stats = {
81
+ _group_name(name): scalars[0]
82
+ for name, scalars in storage.latest().items()
83
+ if scalars[1] > self._last_write
84
+ }
85
+ if len(stats) > 0:
86
+ self._last_write = max([v[1] for k, v in storage.latest().items()])
87
+
88
+ # storage.put_{image,histogram} is only meant to be used by
89
+ # tensorboard writer. So we access its internal fields directly from here.
90
+ if len(storage._vis_data) >= 1:
91
+ stats["image"] = [
92
+ wandb.Image(img, caption=img_name)
93
+ for img_name, img, step_num in storage._vis_data
94
+ ]
95
+ # Storage stores all image data and rely on this writer to clear them.
96
+ # As a result it assumes only one writer will use its image data.
97
+ # An alternative design is to let storage store limited recent
98
+ # data (e.g. only the most recent image) that all writers can access.
99
+ # In that case a writer may not see all image data if its period is long.
100
+ storage.clear_images()
101
+
102
+ if len(storage._histograms) >= 1:
103
+
104
+ def create_bar(tag, bucket_limits, bucket_counts, **kwargs):
105
+ data = [
106
+ [label, val] for (label, val) in zip(bucket_limits, bucket_counts)
107
+ ]
108
+ table = wandb.Table(data=data, columns=["label", "value"])
109
+ return wandb.plot.bar(table, "label", "value", title=tag)
110
+
111
+ stats["hist"] = [create_bar(**params) for params in storage._histograms]
112
+
113
+ storage.clear_histograms()
114
+
115
+ if len(stats) == 0:
116
+ return
117
+ wandb.log(stats, step=storage.iter)
118
+
119
+ def close(self):
120
+ wandb.finish()
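`WandbWriter.write()` renames scalars with the `_group_rules` list before sending them to wandb. The sketch below shows that grouping logic in isolation; the helper `group_name` is illustrative rather than part of the module, and importing from `events.py` assumes `wandb` and detectron2 are installed since the module imports both at the top:

```python
# Sketch only: shows how the grouping rules rewrite scalar keys before logging.
# Assumes wandb and detectron2 are installed, since events.py imports both.
from annotator.oneformer.oneformer.utils.events import BaseRule, IsIn, Prefix

rules = [
    (IsIn("/"), BaseRule()),          # already-namespaced keys pass through unchanged
    (IsIn("loss"), Prefix("train")),  # loss scalars get grouped under "train/"
]

def group_name(scalar_name: str) -> str:
    # Illustrative re-implementation of the _group_name closure inside write().
    for rule, op in rules:
        if rule(scalar_name):
            return op(scalar_name)
    return scalar_name

print(group_name("total_loss"))    # -> "train/total_loss"
print(group_name("val/accuracy"))  # -> "val/accuracy"
print(group_name("lr"))            # -> "lr"
```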
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/misc.py ADDED
@@ -0,0 +1,197 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ # Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
3
+ """
4
+ Misc functions, including distributed helpers.
5
+
6
+ Mostly copy-paste from torchvision references.
7
+ """
8
+ from typing import List, Optional
9
+
10
+ import torch
11
+ import torch.distributed as dist
12
+ import torchvision
13
+ from torch import Tensor
14
+ import warnings
15
+ import torch.nn.functional as F
16
+ import math
17
+
18
+ def inverse_sigmoid(x, eps=1e-3):
19
+ x = x.clamp(min=0, max=1)
20
+ x1 = x.clamp(min=eps)
21
+ x2 = (1 - x).clamp(min=eps)
22
+ return torch.log(x1/x2)
23
+
24
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
25
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
26
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
27
+ def norm_cdf(x):
28
+ # Computes standard normal cumulative distribution function
29
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
30
+
31
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
32
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
33
+ "The distribution of values may be incorrect.",
34
+ stacklevel=2)
35
+
36
+ with torch.no_grad():
37
+ # Values are generated by using a truncated uniform distribution and
38
+ # then using the inverse CDF for the normal distribution.
39
+ # Get upper and lower cdf values
40
+ l = norm_cdf((a - mean) / std)
41
+ u = norm_cdf((b - mean) / std)
42
+
43
+ # Uniformly fill tensor with values from [l, u], then translate to
44
+ # [2l-1, 2u-1].
45
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
46
+
47
+ # Use inverse cdf transform for normal distribution to get truncated
48
+ # standard normal
49
+ tensor.erfinv_()
50
+
51
+ # Transform to proper mean, std
52
+ tensor.mul_(std * math.sqrt(2.))
53
+ tensor.add_(mean)
54
+
55
+ # Clamp to ensure it's in the proper range
56
+ tensor.clamp_(min=a, max=b)
57
+ return tensor
58
+
59
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
60
+ # type: (Tensor, float, float, float, float) -> Tensor
61
+ r"""Fills the input Tensor with values drawn from a truncated
62
+ normal distribution. The values are effectively drawn from the
63
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
64
+ with values outside :math:`[a, b]` redrawn until they are within
65
+ the bounds. The method used for generating the random values works
66
+ best when :math:`a \leq \text{mean} \leq b`.
67
+ Args:
68
+ tensor: an n-dimensional `torch.Tensor`
69
+ mean: the mean of the normal distribution
70
+ std: the standard deviation of the normal distribution
71
+ a: the minimum cutoff value
72
+ b: the maximum cutoff value
73
+ Examples:
74
+ >>> w = torch.empty(3, 5)
75
+ >>> nn.init.trunc_normal_(w)
76
+ """
77
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
78
+
79
+ def resize(input,
80
+ size=None,
81
+ scale_factor=None,
82
+ mode='nearest',
83
+ align_corners=None,
84
+ warning=True):
85
+ if warning:
86
+ if size is not None and align_corners:
87
+ input_h, input_w = tuple(int(x) for x in input.shape[2:])
88
+ output_h, output_w = tuple(int(x) for x in size)
89
+ if output_h > input_h or output_w > input_w:
90
+ if ((output_h > 1 and output_w > 1 and input_h > 1
91
+ and input_w > 1) and (output_h - 1) % (input_h - 1)
92
+ and (output_w - 1) % (input_w - 1)):
93
+ warnings.warn(
94
+ f'When align_corners={align_corners}, '
95
+ 'the output would be more aligned if '
96
+ f'input size {(input_h, input_w)} is `x+1` and '
97
+ f'out size {(output_h, output_w)} is `nx+1`')
98
+ if isinstance(size, torch.Size):
99
+ size = tuple(int(x) for x in size)
100
+ return F.interpolate(input, size, scale_factor, mode, align_corners)
101
+
102
+ def _max_by_axis(the_list):
103
+ # type: (List[List[int]]) -> List[int]
104
+ maxes = the_list[0]
105
+ for sublist in the_list[1:]:
106
+ for index, item in enumerate(sublist):
107
+ maxes[index] = max(maxes[index], item)
108
+ return maxes
109
+
110
+
111
+ class NestedTensor(object):
112
+ def __init__(self, tensors, mask: Optional[Tensor]):
113
+ self.tensors = tensors
114
+ self.mask = mask
115
+
116
+ def to(self, device):
117
+ # type: (Device) -> NestedTensor # noqa
118
+ cast_tensor = self.tensors.to(device)
119
+ mask = self.mask
120
+ if mask is not None:
121
+ assert mask is not None
122
+ cast_mask = mask.to(device)
123
+ else:
124
+ cast_mask = None
125
+ return NestedTensor(cast_tensor, cast_mask)
126
+
127
+ def decompose(self):
128
+ return self.tensors, self.mask
129
+
130
+ def __repr__(self):
131
+ return str(self.tensors)
132
+
133
+
134
+ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
135
+ # TODO make this more general
136
+ if tensor_list[0].ndim == 3:
137
+ if torchvision._is_tracing():
138
+ # nested_tensor_from_tensor_list() does not export well to ONNX
139
+ # call _onnx_nested_tensor_from_tensor_list() instead
140
+ return _onnx_nested_tensor_from_tensor_list(tensor_list)
141
+
142
+ # TODO make it support different-sized images
143
+ max_size = _max_by_axis([list(img.shape) for img in tensor_list])
144
+ # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
145
+ batch_shape = [len(tensor_list)] + max_size
146
+ b, c, h, w = batch_shape
147
+ dtype = tensor_list[0].dtype
148
+ device = tensor_list[0].device
149
+ tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
150
+ mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
151
+ for img, pad_img, m in zip(tensor_list, tensor, mask):
152
+ pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
153
+ m[: img.shape[1], : img.shape[2]] = False
154
+ else:
155
+ raise ValueError("not supported")
156
+ return NestedTensor(tensor, mask)
157
+
158
+
159
+ # _onnx_nested_tensor_from_tensor_list() is an implementation of
160
+ # nested_tensor_from_tensor_list() that is supported by ONNX tracing.
161
+ @torch.jit.unused
162
+ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
163
+ max_size = []
164
+ for i in range(tensor_list[0].dim()):
165
+ max_size_i = torch.max(
166
+ torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
167
+ ).to(torch.int64)
168
+ max_size.append(max_size_i)
169
+ max_size = tuple(max_size)
170
+
171
+ # work around for
172
+ # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
173
+ # m[: img.shape[1], :img.shape[2]] = False
174
+ # which is not yet supported in onnx
175
+ padded_imgs = []
176
+ padded_masks = []
177
+ for img in tensor_list:
178
+ padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
179
+ padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
180
+ padded_imgs.append(padded_img)
181
+
182
+ m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
183
+ padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
184
+ padded_masks.append(padded_mask.to(torch.bool))
185
+
186
+ tensor = torch.stack(padded_imgs)
187
+ mask = torch.stack(padded_masks)
188
+
189
+ return NestedTensor(tensor, mask=mask)
190
+
191
+
192
+ def is_dist_avail_and_initialized():
193
+ if not dist.is_available():
194
+ return False
195
+ if not dist.is_initialized():
196
+ return False
197
+ return True
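`nested_tensor_from_tensor_list` pads a list of CHW images to a shared shape and returns the padded batch together with a boolean padding mask. A minimal sketch, again assuming the extension root is on `PYTHONPATH`:

```python
# Sketch only: batch two different-sized CHW images into a NestedTensor.
import torch

from annotator.oneformer.oneformer.utils.misc import nested_tensor_from_tensor_list

imgs = [torch.rand(3, 32, 40), torch.rand(3, 48, 36)]
nested = nested_tensor_from_tensor_list(imgs)
padded, mask = nested.decompose()
print(padded.shape)  # torch.Size([2, 3, 48, 40]) -- padded to the per-dimension maximum
print(mask.shape)    # torch.Size([2, 48, 40]) -- True marks padded pixels
```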
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/pos_embed.py ADDED
@@ -0,0 +1,122 @@
1
+ # --------------------------------------------------------
2
+ # Position embedding utils
3
+ # --------------------------------------------------------
4
+
5
+ from typing import Tuple
6
+
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ # --------------------------------------------------------
12
+ # 2D sine-cosine position embedding
13
+ # References:
14
+ # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
15
+ # MoCo v3: https://github.com/facebookresearch/moco-v3
16
+ # --------------------------------------------------------
17
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
18
+ """
19
+ grid_size: int of the grid height and width
20
+ return:
21
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
22
+ """
23
+ grid_h = np.arange(grid_size, dtype=np.float32)
24
+ grid_w = np.arange(grid_size, dtype=np.float32)
25
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
26
+ grid = np.stack(grid, axis=0)
27
+
28
+ grid = grid.reshape([2, 1, grid_size, grid_size])
29
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
30
+ if cls_token:
31
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
32
+ return pos_embed
33
+
34
+
35
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
36
+ assert embed_dim % 2 == 0
37
+
38
+ # use half of dimensions to encode grid_h
39
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
40
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
41
+
42
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
43
+ return emb
44
+
45
+
46
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
47
+ """
48
+ embed_dim: output dimension for each position
49
+ pos: a list of positions to be encoded: size (M,)
50
+ out: (M, D)
51
+ """
52
+ assert embed_dim % 2 == 0
53
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
54
+ omega /= embed_dim / 2.0
55
+ omega = 1.0 / 10000 ** omega # (D/2,)
56
+
57
+ pos = pos.reshape(-1) # (M,)
58
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
59
+
60
+ emb_sin = np.sin(out) # (M, D/2)
61
+ emb_cos = np.cos(out) # (M, D/2)
62
+
63
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
64
+ return emb
65
+
66
+
67
+ # --------------------------------------------------------
68
+ # Interpolate position embeddings for high-resolution
69
+ # References:
70
+ # DeiT: https://github.com/facebookresearch/deit
71
+ # --------------------------------------------------------
72
+ def interpolate_pos_embed(model, checkpoint_model, pos_embed_key):
73
+ if pos_embed_key in checkpoint_model:
74
+ pos_embed_checkpoint = checkpoint_model[pos_embed_key]
75
+ embedding_size = pos_embed_checkpoint.shape[-1]
76
+ num_patches = model.num_patches
77
+ if pos_embed_key.startswith("decoder"):
78
+ num_extra_tokens = model.decoder_pos_embed.shape[-2] - num_patches
79
+ else:
80
+ num_extra_tokens = model.pos_embed.shape[-2] - num_patches
81
+ # height (== width) for the checkpoint position embedding
82
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
83
+ # height (== width) for the new position embedding
84
+ new_size = int(num_patches ** 0.5)
85
+ # class_token and dist_token are kept unchanged
86
+ if orig_size != new_size:
87
+ print(
88
+ "Position interpolate from %dx%d to %dx%d"
89
+ % (orig_size, orig_size, new_size, new_size)
90
+ )
91
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
92
+ # only the position tokens are interpolated
93
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
94
+ pos_tokens = pos_tokens.reshape(
95
+ -1, orig_size, orig_size, embedding_size
96
+ ).permute(0, 3, 1, 2)
97
+ pos_tokens = torch.nn.functional.interpolate(
98
+ pos_tokens,
99
+ size=(new_size, new_size),
100
+ mode="bicubic",
101
+ align_corners=False,
102
+ )
103
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
104
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
105
+ checkpoint_model[pos_embed_key] = new_pos_embed
106
+
107
+
108
+ def interpolate_pos_embed_online(
109
+ pos_embed, orig_size: Tuple[int], new_size: Tuple[int], num_extra_tokens: int
110
+ ):
111
+ extra_tokens = pos_embed[:, :num_extra_tokens]
112
+ pos_tokens = pos_embed[:, num_extra_tokens:]
113
+ embedding_size = pos_tokens.shape[-1]
114
+ pos_tokens = pos_tokens.reshape(
115
+ -1, orig_size[0], orig_size[1], embedding_size
116
+ ).permute(0, 3, 1, 2)
117
+ pos_tokens = torch.nn.functional.interpolate(
118
+ pos_tokens, size=new_size, mode="bicubic", align_corners=False,
119
+ )
120
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
121
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
122
+ return new_pos_embed
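`get_2d_sincos_pos_embed` builds a fixed sin-cos embedding for a square grid, and `interpolate_pos_embed_online` resizes the patch tokens while leaving the extra (cls) tokens untouched. A hedged sketch of the two together; the 14-to-16 grid sizes are chosen only for illustration:

```python
# Sketch only: build a 2D sin-cos position embedding, then resize it to a larger grid.
import torch

from annotator.oneformer.oneformer.utils.pos_embed import (
    get_2d_sincos_pos_embed,
    interpolate_pos_embed_online,
)

embed_dim, grid = 64, 14
pos = get_2d_sincos_pos_embed(embed_dim, grid, cls_token=True)  # (1 + 14*14, 64) numpy array

# Interpolate only the patch tokens (skipping the single cls token) to a 16x16 grid.
pos_t = torch.from_numpy(pos).unsqueeze(0).float()  # (1, 197, 64)
resized = interpolate_pos_embed_online(pos_t, (grid, grid), (16, 16), num_extra_tokens=1)
print(resized.shape)  # torch.Size([1, 257, 64])
```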
extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __author__ = 'tylin'
extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/coco.py ADDED
@@ -0,0 +1,444 @@
1
+ __author__ = 'tylin'
2
+ __version__ = '2.0'
3
+ # Interface for accessing the Microsoft COCO dataset.
4
+
5
+ # Microsoft COCO is a large image dataset designed for object detection,
6
+ # segmentation, and caption generation. annotator.oneformer.pycocotools is a Python API that
7
+ # assists in loading, parsing and visualizing the annotations in COCO.
8
+ # Please visit http://mscoco.org/ for more information on COCO, including
9
+ # for the data, paper, and tutorials. The exact format of the annotations
10
+ # is also described on the COCO website. For example usage of the annotator.oneformer.pycocotools
11
+ # please see annotator.oneformer.pycocotools_demo.ipynb. In addition to this API, please download both
12
+ # the COCO images and annotations in order to run the demo.
13
+
14
+ # An alternative to using the API is to load the annotations directly
15
+ # into a Python dictionary.
16
+ # Using the API provides additional utility functions. Note that this API
17
+ # supports both *instance* and *caption* annotations. In the case of
18
+ # captions not all functions are defined (e.g. categories are undefined).
19
+
20
+ # The following API functions are defined:
21
+ # COCO - COCO api class that loads COCO annotation file and prepare data structures.
22
+ # decodeMask - Decode binary mask M encoded via run-length encoding.
23
+ # encodeMask - Encode binary mask M using run-length encoding.
24
+ # getAnnIds - Get ann ids that satisfy given filter conditions.
25
+ # getCatIds - Get cat ids that satisfy given filter conditions.
26
+ # getImgIds - Get img ids that satisfy given filter conditions.
27
+ # loadAnns - Load anns with the specified ids.
28
+ # loadCats - Load cats with the specified ids.
29
+ # loadImgs - Load imgs with the specified ids.
30
+ # annToMask - Convert segmentation in an annotation to binary mask.
31
+ # showAnns - Display the specified annotations.
32
+ # loadRes - Load algorithm results and create API for accessing them.
33
+ # download - Download COCO images from mscoco.org server.
34
+ # Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
35
+ # Help on each function can be accessed by: "help COCO>function".
36
+
37
+ # See also COCO>decodeMask,
38
+ # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
39
+ # COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
40
+ # COCO>loadImgs, COCO>annToMask, COCO>showAnns
41
+
42
+ # Microsoft COCO Toolbox. version 2.0
43
+ # Data, paper, and tutorials available at: http://mscoco.org/
44
+ # Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
45
+ # Licensed under the Simplified BSD License [see bsd.txt]
46
+
47
+ import json
48
+ import time
49
+ import numpy as np
50
+ import copy
51
+ import itertools
52
+ from . import mask as maskUtils
53
+ import os
54
+ from collections import defaultdict
55
+ import sys
56
+ PYTHON_VERSION = sys.version_info[0]
57
+ if PYTHON_VERSION == 2:
58
+ from urllib import urlretrieve
59
+ elif PYTHON_VERSION == 3:
60
+ from urllib.request import urlretrieve
61
+
62
+
63
+ def _isArrayLike(obj):
64
+ return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
65
+
66
+
67
+ class COCO:
68
+ def __init__(self, annotation_file=None):
69
+ """
70
+ Constructor of Microsoft COCO helper class for reading and visualizing annotations.
71
+ :param annotation_file (str): location of annotation file
72
+ :param image_folder (str): location to the folder that hosts images.
73
+ :return:
74
+ """
75
+ # load dataset
76
+ self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
77
+ self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
78
+ if not annotation_file == None:
79
+ print('loading annotations into memory...')
80
+ tic = time.time()
81
+ with open(annotation_file, 'r') as f:
82
+ dataset = json.load(f)
83
+ assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
84
+ print('Done (t={:0.2f}s)'.format(time.time()- tic))
85
+ self.dataset = dataset
86
+ self.createIndex()
87
+
88
+ def createIndex(self):
89
+ # create index
90
+ print('creating index...')
91
+ anns, cats, imgs = {}, {}, {}
92
+ imgToAnns,catToImgs = defaultdict(list),defaultdict(list)
93
+ if 'annotations' in self.dataset:
94
+ for ann in self.dataset['annotations']:
95
+ imgToAnns[ann['image_id']].append(ann)
96
+ anns[ann['id']] = ann
97
+
98
+ if 'images' in self.dataset:
99
+ for img in self.dataset['images']:
100
+ imgs[img['id']] = img
101
+
102
+ if 'categories' in self.dataset:
103
+ for cat in self.dataset['categories']:
104
+ cats[cat['id']] = cat
105
+
106
+ if 'annotations' in self.dataset and 'categories' in self.dataset:
107
+ for ann in self.dataset['annotations']:
108
+ catToImgs[ann['category_id']].append(ann['image_id'])
109
+
110
+ print('index created!')
111
+
112
+ # create class members
113
+ self.anns = anns
114
+ self.imgToAnns = imgToAnns
115
+ self.catToImgs = catToImgs
116
+ self.imgs = imgs
117
+ self.cats = cats
118
+
119
+ def info(self):
120
+ """
121
+ Print information about the annotation file.
122
+ :return:
123
+ """
124
+ for key, value in self.dataset['info'].items():
125
+ print('{}: {}'.format(key, value))
126
+
127
+ def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
128
+ """
129
+ Get ann ids that satisfy given filter conditions. default skips that filter
130
+ :param imgIds (int array) : get anns for given imgs
131
+ catIds (int array) : get anns for given cats
132
+ areaRng (float array) : get anns for given area range (e.g. [0 inf])
133
+ iscrowd (boolean) : get anns for given crowd label (False or True)
134
+ :return: ids (int array) : integer array of ann ids
135
+ """
136
+ imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
137
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
138
+
139
+ if len(imgIds) == len(catIds) == len(areaRng) == 0:
140
+ anns = self.dataset['annotations']
141
+ else:
142
+ if not len(imgIds) == 0:
143
+ lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
144
+ anns = list(itertools.chain.from_iterable(lists))
145
+ else:
146
+ anns = self.dataset['annotations']
147
+ anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
148
+ anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
149
+ if not iscrowd == None:
150
+ ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
151
+ else:
152
+ ids = [ann['id'] for ann in anns]
153
+ return ids
154
+
155
+ def getCatIds(self, catNms=[], supNms=[], catIds=[]):
156
+ """
157
+ Get cat ids that satisfy given filter conditions. default skips that filter.
158
+ :param catNms (str array) : get cats for given cat names
159
+ :param supNms (str array) : get cats for given supercategory names
160
+ :param catIds (int array) : get cats for given cat ids
161
+ :return: ids (int array) : integer array of cat ids
162
+ """
163
+ catNms = catNms if _isArrayLike(catNms) else [catNms]
164
+ supNms = supNms if _isArrayLike(supNms) else [supNms]
165
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
166
+
167
+ if len(catNms) == len(supNms) == len(catIds) == 0:
168
+ cats = self.dataset['categories']
169
+ else:
170
+ cats = self.dataset['categories']
171
+ cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
172
+ cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
173
+ cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
174
+ ids = [cat['id'] for cat in cats]
175
+ return ids
176
+
177
+ def getImgIds(self, imgIds=[], catIds=[]):
178
+ '''
179
+ Get img ids that satisfy given filter conditions.
180
+ :param imgIds (int array) : get imgs for given ids
181
+ :param catIds (int array) : get imgs with all given cats
182
+ :return: ids (int array) : integer array of img ids
183
+ '''
184
+ imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
185
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
186
+
187
+ if len(imgIds) == len(catIds) == 0:
188
+ ids = self.imgs.keys()
189
+ else:
190
+ ids = set(imgIds)
191
+ for i, catId in enumerate(catIds):
192
+ if i == 0 and len(ids) == 0:
193
+ ids = set(self.catToImgs[catId])
194
+ else:
195
+ ids &= set(self.catToImgs[catId])
196
+ return list(ids)
197
+
198
+ def loadAnns(self, ids=[]):
199
+ """
200
+ Load anns with the specified ids.
201
+ :param ids (int array) : integer ids specifying anns
202
+ :return: anns (object array) : loaded ann objects
203
+ """
204
+ if _isArrayLike(ids):
205
+ return [self.anns[id] for id in ids]
206
+ elif type(ids) == int:
207
+ return [self.anns[ids]]
208
+
209
+ def loadCats(self, ids=[]):
210
+ """
211
+ Load cats with the specified ids.
212
+ :param ids (int array) : integer ids specifying cats
213
+ :return: cats (object array) : loaded cat objects
214
+ """
215
+ if _isArrayLike(ids):
216
+ return [self.cats[id] for id in ids]
217
+ elif type(ids) == int:
218
+ return [self.cats[ids]]
219
+
220
+ def loadImgs(self, ids=[]):
221
+ """
222
+ Load imgs with the specified ids.
223
+ :param ids (int array) : integer ids specifying img
224
+ :return: imgs (object array) : loaded img objects
225
+ """
226
+ if _isArrayLike(ids):
227
+ return [self.imgs[id] for id in ids]
228
+ elif type(ids) == int:
229
+ return [self.imgs[ids]]
230
+
231
+ def showAnns(self, anns, draw_bbox=False):
232
+ """
233
+ Display the specified annotations.
234
+ :param anns (array of object): annotations to display
235
+ :return: None
236
+ """
237
+ if len(anns) == 0:
238
+ return 0
239
+ if 'segmentation' in anns[0] or 'keypoints' in anns[0]:
240
+ datasetType = 'instances'
241
+ elif 'caption' in anns[0]:
242
+ datasetType = 'captions'
243
+ else:
244
+ raise Exception('datasetType not supported')
245
+ if datasetType == 'instances':
246
+ import matplotlib.pyplot as plt
247
+ from matplotlib.collections import PatchCollection
248
+ from matplotlib.patches import Polygon
249
+
250
+ ax = plt.gca()
251
+ ax.set_autoscale_on(False)
252
+ polygons = []
253
+ color = []
254
+ for ann in anns:
255
+ c = (np.random.random((1, 3))*0.6+0.4).tolist()[0]
256
+ if 'segmentation' in ann:
257
+ if type(ann['segmentation']) == list:
258
+ # polygon
259
+ for seg in ann['segmentation']:
260
+ poly = np.array(seg).reshape((int(len(seg)/2), 2))
261
+ polygons.append(Polygon(poly))
262
+ color.append(c)
263
+ else:
264
+ # mask
265
+ t = self.imgs[ann['image_id']]
266
+ if type(ann['segmentation']['counts']) == list:
267
+ rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width'])
268
+ else:
269
+ rle = [ann['segmentation']]
270
+ m = maskUtils.decode(rle)
271
+ img = np.ones( (m.shape[0], m.shape[1], 3) )
272
+ if ann['iscrowd'] == 1:
273
+ color_mask = np.array([2.0,166.0,101.0])/255
274
+ if ann['iscrowd'] == 0:
275
+ color_mask = np.random.random((1, 3)).tolist()[0]
276
+ for i in range(3):
277
+ img[:,:,i] = color_mask[i]
278
+ ax.imshow(np.dstack( (img, m*0.5) ))
279
+ if 'keypoints' in ann and type(ann['keypoints']) == list:
280
+ # turn skeleton into zero-based index
281
+ sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1
282
+ kp = np.array(ann['keypoints'])
283
+ x = kp[0::3]
284
+ y = kp[1::3]
285
+ v = kp[2::3]
286
+ for sk in sks:
287
+ if np.all(v[sk]>0):
288
+ plt.plot(x[sk],y[sk], linewidth=3, color=c)
289
+ plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2)
290
+ plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2)
291
+
292
+ if draw_bbox:
293
+ [bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox']
294
+ poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
295
+ np_poly = np.array(poly).reshape((4,2))
296
+ polygons.append(Polygon(np_poly))
297
+ color.append(c)
298
+
299
+ p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4)
300
+ ax.add_collection(p)
301
+ p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2)
302
+ ax.add_collection(p)
303
+ elif datasetType == 'captions':
304
+ for ann in anns:
305
+ print(ann['caption'])
306
+
307
+ def loadRes(self, resFile):
308
+ """
309
+ Load result file and return a result api object.
310
+ :param resFile (str) : file name of result file
311
+ :return: res (obj) : result api object
312
+ """
313
+ res = COCO()
314
+ res.dataset['images'] = [img for img in self.dataset['images']]
315
+
316
+ print('Loading and preparing results...')
317
+ tic = time.time()
318
+ if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):
319
+ with open(resFile) as f:
320
+ anns = json.load(f)
321
+ elif type(resFile) == np.ndarray:
322
+ anns = self.loadNumpyAnnotations(resFile)
323
+ else:
324
+ anns = resFile
325
+ assert type(anns) == list, 'results is not an array of objects'
326
+ annsImgIds = [ann['image_id'] for ann in anns]
327
+ assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
328
+ 'Results do not correspond to current coco set'
329
+ if 'caption' in anns[0]:
330
+ imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
331
+ res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
332
+ for id, ann in enumerate(anns):
333
+ ann['id'] = id+1
334
+ elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
335
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
336
+ for id, ann in enumerate(anns):
337
+ bb = ann['bbox']
338
+ x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]]
339
+ if not 'segmentation' in ann:
340
+ ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
341
+ ann['area'] = bb[2]*bb[3]
342
+ ann['id'] = id+1
343
+ ann['iscrowd'] = 0
344
+ elif 'segmentation' in anns[0]:
345
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
346
+ for id, ann in enumerate(anns):
347
+ # now only support compressed RLE format as segmentation results
348
+ ann['area'] = maskUtils.area(ann['segmentation'])
349
+ if not 'bbox' in ann:
350
+ ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
351
+ ann['id'] = id+1
352
+ ann['iscrowd'] = 0
353
+ elif 'keypoints' in anns[0]:
354
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
355
+ for id, ann in enumerate(anns):
356
+ s = ann['keypoints']
357
+ x = s[0::3]
358
+ y = s[1::3]
359
+ x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y)
360
+ ann['area'] = (x1-x0)*(y1-y0)
361
+ ann['id'] = id + 1
362
+ ann['bbox'] = [x0,y0,x1-x0,y1-y0]
363
+ print('DONE (t={:0.2f}s)'.format(time.time()- tic))
364
+
365
+ res.dataset['annotations'] = anns
366
+ res.createIndex()
367
+ return res
368
+
369
+ def download(self, tarDir = None, imgIds = [] ):
370
+ '''
371
+ Download COCO images from mscoco.org server.
372
+ :param tarDir (str): COCO results directory name
373
+ imgIds (list): images to be downloaded
374
+ :return:
375
+ '''
376
+ if tarDir is None:
377
+ print('Please specify target directory')
378
+ return -1
379
+ if len(imgIds) == 0:
380
+ imgs = self.imgs.values()
381
+ else:
382
+ imgs = self.loadImgs(imgIds)
383
+ N = len(imgs)
384
+ if not os.path.exists(tarDir):
385
+ os.makedirs(tarDir)
386
+ for i, img in enumerate(imgs):
387
+ tic = time.time()
388
+ fname = os.path.join(tarDir, img['file_name'])
389
+ if not os.path.exists(fname):
390
+ urlretrieve(img['coco_url'], fname)
391
+ print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
392
+
393
+ def loadNumpyAnnotations(self, data):
394
+ """
395
+ Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class}
396
+ :param data (numpy.ndarray)
397
+ :return: annotations (python nested list)
398
+ """
399
+ print('Converting ndarray to lists...')
400
+ assert(type(data) == np.ndarray)
401
+ print(data.shape)
402
+ assert(data.shape[1] == 7)
403
+ N = data.shape[0]
404
+ ann = []
405
+ for i in range(N):
406
+ if i % 1000000 == 0:
407
+ print('{}/{}'.format(i,N))
408
+ ann += [{
409
+ 'image_id' : int(data[i, 0]),
410
+ 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
411
+ 'score' : data[i, 5],
412
+ 'category_id': int(data[i, 6]),
413
+ }]
414
+ return ann
415
+
416
+ def annToRLE(self, ann):
417
+ """
418
+ Convert annotation which can be polygons, uncompressed RLE to RLE.
419
+ :return: rle (run-length encoded mask)
420
+ """
421
+ t = self.imgs[ann['image_id']]
422
+ h, w = t['height'], t['width']
423
+ segm = ann['segmentation']
424
+ if type(segm) == list:
425
+ # polygon -- a single object might consist of multiple parts
426
+ # we merge all parts into one mask rle code
427
+ rles = maskUtils.frPyObjects(segm, h, w)
428
+ rle = maskUtils.merge(rles)
429
+ elif type(segm['counts']) == list:
430
+ # uncompressed RLE
431
+ rle = maskUtils.frPyObjects(segm, h, w)
432
+ else:
433
+ # rle
434
+ rle = ann['segmentation']
435
+ return rle
436
+
437
+ def annToMask(self, ann):
438
+ """
439
+ Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
440
+ :return: binary mask (numpy 2D array)
441
+ """
442
+ rle = self.annToRLE(ann)
443
+ m = maskUtils.decode(rle)
444
+ return m
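The vendored COCO class keeps the upstream query API (getCatIds / getImgIds / getAnnIds / loadAnns). Below is a minimal sketch of a lookup round-trip; `instances_val2017.json` is a placeholder for any COCO-format annotation file you supply yourself:

```python
# Sketch only: query a COCO-format annotation file with the vendored API.
# "instances_val2017.json" is a placeholder path, not shipped with the extension.
from annotator.oneformer.pycocotools.coco import COCO

coco = COCO("instances_val2017.json")
cat_ids = coco.getCatIds(catNms=["person"])
img_ids = coco.getImgIds(catIds=cat_ids)
ann_ids = coco.getAnnIds(imgIds=img_ids[:1], catIds=cat_ids, iscrowd=None)
anns = coco.loadAnns(ann_ids)
print(len(img_ids), len(anns), coco.loadCats(cat_ids)[0]["name"])
```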
extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/cocoeval.py ADDED
@@ -0,0 +1,534 @@
1
+ __author__ = 'tsungyi'
2
+
3
+ import numpy as np
4
+ import datetime
5
+ import time
6
+ from collections import defaultdict
7
+ from . import mask as maskUtils
8
+ import copy
9
+
10
+ class COCOeval:
11
+ # Interface for evaluating detection on the Microsoft COCO dataset.
12
+ #
13
+ # The usage for CocoEval is as follows:
14
+ # cocoGt=..., cocoDt=... # load dataset and results
15
+ # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object
16
+ # E.params.recThrs = ...; # set parameters as desired
17
+ # E.evaluate(); # run per image evaluation
18
+ # E.accumulate(); # accumulate per image results
19
+ # E.summarize(); # display summary metrics of results
20
+ # For example usage see evalDemo.m and http://mscoco.org/.
21
+ #
22
+ # The evaluation parameters are as follows (defaults in brackets):
23
+ # imgIds - [all] N img ids to use for evaluation
24
+ # catIds - [all] K cat ids to use for evaluation
25
+ # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
26
+ # recThrs - [0:.01:1] R=101 recall thresholds for evaluation
27
+ # areaRng - [...] A=4 object area ranges for evaluation
28
+ # maxDets - [1 10 100] M=3 thresholds on max detections per image
29
+ # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
30
+ # iouType replaced the now DEPRECATED useSegm parameter.
31
+ # useCats - [1] if true use category labels for evaluation
32
+ # Note: if useCats=0 category labels are ignored as in proposal scoring.
33
+ # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
34
+ #
35
+ # evaluate(): evaluates detections on every image and every category and
36
+ # concats the results into the "evalImgs" with fields:
37
+ # dtIds - [1xD] id for each of the D detections (dt)
38
+ # gtIds - [1xG] id for each of the G ground truths (gt)
39
+ # dtMatches - [TxD] matching gt id at each IoU or 0
40
+ # gtMatches - [TxG] matching dt id at each IoU or 0
41
+ # dtScores - [1xD] confidence of each dt
42
+ # gtIgnore - [1xG] ignore flag for each gt
43
+ # dtIgnore - [TxD] ignore flag for each dt at each IoU
44
+ #
45
+ # accumulate(): accumulates the per-image, per-category evaluation
46
+ # results in "evalImgs" into the dictionary "eval" with fields:
47
+ # params - parameters used for evaluation
48
+ # date - date evaluation was performed
49
+ # counts - [T,R,K,A,M] parameter dimensions (see above)
50
+ # precision - [TxRxKxAxM] precision for every evaluation setting
51
+ # recall - [TxKxAxM] max recall for every evaluation setting
52
+ # Note: precision and recall==-1 for settings with no gt objects.
53
+ #
54
+ # See also coco, mask, pycocoDemo, pycocoEvalDemo
55
+ #
56
+ # Microsoft COCO Toolbox. version 2.0
57
+ # Data, paper, and tutorials available at: http://mscoco.org/
58
+ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
59
+ # Licensed under the Simplified BSD License [see coco/license.txt]
60
+ def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
61
+ '''
62
+ Initialize CocoEval using coco APIs for gt and dt
63
+ :param cocoGt: coco object with ground truth annotations
64
+ :param cocoDt: coco object with detection results
65
+ :return: None
66
+ '''
67
+ if not iouType:
68
+ print('iouType not specified. use default iouType segm')
69
+ self.cocoGt = cocoGt # ground truth COCO API
70
+ self.cocoDt = cocoDt # detections COCO API
71
+ self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements
72
+ self.eval = {} # accumulated evaluation results
73
+ self._gts = defaultdict(list) # gt for evaluation
74
+ self._dts = defaultdict(list) # dt for evaluation
75
+ self.params = Params(iouType=iouType) # parameters
76
+ self._paramsEval = {} # parameters for evaluation
77
+ self.stats = [] # result summarization
78
+ self.ious = {} # ious between all gts and dts
79
+ if not cocoGt is None:
80
+ self.params.imgIds = sorted(cocoGt.getImgIds())
81
+ self.params.catIds = sorted(cocoGt.getCatIds())
82
+
83
+
84
+ def _prepare(self):
85
+ '''
86
+ Prepare ._gts and ._dts for evaluation based on params
87
+ :return: None
88
+ '''
89
+ def _toMask(anns, coco):
90
+ # modify ann['segmentation'] by reference
91
+ for ann in anns:
92
+ rle = coco.annToRLE(ann)
93
+ ann['segmentation'] = rle
94
+ p = self.params
95
+ if p.useCats:
96
+ gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
97
+ dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
98
+ else:
99
+ gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
100
+ dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
101
+
102
+ # convert ground truth to mask if iouType == 'segm'
103
+ if p.iouType == 'segm':
104
+ _toMask(gts, self.cocoGt)
105
+ _toMask(dts, self.cocoDt)
106
+ # set ignore flag
107
+ for gt in gts:
108
+ gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
109
+ gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
110
+ if p.iouType == 'keypoints':
111
+ gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
112
+ self._gts = defaultdict(list) # gt for evaluation
113
+ self._dts = defaultdict(list) # dt for evaluation
114
+ for gt in gts:
115
+ self._gts[gt['image_id'], gt['category_id']].append(gt)
116
+ for dt in dts:
117
+ self._dts[dt['image_id'], dt['category_id']].append(dt)
118
+ self.evalImgs = defaultdict(list) # per-image per-category evaluation results
119
+ self.eval = {} # accumulated evaluation results
120
+
121
+ def evaluate(self):
122
+ '''
123
+ Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
124
+ :return: None
125
+ '''
126
+ tic = time.time()
127
+ print('Running per image evaluation...')
128
+ p = self.params
129
+ # add backward compatibility if useSegm is specified in params
130
+ if not p.useSegm is None:
131
+ p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
132
+ print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
133
+ print('Evaluate annotation type *{}*'.format(p.iouType))
134
+ p.imgIds = list(np.unique(p.imgIds))
135
+ if p.useCats:
136
+ p.catIds = list(np.unique(p.catIds))
137
+ p.maxDets = sorted(p.maxDets)
138
+ self.params=p
139
+
140
+ self._prepare()
141
+ # loop through images, area range, max detection number
142
+ catIds = p.catIds if p.useCats else [-1]
143
+
144
+ if p.iouType == 'segm' or p.iouType == 'bbox':
145
+ computeIoU = self.computeIoU
146
+ elif p.iouType == 'keypoints':
147
+ computeIoU = self.computeOks
148
+ self.ious = {(imgId, catId): computeIoU(imgId, catId) \
149
+ for imgId in p.imgIds
150
+ for catId in catIds}
151
+
152
+ evaluateImg = self.evaluateImg
153
+ maxDet = p.maxDets[-1]
154
+ self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet)
155
+ for catId in catIds
156
+ for areaRng in p.areaRng
157
+ for imgId in p.imgIds
158
+ ]
159
+ self._paramsEval = copy.deepcopy(self.params)
160
+ toc = time.time()
161
+ print('DONE (t={:0.2f}s).'.format(toc-tic))
162
+
163
+ def computeIoU(self, imgId, catId):
164
+ p = self.params
165
+ if p.useCats:
166
+ gt = self._gts[imgId,catId]
167
+ dt = self._dts[imgId,catId]
168
+ else:
169
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]]
170
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]]
171
+ if len(gt) == 0 and len(dt) ==0:
172
+ return []
173
+ inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
174
+ dt = [dt[i] for i in inds]
175
+ if len(dt) > p.maxDets[-1]:
176
+ dt=dt[0:p.maxDets[-1]]
177
+
178
+ if p.iouType == 'segm':
179
+ g = [g['segmentation'] for g in gt]
180
+ d = [d['segmentation'] for d in dt]
181
+ elif p.iouType == 'bbox':
182
+ g = [g['bbox'] for g in gt]
183
+ d = [d['bbox'] for d in dt]
184
+ else:
185
+ raise Exception('unknown iouType for iou computation')
186
+
187
+ # compute iou between each dt and gt region
188
+ iscrowd = [int(o['iscrowd']) for o in gt]
189
+ ious = maskUtils.iou(d,g,iscrowd)
190
+ return ious
191
+
192
+ def computeOks(self, imgId, catId):
193
+ p = self.params
194
+ # dimension here should be Nxm
195
+ gts = self._gts[imgId, catId]
196
+ dts = self._dts[imgId, catId]
197
+ inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
198
+ dts = [dts[i] for i in inds]
199
+ if len(dts) > p.maxDets[-1]:
200
+ dts = dts[0:p.maxDets[-1]]
201
+ # if len(gts) == 0 and len(dts) == 0:
202
+ if len(gts) == 0 or len(dts) == 0:
203
+ return []
204
+ ious = np.zeros((len(dts), len(gts)))
205
+ sigmas = p.kpt_oks_sigmas
206
+ vars = (sigmas * 2)**2
207
+ k = len(sigmas)
208
+ # compute oks between each detection and ground truth object
209
+ for j, gt in enumerate(gts):
210
+ # create bounds for ignore regions(double the gt bbox)
211
+ g = np.array(gt['keypoints'])
212
+ xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
213
+ k1 = np.count_nonzero(vg > 0)
214
+ bb = gt['bbox']
215
+ x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
216
+ y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
217
+ for i, dt in enumerate(dts):
218
+ d = np.array(dt['keypoints'])
219
+ xd = d[0::3]; yd = d[1::3]
220
+ if k1>0:
221
+ # measure the per-keypoint distance if keypoints visible
222
+ dx = xd - xg
223
+ dy = yd - yg
224
+ else:
225
+ # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
226
+ z = np.zeros((k))
227
+ dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0)
228
+ dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0)
229
+ e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2
230
+ if k1 > 0:
231
+ e=e[vg > 0]
232
+ ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
233
+ return ious
234
+
235
+ def evaluateImg(self, imgId, catId, aRng, maxDet):
236
+ '''
237
+ perform evaluation for single category and image
238
+ :return: dict (single image results)
239
+ '''
240
+ p = self.params
241
+ if p.useCats:
242
+ gt = self._gts[imgId,catId]
243
+ dt = self._dts[imgId,catId]
244
+ else:
245
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]]
246
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]]
247
+ if len(gt) == 0 and len(dt) ==0:
248
+ return None
249
+
250
+ for g in gt:
251
+ if g['ignore'] or (g['area']<aRng[0] or g['area']>aRng[1]):
252
+ g['_ignore'] = 1
253
+ else:
254
+ g['_ignore'] = 0
255
+
256
+ # sort dt highest score first, sort gt ignore last
257
+ gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
258
+ gt = [gt[i] for i in gtind]
259
+ dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
260
+ dt = [dt[i] for i in dtind[0:maxDet]]
261
+ iscrowd = [int(o['iscrowd']) for o in gt]
262
+ # load computed ious
263
+ ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId]
264
+
265
+ T = len(p.iouThrs)
266
+ G = len(gt)
267
+ D = len(dt)
268
+ gtm = np.zeros((T,G))
269
+ dtm = np.zeros((T,D))
270
+ gtIg = np.array([g['_ignore'] for g in gt])
271
+ dtIg = np.zeros((T,D))
272
+ if not len(ious)==0:
273
+ for tind, t in enumerate(p.iouThrs):
274
+ for dind, d in enumerate(dt):
275
+ # information about best match so far (m=-1 -> unmatched)
276
+ iou = min([t,1-1e-10])
277
+ m = -1
278
+ for gind, g in enumerate(gt):
279
+ # if this gt already matched, and not a crowd, continue
280
+ if gtm[tind,gind]>0 and not iscrowd[gind]:
281
+ continue
282
+ # if dt matched to reg gt, and on ignore gt, stop
283
+ if m>-1 and gtIg[m]==0 and gtIg[gind]==1:
284
+ break
285
+ # continue to next gt unless better match made
286
+ if ious[dind,gind] < iou:
287
+ continue
288
+ # if match successful and best so far, store appropriately
289
+ iou=ious[dind,gind]
290
+ m=gind
291
+ # if match made store id of match for both dt and gt
292
+ if m ==-1:
293
+ continue
294
+ dtIg[tind,dind] = gtIg[m]
295
+ dtm[tind,dind] = gt[m]['id']
296
+ gtm[tind,m] = d['id']
297
+ # set unmatched detections outside of area range to ignore
298
+ a = np.array([d['area']<aRng[0] or d['area']>aRng[1] for d in dt]).reshape((1, len(dt)))
299
+ dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0)))
300
+ # store results for given image and category
301
+ return {
302
+ 'image_id': imgId,
303
+ 'category_id': catId,
304
+ 'aRng': aRng,
305
+ 'maxDet': maxDet,
306
+ 'dtIds': [d['id'] for d in dt],
307
+ 'gtIds': [g['id'] for g in gt],
308
+ 'dtMatches': dtm,
309
+ 'gtMatches': gtm,
310
+ 'dtScores': [d['score'] for d in dt],
311
+ 'gtIgnore': gtIg,
312
+ 'dtIgnore': dtIg,
313
+ }
314
+
315
+ def accumulate(self, p = None):
316
+ '''
317
+ Accumulate per image evaluation results and store the result in self.eval
318
+ :param p: input params for evaluation
319
+ :return: None
320
+ '''
321
+ print('Accumulating evaluation results...')
322
+ tic = time.time()
323
+ if not self.evalImgs:
324
+ print('Please run evaluate() first')
325
+ # allows input customized parameters
326
+ if p is None:
327
+ p = self.params
328
+ p.catIds = p.catIds if p.useCats == 1 else [-1]
329
+ T = len(p.iouThrs)
330
+ R = len(p.recThrs)
331
+ K = len(p.catIds) if p.useCats else 1
332
+ A = len(p.areaRng)
333
+ M = len(p.maxDets)
334
+ precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
335
+ recall = -np.ones((T,K,A,M))
336
+ scores = -np.ones((T,R,K,A,M))
337
+
338
+ # create dictionary for future indexing
339
+ _pe = self._paramsEval
340
+ catIds = _pe.catIds if _pe.useCats else [-1]
341
+ setK = set(catIds)
342
+ setA = set(map(tuple, _pe.areaRng))
343
+ setM = set(_pe.maxDets)
344
+ setI = set(_pe.imgIds)
345
+ # get inds to evaluate
346
+ k_list = [n for n, k in enumerate(p.catIds) if k in setK]
347
+ m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
348
+ a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
349
+ i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
350
+ I0 = len(_pe.imgIds)
351
+ A0 = len(_pe.areaRng)
352
+ # retrieve E at each category, area range, and max number of detections
353
+ for k, k0 in enumerate(k_list):
354
+ Nk = k0*A0*I0
355
+ for a, a0 in enumerate(a_list):
356
+ Na = a0*I0
357
+ for m, maxDet in enumerate(m_list):
358
+ E = [self.evalImgs[Nk + Na + i] for i in i_list]
359
+ E = [e for e in E if not e is None]
360
+ if len(E) == 0:
361
+ continue
362
+ dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])
363
+
364
+ # different sorting method generates slightly different results.
365
+ # mergesort is used to be consistent as Matlab implementation.
366
+ inds = np.argsort(-dtScores, kind='mergesort')
367
+ dtScoresSorted = dtScores[inds]
368
+
369
+ dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
370
+ dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
371
+ gtIg = np.concatenate([e['gtIgnore'] for e in E])
372
+ npig = np.count_nonzero(gtIg==0 )
373
+ if npig == 0:
374
+ continue
375
+ tps = np.logical_and( dtm, np.logical_not(dtIg) )
376
+ fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
377
+
378
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
379
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
380
+ for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
381
+ tp = np.array(tp)
382
+ fp = np.array(fp)
383
+ nd = len(tp)
384
+ rc = tp / npig
385
+ pr = tp / (fp+tp+np.spacing(1))
386
+ q = np.zeros((R,))
387
+ ss = np.zeros((R,))
388
+
389
+ if nd:
390
+ recall[t,k,a,m] = rc[-1]
391
+ else:
392
+ recall[t,k,a,m] = 0
393
+
394
+ # numpy is slow without cython optimization for accessing elements
395
+ # use python array gets significant speed improvement
396
+ pr = pr.tolist(); q = q.tolist()
397
+
398
+ for i in range(nd-1, 0, -1):
399
+ if pr[i] > pr[i-1]:
400
+ pr[i-1] = pr[i]
401
+
402
+ inds = np.searchsorted(rc, p.recThrs, side='left')
403
+ try:
404
+ for ri, pi in enumerate(inds):
405
+ q[ri] = pr[pi]
406
+ ss[ri] = dtScoresSorted[pi]
407
+ except:
408
+ pass
409
+ precision[t,:,k,a,m] = np.array(q)
410
+ scores[t,:,k,a,m] = np.array(ss)
411
+ self.eval = {
412
+ 'params': p,
413
+ 'counts': [T, R, K, A, M],
414
+ 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
415
+ 'precision': precision,
416
+ 'recall': recall,
417
+ 'scores': scores,
418
+ }
419
+ toc = time.time()
420
+ print('DONE (t={:0.2f}s).'.format( toc-tic))
421
+
422
+ def summarize(self):
423
+ '''
424
+ Compute and display summary metrics for evaluation results.
425
+ Note this function can *only* be applied on the default parameter setting
426
+ '''
427
+ def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
428
+ p = self.params
429
+ iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
430
+ titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
431
+ typeStr = '(AP)' if ap==1 else '(AR)'
432
+ iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
433
+ if iouThr is None else '{:0.2f}'.format(iouThr)
434
+
435
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
436
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
437
+ if ap == 1:
438
+ # dimension of precision: [TxRxKxAxM]
439
+ s = self.eval['precision']
440
+ # IoU
441
+ if iouThr is not None:
442
+ t = np.where(iouThr == p.iouThrs)[0]
443
+ s = s[t]
444
+ s = s[:,:,:,aind,mind]
445
+ else:
446
+ # dimension of recall: [TxKxAxM]
447
+ s = self.eval['recall']
448
+ if iouThr is not None:
449
+ t = np.where(iouThr == p.iouThrs)[0]
450
+ s = s[t]
451
+ s = s[:,:,aind,mind]
452
+ if len(s[s>-1])==0:
453
+ mean_s = -1
454
+ else:
455
+ mean_s = np.mean(s[s>-1])
456
+ print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
457
+ return mean_s
458
+ def _summarizeDets():
459
+ stats = np.zeros((12,))
460
+ stats[0] = _summarize(1)
461
+ stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
462
+ stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2])
463
+ stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2])
464
+ stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2])
465
+ stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2])
466
+ stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
467
+ stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
468
+ stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
469
+ stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2])
470
+ stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2])
471
+ stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2])
472
+ return stats
473
+ def _summarizeKps():
474
+ stats = np.zeros((10,))
475
+ stats[0] = _summarize(1, maxDets=20)
476
+ stats[1] = _summarize(1, maxDets=20, iouThr=.5)
477
+ stats[2] = _summarize(1, maxDets=20, iouThr=.75)
478
+ stats[3] = _summarize(1, maxDets=20, areaRng='medium')
479
+ stats[4] = _summarize(1, maxDets=20, areaRng='large')
480
+ stats[5] = _summarize(0, maxDets=20)
481
+ stats[6] = _summarize(0, maxDets=20, iouThr=.5)
482
+ stats[7] = _summarize(0, maxDets=20, iouThr=.75)
483
+ stats[8] = _summarize(0, maxDets=20, areaRng='medium')
484
+ stats[9] = _summarize(0, maxDets=20, areaRng='large')
485
+ return stats
486
+ if not self.eval:
487
+ raise Exception('Please run accumulate() first')
488
+ iouType = self.params.iouType
489
+ if iouType == 'segm' or iouType == 'bbox':
490
+ summarize = _summarizeDets
491
+ elif iouType == 'keypoints':
492
+ summarize = _summarizeKps
493
+ self.stats = summarize()
494
+
495
+ def __str__(self):
496
+ self.summarize()
497
+
498
+ class Params:
499
+ '''
500
+ Params for coco evaluation api
501
+ '''
502
+ def setDetParams(self):
503
+ self.imgIds = []
504
+ self.catIds = []
505
+ # np.arange causes trouble. the data point on arange is slightly larger than the true value
506
+ self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
507
+ self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
508
+ self.maxDets = [1, 10, 100]
509
+ self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
510
+ self.areaRngLbl = ['all', 'small', 'medium', 'large']
511
+ self.useCats = 1
512
+
513
+ def setKpParams(self):
514
+ self.imgIds = []
515
+ self.catIds = []
516
+ # np.arange causes trouble. the data point on arange is slightly larger than the true value
517
+ self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
518
+ self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
519
+ self.maxDets = [20]
520
+ self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
521
+ self.areaRngLbl = ['all', 'medium', 'large']
522
+ self.useCats = 1
523
+ self.kpt_oks_sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0
524
+
525
+ def __init__(self, iouType='segm'):
526
+ if iouType == 'segm' or iouType == 'bbox':
527
+ self.setDetParams()
528
+ elif iouType == 'keypoints':
529
+ self.setKpParams()
530
+ else:
531
+ raise Exception('iouType not supported')
532
+ self.iouType = iouType
533
+ # useSegm is deprecated
534
+ self.useSegm = None
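The class comment at the top of COCOeval already spells out the intended evaluate/accumulate/summarize loop; the sketch below simply restates it as runnable Python. Both JSON paths are placeholders, and it assumes the bundled mask utilities are functional for IoU computation:

```python
# Sketch only: the evaluate -> accumulate -> summarize loop described above.
# Both JSON paths are placeholders for files you supply yourself.
from annotator.oneformer.pycocotools.coco import COCO
from annotator.oneformer.pycocotools.cocoeval import COCOeval

coco_gt = COCO("instances_val2017.json")
coco_dt = coco_gt.loadRes("detections_val2017.json")

evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
evaluator.params.imgIds = sorted(coco_gt.getImgIds())[:100]  # optional: evaluate a subset
evaluator.evaluate()    # per-image, per-category matching
evaluator.accumulate()  # build precision/recall tables
evaluator.summarize()   # prints the 12 standard AP/AR numbers
print(evaluator.stats[0])  # AP @ IoU=0.50:0.95, area=all, maxDets=100
```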
extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/mask.py ADDED
@@ -0,0 +1,107 @@
1
+ __author__ = 'tsungyi'
2
+
3
+ # import annotator.oneformer.pycocotools._mask as _mask
4
+
5
+ # Interface for manipulating masks stored in RLE format.
6
+ #
7
+ # RLE is a simple yet efficient format for storing binary masks. RLE
8
+ # first divides a vector (or vectorized image) into a series of piecewise
9
+ # constant regions and then for each piece simply stores the length of
10
+ # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would
11
+ # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1]
12
+ # (note that the odd counts are always the numbers of zeros). Instead of
13
+ # storing the counts directly, additional compression is achieved with a
14
+ # variable bitrate representation based on a common scheme called LEB128.
15
+ #
16
+ # Compression is greatest given large piecewise constant regions.
17
+ # Specifically, the size of the RLE is proportional to the number of
18
+ # *boundaries* in M (or for an image the number of boundaries in the y
19
+ # direction). Assuming fairly simple shapes, the RLE representation is
20
+ # O(sqrt(n)) where n is number of pixels in the object. Hence space usage
21
+ # is substantially lower, especially for large simple objects (large n).
22
+ #
23
+ # Many common operations on masks can be computed directly using the RLE
24
+ # (without need for decoding). This includes computations such as area,
25
+ # union, intersection, etc. All of these operations are linear in the
26
+ # size of the RLE, in other words they are O(sqrt(n)) where n is the area
27
+ # of the object. Computing these operations on the original mask is O(n).
28
+ # Thus, using the RLE can result in substantial computational savings.
29
+ #
30
+ # The following API functions are defined:
31
+ # encode - Encode binary masks using RLE.
32
+ # decode - Decode binary masks encoded via RLE.
33
+ # merge - Compute union or intersection of encoded masks.
34
+ # iou - Compute intersection over union between masks.
35
+ # area - Compute area of encoded masks.
36
+ # toBbox - Get bounding boxes surrounding encoded masks.
37
+ # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask.
38
+ #
39
+ # Usage:
40
+ # Rs = encode( masks )
41
+ # masks = decode( Rs )
42
+ # R = merge( Rs, intersect=false )
43
+ # o = iou( dt, gt, iscrowd )
44
+ # a = area( Rs )
45
+ # bbs = toBbox( Rs )
46
+ # Rs = frPyObjects( [pyObjects], h, w )
47
+ #
48
+ # In the API the following formats are used:
49
+ # Rs - [dict] Run-length encoding of binary masks
50
+ # R - dict Run-length encoding of binary mask
51
+ # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order)
52
+ # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore
53
+ # bbs - [nx4] Bounding box(es) stored as [x y w h]
54
+ # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list)
55
+ # dt,gt - May be either bounding boxes or encoded masks
56
+ # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel).
57
+ #
58
+ # Finally, a note about the intersection over union (iou) computation.
59
+ # The standard iou of a ground truth (gt) and detected (dt) object is
60
+ # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt))
61
+ # For "crowd" regions, we use a modified criterion. If a gt object is
62
+ # marked as "iscrowd", we allow a dt to match any subregion of the gt.
63
+ # Choosing gt' in the crowd gt that best matches the dt can be done using
64
+ # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing
65
+ # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt)
66
+ # For crowd gt regions we use this modified criterion above for the iou.
67
+ #
68
+ # To compile run "python setup.py build_ext --inplace"
69
+ # Please do not contact us for help with compiling.
70
+ #
71
+ # Microsoft COCO Toolbox. version 2.0
72
+ # Data, paper, and tutorials available at: http://mscoco.org/
73
+ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
74
+ # Licensed under the Simplified BSD License [see coco/license.txt]
75
+
76
+ # iou = _mask.iou
77
+ # merge = _mask.merge
78
+ # frPyObjects = _mask.frPyObjects
79
+
80
+ def encode(bimask):
81
+ pass
82
+ # if len(bimask.shape) == 3:
83
+ # return _mask.encode(bimask)
84
+ # elif len(bimask.shape) == 2:
85
+ # h, w = bimask.shape
86
+ # return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0]
87
+
88
+ def decode(rleObjs):
89
+ pass
90
+ # if type(rleObjs) == list:
91
+ # return _mask.decode(rleObjs)
92
+ # else:
93
+ # return _mask.decode([rleObjs])[:,:,0]
94
+
95
+ def area(rleObjs):
96
+ pass
97
+ # if type(rleObjs) == list:
98
+ # return _mask.area(rleObjs)
99
+ # else:
100
+ # return _mask.area([rleObjs])[0]
101
+
102
+ def toBbox(rleObjs):
103
+ pass
104
+ # if type(rleObjs) == list:
105
+ # return _mask.toBbox(rleObjs)
106
+ # else:
107
+ # return _mask.toBbox([rleObjs])[0]
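
The header comment above describes uncompressed RLE counts, where the odd counts (1-indexed) are always runs of zeros. As a quick illustration of that convention only — the real encoder lives in the compiled _mask extension that this stub comments out — a tiny pure-Python sketch:

import numpy as np

def rle_counts(mask_1d):
    """Uncompressed RLE counts for a flat binary vector, starting with the run
    of zeros (so a mask that begins with 1 gets a leading 0), matching the
    convention described in the header comment above. Illustrative only."""
    counts, current, run = [], 0, 0
    for v in mask_1d:
        if v == current:
            run += 1
        else:
            counts.append(run)
            current, run = v, 1
    counts.append(run)
    return counts

assert rle_counts(np.array([0, 0, 1, 1, 1, 0, 1])) == [2, 3, 1, 1]
assert rle_counts(np.array([1, 1, 1, 1, 1, 1, 0])) == [0, 6, 1]
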
extensions/microsoftexcel-controlnet/annotator/openpose/LICENSE ADDED
@@ -0,0 +1,108 @@
1
+ OPENPOSE: MULTIPERSON KEYPOINT DETECTION
2
+ SOFTWARE LICENSE AGREEMENT
3
+ ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
4
+
5
+ BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
6
+
7
+ This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
8
+
9
+ RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
10
+ Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
11
+ non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
12
+
13
+ CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
14
+
15
+ COPYRIGHT: The Software is owned by Licensor and is protected by United
16
+ States copyright laws and applicable international treaties and/or conventions.
17
+
18
+ PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
19
+
20
+ DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.
21
+
22
+ BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.
23
+
24
+ USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.
25
+
26
+ You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.
27
+
28
+ ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.
29
+
30
+ TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.
31
+
32
+ The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.
33
+
34
+ FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.
35
+
36
+ DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
37
+
38
+ SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
39
+
40
+ EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
41
+
42
+ EXPORT REGULATION: Licensee agrees to comply with any and all applicable
43
+ U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
44
+
45
+ SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
46
+
47
+ NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.
48
+
49
+ GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.
50
+
51
+ ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.
52
+
53
+
54
+
55
+ ************************************************************************
56
+
57
+ THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
58
+
59
+ This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
60
+
61
+ 1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
62
+
63
+ COPYRIGHT
64
+
65
+ All contributions by the University of California:
66
+ Copyright (c) 2014-2017 The Regents of the University of California (Regents)
67
+ All rights reserved.
68
+
69
+ All other contributions:
70
+ Copyright (c) 2014-2017, the respective contributors
71
+ All rights reserved.
72
+
73
+ Caffe uses a shared copyright model: each contributor holds copyright over
74
+ their contributions to Caffe. The project versioning records all such
75
+ contribution and copyright details. If a contributor wants to further mark
76
+ their specific copyright on a particular contribution, they should indicate
77
+ their copyright solely in the commit message of the change when it is
78
+ committed.
79
+
80
+ LICENSE
81
+
82
+ Redistribution and use in source and binary forms, with or without
83
+ modification, are permitted provided that the following conditions are met:
84
+
85
+ 1. Redistributions of source code must retain the above copyright notice, this
86
+ list of conditions and the following disclaimer.
87
+ 2. Redistributions in binary form must reproduce the above copyright notice,
88
+ this list of conditions and the following disclaimer in the documentation
89
+ and/or other materials provided with the distribution.
90
+
91
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
92
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
93
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
94
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
95
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
96
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
97
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
98
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
99
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
100
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
101
+
102
+ CONTRIBUTION AGREEMENT
103
+
104
+ By contributing to the BVLC/caffe repository through pull-request, comment,
105
+ or otherwise, the contributor releases their content to the
106
+ license and copyright terms herein.
107
+
108
+ ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
extensions/microsoftexcel-controlnet/annotator/openpose/__init__.py ADDED
@@ -0,0 +1,262 @@
1
+ # Openpose
2
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
3
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
4
+ # 3rd Edited by ControlNet
5
+ # 4th Edited by ControlNet (added face and correct hands)
6
+ # 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixes)
7
+ # This preprocessor is licensed by CMU for non-commercial use only.
8
+
9
+
10
+ import os
11
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
12
+
13
+ import json
14
+ import torch
15
+ import numpy as np
16
+ from . import util
17
+ from .body import Body, BodyResult, Keypoint
18
+ from .hand import Hand
19
+ from .face import Face
20
+ from modules import devices
21
+ from annotator.annotator_path import models_path
22
+
23
+ from typing import NamedTuple, Tuple, List, Callable, Union
24
+
25
+ body_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/body_pose_model.pth"
26
+ hand_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/hand_pose_model.pth"
27
+ face_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/facenet.pth"
28
+
29
+ HandResult = List[Keypoint]
30
+ FaceResult = List[Keypoint]
31
+
32
+ class PoseResult(NamedTuple):
33
+ body: BodyResult
34
+ left_hand: Union[HandResult, None]
35
+ right_hand: Union[HandResult, None]
36
+ face: Union[FaceResult, None]
37
+
38
+ def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True):
39
+ """
40
+ Draw the detected poses on an empty canvas.
41
+
42
+ Args:
43
+ poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
44
+ H (int): The height of the canvas.
45
+ W (int): The width of the canvas.
46
+ draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
47
+ draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
48
+ draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.
49
+
50
+ Returns:
51
+ numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
52
+ """
53
+ canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
54
+
55
+ for pose in poses:
56
+ if draw_body:
57
+ canvas = util.draw_bodypose(canvas, pose.body.keypoints)
58
+
59
+ if draw_hand:
60
+ canvas = util.draw_handpose(canvas, pose.left_hand)
61
+ canvas = util.draw_handpose(canvas, pose.right_hand)
62
+
63
+ if draw_face:
64
+ canvas = util.draw_facepose(canvas, pose.face)
65
+
66
+ return canvas
67
+
68
+ def encode_poses_as_json(poses: List[PoseResult], canvas_height: int, canvas_width: int) -> str:
69
+ """ Encode the pose as a JSON string following openpose JSON output format:
70
+ https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md
71
+ """
72
+ def compress_keypoints(keypoints: Union[List[Keypoint], None]) -> Union[List[float], None]:
73
+ if not keypoints:
74
+ return None
75
+
76
+ return [
77
+ value
78
+ for keypoint in keypoints
79
+ for value in (
80
+ [float(keypoint.x), float(keypoint.y), 1.0]
81
+ if keypoint is not None
82
+ else [0.0, 0.0, 0.0]
83
+ )
84
+ ]
85
+
86
+ return json.dumps({
87
+ 'people': [
88
+ {
89
+ 'pose_keypoints_2d': compress_keypoints(pose.body.keypoints),
90
+ "face_keypoints_2d": compress_keypoints(pose.face),
91
+ "hand_left_keypoints_2d": compress_keypoints(pose.left_hand),
92
+ "hand_right_keypoints_2d":compress_keypoints(pose.right_hand),
93
+ }
94
+ for pose in poses
95
+ ],
96
+ 'canvas_height': canvas_height,
97
+ 'canvas_width': canvas_width,
98
+ }, indent=4)
99
+
100
+
101
+ class OpenposeDetector:
102
+ """
103
+ A class for detecting human poses in images using the Openpose model.
104
+
105
+ Attributes:
106
+ model_dir (str): Path to the directory where the pose models are stored.
107
+ """
108
+ model_dir = os.path.join(models_path, "openpose")
109
+
110
+ def __init__(self):
111
+ self.device = devices.get_device_for("controlnet")
112
+ self.body_estimation = None
113
+ self.hand_estimation = None
114
+ self.face_estimation = None
115
+
116
+ def load_model(self):
117
+ """
118
+ Load the Openpose body, hand, and face models.
119
+ """
120
+ body_modelpath = os.path.join(self.model_dir, "body_pose_model.pth")
121
+ hand_modelpath = os.path.join(self.model_dir, "hand_pose_model.pth")
122
+ face_modelpath = os.path.join(self.model_dir, "facenet.pth")
123
+
124
+ if not os.path.exists(body_modelpath):
125
+ from basicsr.utils.download_util import load_file_from_url
126
+ load_file_from_url(body_model_path, model_dir=self.model_dir)
127
+
128
+ if not os.path.exists(hand_modelpath):
129
+ from basicsr.utils.download_util import load_file_from_url
130
+ load_file_from_url(hand_model_path, model_dir=self.model_dir)
131
+
132
+ if not os.path.exists(face_modelpath):
133
+ from basicsr.utils.download_util import load_file_from_url
134
+ load_file_from_url(face_model_path, model_dir=self.model_dir)
135
+
136
+ self.body_estimation = Body(body_modelpath)
137
+ self.hand_estimation = Hand(hand_modelpath)
138
+ self.face_estimation = Face(face_modelpath)
139
+
140
+ def unload_model(self):
141
+ """
142
+ Unload the Openpose models by moving them to the CPU.
143
+ """
144
+ if self.body_estimation is not None:
145
+ self.body_estimation.model.to("cpu")
146
+ self.hand_estimation.model.to("cpu")
147
+ self.face_estimation.model.to("cpu")
148
+
149
+ def detect_hands(self, body: BodyResult, oriImg) -> Tuple[Union[HandResult, None], Union[HandResult, None]]:
150
+ left_hand = None
151
+ right_hand = None
152
+ H, W, _ = oriImg.shape
153
+ for x, y, w, is_left in util.handDetect(body, oriImg):
154
+ peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :]).astype(np.float32)
155
+ if peaks.ndim == 2 and peaks.shape[1] == 2:
156
+ peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
157
+ peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
158
+
159
+ hand_result = [
160
+ Keypoint(x=peak[0], y=peak[1])
161
+ for peak in peaks
162
+ ]
163
+
164
+ if is_left:
165
+ left_hand = hand_result
166
+ else:
167
+ right_hand = hand_result
168
+
169
+ return left_hand, right_hand
170
+
171
+ def detect_face(self, body: BodyResult, oriImg) -> Union[FaceResult, None]:
172
+ face = util.faceDetect(body, oriImg)
173
+ if face is None:
174
+ return None
175
+
176
+ x, y, w = face
177
+ H, W, _ = oriImg.shape
178
+ heatmaps = self.face_estimation(oriImg[y:y+w, x:x+w, :])
179
+ peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype(np.float32)
180
+ if peaks.ndim == 2 and peaks.shape[1] == 2:
181
+ peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
182
+ peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
183
+ return [
184
+ Keypoint(x=peak[0], y=peak[1])
185
+ for peak in peaks
186
+ ]
187
+
188
+ return None
189
+
190
+ def detect_poses(self, oriImg, include_hand=False, include_face=False) -> List[PoseResult]:
191
+ """
192
+ Detect poses in the given image.
193
+ Args:
194
+ oriImg (numpy.ndarray): The input image for pose detection.
195
+ include_hand (bool, optional): Whether to include hand detection. Defaults to False.
196
+ include_face (bool, optional): Whether to include face detection. Defaults to False.
197
+
198
+ Returns:
199
+ List[PoseResult]: A list of PoseResult objects containing the detected poses.
200
+ """
201
+ if self.body_estimation is None:
202
+ self.load_model()
203
+
204
+ self.body_estimation.model.to(self.device)
205
+ self.hand_estimation.model.to(self.device)
206
+ self.face_estimation.model.to(self.device)
207
+
208
+ self.body_estimation.cn_device = self.device
209
+ self.hand_estimation.cn_device = self.device
210
+ self.face_estimation.cn_device = self.device
211
+
212
+ oriImg = oriImg[:, :, ::-1].copy()
213
+ H, W, C = oriImg.shape
214
+ with torch.no_grad():
215
+ candidate, subset = self.body_estimation(oriImg)
216
+ bodies = self.body_estimation.format_body_result(candidate, subset)
217
+
218
+ results = []
219
+ for body in bodies:
220
+ left_hand, right_hand, face = (None,) * 3
221
+ if include_hand:
222
+ left_hand, right_hand = self.detect_hands(body, oriImg)
223
+ if include_face:
224
+ face = self.detect_face(body, oriImg)
225
+
226
+ results.append(PoseResult(BodyResult(
227
+ keypoints=[
228
+ Keypoint(
229
+ x=keypoint.x / float(W),
230
+ y=keypoint.y / float(H)
231
+ ) if keypoint is not None else None
232
+ for keypoint in body.keypoints
233
+ ],
234
+ total_score=body.total_score,
235
+ total_parts=body.total_parts
236
+ ), left_hand, right_hand, face))
237
+
238
+ return results
239
+
240
+ def __call__(
241
+ self, oriImg, include_body=True, include_hand=False, include_face=False,
242
+ json_pose_callback: Callable[[str], None] = None,
243
+ ):
244
+ """
245
+ Detect and draw poses in the given image.
246
+
247
+ Args:
248
+ oriImg (numpy.ndarray): The input image for pose detection and drawing.
249
+ include_body (bool, optional): Whether to include body keypoints. Defaults to True.
250
+ include_hand (bool, optional): Whether to include hand keypoints. Defaults to False.
251
+ include_face (bool, optional): Whether to include face keypoints. Defaults to False.
252
+ json_pose_callback (Callable, optional): A callback that accepts the pose JSON string.
253
+
254
+ Returns:
255
+ numpy.ndarray: The image with detected and drawn poses.
256
+ """
257
+ H, W, _ = oriImg.shape
258
+ poses = self.detect_poses(oriImg, include_hand, include_face)
259
+ if json_pose_callback:
260
+ json_pose_callback(encode_poses_as_json(poses, H, W))
261
+ return draw_poses(poses, H, W, draw_body=include_body, draw_hand=include_hand, draw_face=include_face)
262
+
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (9.36 kB).
 
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/body.cpython-310.pyc ADDED
Binary file (9.4 kB).
 
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/face.cpython-310.pyc ADDED
Binary file (8.11 kB).
 
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/hand.cpython-310.pyc ADDED
Binary file (3.18 kB).
 
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/model.cpython-310.pyc ADDED
Binary file (6.22 kB).
 
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/util.cpython-310.pyc ADDED
Binary file (11.6 kB).
 
extensions/microsoftexcel-controlnet/annotator/openpose/body.py ADDED
@@ -0,0 +1,278 @@
1
+ import cv2
2
+ import numpy as np
3
+ import math
4
+ import time
5
+ from scipy.ndimage.filters import gaussian_filter
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib
8
+ import torch
9
+ from torchvision import transforms
10
+ from typing import NamedTuple, List, Union
11
+
12
+ from . import util
13
+ from .model import bodypose_model
14
+
15
+ class Keypoint(NamedTuple):
16
+ x: float
17
+ y: float
18
+ score: float = 1.0
19
+ id: int = -1
20
+
21
+
22
+ class BodyResult(NamedTuple):
23
+ # Note: Using `Union` instead of `|` operator as the latter is a Python
24
+ # 3.10 feature.
25
+ # Annotator code should be Python 3.8 Compatible, as controlnet repo uses
26
+ # Python 3.8 environment.
27
+ # https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6
28
+ keypoints: List[Union[Keypoint, None]]
29
+ total_score: float
30
+ total_parts: int
31
+
32
+
33
+ class Body(object):
34
+ def __init__(self, model_path):
35
+ self.model = bodypose_model()
36
+ # if torch.cuda.is_available():
37
+ # self.model = self.model.cuda()
38
+ # print('cuda')
39
+ model_dict = util.transfer(self.model, torch.load(model_path))
40
+ self.model.load_state_dict(model_dict)
41
+ self.model.eval()
42
+
43
+ def __call__(self, oriImg):
44
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
45
+ scale_search = [0.5]
46
+ boxsize = 368
47
+ stride = 8
48
+ padValue = 128
49
+ thre1 = 0.1
50
+ thre2 = 0.05
51
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
52
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
53
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
54
+
55
+ for m in range(len(multiplier)):
56
+ scale = multiplier[m]
57
+ imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
58
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
59
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
60
+ im = np.ascontiguousarray(im)
61
+
62
+ data = torch.from_numpy(im).float()
63
+ if torch.cuda.is_available():
64
+ data = data.cuda()
65
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
66
+ with torch.no_grad():
67
+ data = data.to(self.cn_device)
68
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
69
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
70
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
71
+
72
+ # extract outputs, resize, and remove padding
73
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
74
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
75
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
76
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
77
+ heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))
78
+
79
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
80
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
81
+ paf = util.smart_resize_k(paf, fx=stride, fy=stride)
82
+ paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
83
+ paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))
84
+
85
+ heatmap_avg += heatmap_avg + heatmap / len(multiplier)
86
+ paf_avg += + paf / len(multiplier)
87
+
88
+ all_peaks = []
89
+ peak_counter = 0
90
+
91
+ for part in range(18):
92
+ map_ori = heatmap_avg[:, :, part]
93
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
94
+
95
+ map_left = np.zeros(one_heatmap.shape)
96
+ map_left[1:, :] = one_heatmap[:-1, :]
97
+ map_right = np.zeros(one_heatmap.shape)
98
+ map_right[:-1, :] = one_heatmap[1:, :]
99
+ map_up = np.zeros(one_heatmap.shape)
100
+ map_up[:, 1:] = one_heatmap[:, :-1]
101
+ map_down = np.zeros(one_heatmap.shape)
102
+ map_down[:, :-1] = one_heatmap[:, 1:]
103
+
104
+ peaks_binary = np.logical_and.reduce(
105
+ (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
106
+ peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
107
+ peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
108
+ peak_id = range(peak_counter, peak_counter + len(peaks))
109
+ peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
110
+
111
+ all_peaks.append(peaks_with_score_and_id)
112
+ peak_counter += len(peaks)
113
+
114
+ # find connection in the specified sequence, center 29 is in the position 15
115
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
116
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
117
+ [1, 16], [16, 18], [3, 17], [6, 18]]
118
+ # the middle joints heatmap correspondence
119
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
120
+ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
121
+ [55, 56], [37, 38], [45, 46]]
122
+
123
+ connection_all = []
124
+ special_k = []
125
+ mid_num = 10
126
+
127
+ for k in range(len(mapIdx)):
128
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
129
+ candA = all_peaks[limbSeq[k][0] - 1]
130
+ candB = all_peaks[limbSeq[k][1] - 1]
131
+ nA = len(candA)
132
+ nB = len(candB)
133
+ indexA, indexB = limbSeq[k]
134
+ if (nA != 0 and nB != 0):
135
+ connection_candidate = []
136
+ for i in range(nA):
137
+ for j in range(nB):
138
+ vec = np.subtract(candB[j][:2], candA[i][:2])
139
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
140
+ norm = max(0.001, norm)
141
+ vec = np.divide(vec, norm)
142
+
143
+ startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
144
+ np.linspace(candA[i][1], candB[j][1], num=mid_num)))
145
+
146
+ vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
147
+ for I in range(len(startend))])
148
+ vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
149
+ for I in range(len(startend))])
150
+
151
+ score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
152
+ score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
153
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
154
+ criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
155
+ criterion2 = score_with_dist_prior > 0
156
+ if criterion1 and criterion2:
157
+ connection_candidate.append(
158
+ [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
159
+
160
+ connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
161
+ connection = np.zeros((0, 5))
162
+ for c in range(len(connection_candidate)):
163
+ i, j, s = connection_candidate[c][0:3]
164
+ if (i not in connection[:, 3] and j not in connection[:, 4]):
165
+ connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
166
+ if (len(connection) >= min(nA, nB)):
167
+ break
168
+
169
+ connection_all.append(connection)
170
+ else:
171
+ special_k.append(k)
172
+ connection_all.append([])
173
+
174
+ # last number in each row is the total parts number of that person
175
+ # the second last number in each row is the score of the overall configuration
176
+ subset = -1 * np.ones((0, 20))
177
+ candidate = np.array([item for sublist in all_peaks for item in sublist])
178
+
179
+ for k in range(len(mapIdx)):
180
+ if k not in special_k:
181
+ partAs = connection_all[k][:, 0]
182
+ partBs = connection_all[k][:, 1]
183
+ indexA, indexB = np.array(limbSeq[k]) - 1
184
+
185
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
186
+ found = 0
187
+ subset_idx = [-1, -1]
188
+ for j in range(len(subset)): # 1:size(subset,1):
189
+ if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
190
+ subset_idx[found] = j
191
+ found += 1
192
+
193
+ if found == 1:
194
+ j = subset_idx[0]
195
+ if subset[j][indexB] != partBs[i]:
196
+ subset[j][indexB] = partBs[i]
197
+ subset[j][-1] += 1
198
+ subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
199
+ elif found == 2: # if found 2 and disjoint, merge them
200
+ j1, j2 = subset_idx
201
+ membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
202
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
203
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
204
+ subset[j1][-2:] += subset[j2][-2:]
205
+ subset[j1][-2] += connection_all[k][i][2]
206
+ subset = np.delete(subset, j2, 0)
207
+ else: # as like found == 1
208
+ subset[j1][indexB] = partBs[i]
209
+ subset[j1][-1] += 1
210
+ subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
211
+
212
+ # if find no partA in the subset, create a new subset
213
+ elif not found and k < 17:
214
+ row = -1 * np.ones(20)
215
+ row[indexA] = partAs[i]
216
+ row[indexB] = partBs[i]
217
+ row[-1] = 2
218
+ row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
219
+ subset = np.vstack([subset, row])
220
+ # delete some rows of subset which has few parts occur
221
+ deleteIdx = []
222
+ for i in range(len(subset)):
223
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
224
+ deleteIdx.append(i)
225
+ subset = np.delete(subset, deleteIdx, axis=0)
226
+
227
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
228
+ # candidate: x, y, score, id
229
+ return candidate, subset
230
+
231
+ @staticmethod
232
+ def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]:
233
+ """
234
+ Format the body results from the candidate and subset arrays into a list of BodyResult objects.
235
+
236
+ Args:
237
+ candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id
238
+ for each body part.
239
+ subset (np.ndarray): An array of subsets containing indices to the candidate array for each
240
+ person detected. The last two columns of each row hold the total score and total parts
241
+ of the person.
242
+
243
+ Returns:
244
+ List[BodyResult]: A list of BodyResult objects, where each object represents a person with
245
+ detected keypoints, total score, and total parts.
246
+ """
247
+ return [
248
+ BodyResult(
249
+ keypoints=[
250
+ Keypoint(
251
+ x=candidate[candidate_index][0],
252
+ y=candidate[candidate_index][1],
253
+ score=candidate[candidate_index][2],
254
+ id=candidate[candidate_index][3]
255
+ ) if candidate_index != -1 else None
256
+ for candidate_index in person[:18].astype(int)
257
+ ],
258
+ total_score=person[18],
259
+ total_parts=person[19]
260
+ )
261
+ for person in subset
262
+ ]
263
+
264
+
265
+ if __name__ == "__main__":
266
+ body_estimation = Body('../model/body_pose_model.pth')
267
+
268
+ test_image = '../images/ski.jpg'
269
+ oriImg = cv2.imread(test_image) # B,G,R order
270
+ candidate, subset = body_estimation(oriImg)
271
+ bodies = body_estimation.format_body_result(candidate, subset)
272
+
273
+ canvas = oriImg
274
+ for body in bodies:
275
+ canvas = util.draw_bodypose(canvas, body)
276
+
277
+ plt.imshow(canvas[:, :, [2, 1, 0]])
278
+ plt.show()
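
Body.__call__ above turns each smoothed part heatmap into keypoint candidates by keeping pixels that are at least as large as their four shifted neighbours and above thre1. A toy sketch of just that non-maximum-suppression step (heatmap_peaks is illustrative, not part of the annotator):

import numpy as np
from scipy.ndimage import gaussian_filter

def heatmap_peaks(heatmap: np.ndarray, thre: float = 0.1):
    """A pixel is a peak if it is >= its four neighbours and above the threshold,
    mirroring the shifted-map comparison in Body.__call__ above."""
    smoothed = gaussian_filter(heatmap, sigma=3)
    left = np.zeros_like(smoothed)
    left[1:, :] = smoothed[:-1, :]
    right = np.zeros_like(smoothed)
    right[:-1, :] = smoothed[1:, :]
    up = np.zeros_like(smoothed)
    up[:, 1:] = smoothed[:, :-1]
    down = np.zeros_like(smoothed)
    down[:, :-1] = smoothed[:, 1:]
    is_peak = np.logical_and.reduce(
        (smoothed >= left, smoothed >= right, smoothed >= up, smoothed >= down, smoothed > thre))
    ys, xs = np.nonzero(is_peak)
    return list(zip(xs, ys))  # (x, y) pairs, matching the "note reverse" comment above
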
extensions/microsoftexcel-controlnet/annotator/openpose/face.py ADDED
@@ -0,0 +1,362 @@
1
+ import logging
2
+ import numpy as np
3
+ from torchvision.transforms import ToTensor, ToPILImage
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import cv2
7
+
8
+ from . import util
9
+ from torch.nn import Conv2d, Module, ReLU, MaxPool2d, init
10
+
11
+
12
+ class FaceNet(Module):
13
+ """Model the cascading heatmaps. """
14
+ def __init__(self):
15
+ super(FaceNet, self).__init__()
16
+ # cnn to make feature map
17
+ self.relu = ReLU()
18
+ self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
19
+ self.conv1_1 = Conv2d(in_channels=3, out_channels=64,
20
+ kernel_size=3, stride=1, padding=1)
21
+ self.conv1_2 = Conv2d(
22
+ in_channels=64, out_channels=64, kernel_size=3, stride=1,
23
+ padding=1)
24
+ self.conv2_1 = Conv2d(
25
+ in_channels=64, out_channels=128, kernel_size=3, stride=1,
26
+ padding=1)
27
+ self.conv2_2 = Conv2d(
28
+ in_channels=128, out_channels=128, kernel_size=3, stride=1,
29
+ padding=1)
30
+ self.conv3_1 = Conv2d(
31
+ in_channels=128, out_channels=256, kernel_size=3, stride=1,
32
+ padding=1)
33
+ self.conv3_2 = Conv2d(
34
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
35
+ padding=1)
36
+ self.conv3_3 = Conv2d(
37
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
38
+ padding=1)
39
+ self.conv3_4 = Conv2d(
40
+ in_channels=256, out_channels=256, kernel_size=3, stride=1,
41
+ padding=1)
42
+ self.conv4_1 = Conv2d(
43
+ in_channels=256, out_channels=512, kernel_size=3, stride=1,
44
+ padding=1)
45
+ self.conv4_2 = Conv2d(
46
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
47
+ padding=1)
48
+ self.conv4_3 = Conv2d(
49
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
50
+ padding=1)
51
+ self.conv4_4 = Conv2d(
52
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
53
+ padding=1)
54
+ self.conv5_1 = Conv2d(
55
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
56
+ padding=1)
57
+ self.conv5_2 = Conv2d(
58
+ in_channels=512, out_channels=512, kernel_size=3, stride=1,
59
+ padding=1)
60
+ self.conv5_3_CPM = Conv2d(
61
+ in_channels=512, out_channels=128, kernel_size=3, stride=1,
62
+ padding=1)
63
+
64
+ # stage1
65
+ self.conv6_1_CPM = Conv2d(
66
+ in_channels=128, out_channels=512, kernel_size=1, stride=1,
67
+ padding=0)
68
+ self.conv6_2_CPM = Conv2d(
69
+ in_channels=512, out_channels=71, kernel_size=1, stride=1,
70
+ padding=0)
71
+
72
+ # stage2
73
+ self.Mconv1_stage2 = Conv2d(
74
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
75
+ padding=3)
76
+ self.Mconv2_stage2 = Conv2d(
77
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
78
+ padding=3)
79
+ self.Mconv3_stage2 = Conv2d(
80
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
81
+ padding=3)
82
+ self.Mconv4_stage2 = Conv2d(
83
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
84
+ padding=3)
85
+ self.Mconv5_stage2 = Conv2d(
86
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
87
+ padding=3)
88
+ self.Mconv6_stage2 = Conv2d(
89
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
90
+ padding=0)
91
+ self.Mconv7_stage2 = Conv2d(
92
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
93
+ padding=0)
94
+
95
+ # stage3
96
+ self.Mconv1_stage3 = Conv2d(
97
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
98
+ padding=3)
99
+ self.Mconv2_stage3 = Conv2d(
100
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
101
+ padding=3)
102
+ self.Mconv3_stage3 = Conv2d(
103
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
104
+ padding=3)
105
+ self.Mconv4_stage3 = Conv2d(
106
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
107
+ padding=3)
108
+ self.Mconv5_stage3 = Conv2d(
109
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
110
+ padding=3)
111
+ self.Mconv6_stage3 = Conv2d(
112
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
113
+ padding=0)
114
+ self.Mconv7_stage3 = Conv2d(
115
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
116
+ padding=0)
117
+
118
+ # stage4
119
+ self.Mconv1_stage4 = Conv2d(
120
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
121
+ padding=3)
122
+ self.Mconv2_stage4 = Conv2d(
123
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
124
+ padding=3)
125
+ self.Mconv3_stage4 = Conv2d(
126
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
127
+ padding=3)
128
+ self.Mconv4_stage4 = Conv2d(
129
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
130
+ padding=3)
131
+ self.Mconv5_stage4 = Conv2d(
132
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
133
+ padding=3)
134
+ self.Mconv6_stage4 = Conv2d(
135
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
136
+ padding=0)
137
+ self.Mconv7_stage4 = Conv2d(
138
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
139
+ padding=0)
140
+
141
+ # stage5
142
+ self.Mconv1_stage5 = Conv2d(
143
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
144
+ padding=3)
145
+ self.Mconv2_stage5 = Conv2d(
146
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
147
+ padding=3)
148
+ self.Mconv3_stage5 = Conv2d(
149
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
150
+ padding=3)
151
+ self.Mconv4_stage5 = Conv2d(
152
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
153
+ padding=3)
154
+ self.Mconv5_stage5 = Conv2d(
155
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
156
+ padding=3)
157
+ self.Mconv6_stage5 = Conv2d(
158
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
159
+ padding=0)
160
+ self.Mconv7_stage5 = Conv2d(
161
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
162
+ padding=0)
163
+
164
+ # stage6
165
+ self.Mconv1_stage6 = Conv2d(
166
+ in_channels=199, out_channels=128, kernel_size=7, stride=1,
167
+ padding=3)
168
+ self.Mconv2_stage6 = Conv2d(
169
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
170
+ padding=3)
171
+ self.Mconv3_stage6 = Conv2d(
172
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
173
+ padding=3)
174
+ self.Mconv4_stage6 = Conv2d(
175
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
176
+ padding=3)
177
+ self.Mconv5_stage6 = Conv2d(
178
+ in_channels=128, out_channels=128, kernel_size=7, stride=1,
179
+ padding=3)
180
+ self.Mconv6_stage6 = Conv2d(
181
+ in_channels=128, out_channels=128, kernel_size=1, stride=1,
182
+ padding=0)
183
+ self.Mconv7_stage6 = Conv2d(
184
+ in_channels=128, out_channels=71, kernel_size=1, stride=1,
185
+ padding=0)
186
+
187
+ for m in self.modules():
188
+ if isinstance(m, Conv2d):
189
+ init.constant_(m.bias, 0)
190
+
191
+ def forward(self, x):
192
+ """Return a list of heatmaps."""
193
+ heatmaps = []
194
+
195
+ h = self.relu(self.conv1_1(x))
196
+ h = self.relu(self.conv1_2(h))
197
+ h = self.max_pooling_2d(h)
198
+ h = self.relu(self.conv2_1(h))
199
+ h = self.relu(self.conv2_2(h))
200
+ h = self.max_pooling_2d(h)
201
+ h = self.relu(self.conv3_1(h))
202
+ h = self.relu(self.conv3_2(h))
203
+ h = self.relu(self.conv3_3(h))
204
+ h = self.relu(self.conv3_4(h))
205
+ h = self.max_pooling_2d(h)
206
+ h = self.relu(self.conv4_1(h))
207
+ h = self.relu(self.conv4_2(h))
208
+ h = self.relu(self.conv4_3(h))
209
+ h = self.relu(self.conv4_4(h))
210
+ h = self.relu(self.conv5_1(h))
211
+ h = self.relu(self.conv5_2(h))
212
+ h = self.relu(self.conv5_3_CPM(h))
213
+ feature_map = h
214
+
215
+ # stage1
216
+ h = self.relu(self.conv6_1_CPM(h))
217
+ h = self.conv6_2_CPM(h)
218
+ heatmaps.append(h)
219
+
220
+ # stage2
221
+ h = torch.cat([h, feature_map], dim=1) # channel concat
222
+ h = self.relu(self.Mconv1_stage2(h))
223
+ h = self.relu(self.Mconv2_stage2(h))
224
+ h = self.relu(self.Mconv3_stage2(h))
225
+ h = self.relu(self.Mconv4_stage2(h))
226
+ h = self.relu(self.Mconv5_stage2(h))
227
+ h = self.relu(self.Mconv6_stage2(h))
228
+ h = self.Mconv7_stage2(h)
229
+ heatmaps.append(h)
230
+
231
+ # stage3
232
+ h = torch.cat([h, feature_map], dim=1) # channel concat
233
+ h = self.relu(self.Mconv1_stage3(h))
234
+ h = self.relu(self.Mconv2_stage3(h))
235
+ h = self.relu(self.Mconv3_stage3(h))
236
+ h = self.relu(self.Mconv4_stage3(h))
237
+ h = self.relu(self.Mconv5_stage3(h))
238
+ h = self.relu(self.Mconv6_stage3(h))
239
+ h = self.Mconv7_stage3(h)
240
+ heatmaps.append(h)
241
+
242
+ # stage4
243
+ h = torch.cat([h, feature_map], dim=1) # channel concat
244
+ h = self.relu(self.Mconv1_stage4(h))
245
+ h = self.relu(self.Mconv2_stage4(h))
246
+ h = self.relu(self.Mconv3_stage4(h))
247
+ h = self.relu(self.Mconv4_stage4(h))
248
+ h = self.relu(self.Mconv5_stage4(h))
249
+ h = self.relu(self.Mconv6_stage4(h))
250
+ h = self.Mconv7_stage4(h)
251
+ heatmaps.append(h)
252
+
253
+ # stage5
254
+ h = torch.cat([h, feature_map], dim=1) # channel concat
255
+ h = self.relu(self.Mconv1_stage5(h))
256
+ h = self.relu(self.Mconv2_stage5(h))
257
+ h = self.relu(self.Mconv3_stage5(h))
258
+ h = self.relu(self.Mconv4_stage5(h))
259
+ h = self.relu(self.Mconv5_stage5(h))
260
+ h = self.relu(self.Mconv6_stage5(h))
261
+ h = self.Mconv7_stage5(h)
262
+ heatmaps.append(h)
263
+
264
+ # stage6
265
+ h = torch.cat([h, feature_map], dim=1) # channel concat
266
+ h = self.relu(self.Mconv1_stage6(h))
267
+ h = self.relu(self.Mconv2_stage6(h))
268
+ h = self.relu(self.Mconv3_stage6(h))
269
+ h = self.relu(self.Mconv4_stage6(h))
270
+ h = self.relu(self.Mconv5_stage6(h))
271
+ h = self.relu(self.Mconv6_stage6(h))
272
+ h = self.Mconv7_stage6(h)
273
+ heatmaps.append(h)
274
+
275
+ return heatmaps
276
+
277
+
278
+ LOG = logging.getLogger(__name__)
279
+ TOTEN = ToTensor()
280
+ TOPIL = ToPILImage()
281
+
282
+
283
+ params = {
284
+ 'gaussian_sigma': 2.5,
285
+ 'inference_img_size': 736, # 368, 736, 1312
286
+ 'heatmap_peak_thresh': 0.1,
287
+ 'crop_scale': 1.5,
288
+ 'line_indices': [
289
+ [0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
290
+ [6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
291
+ [13, 14], [14, 15], [15, 16],
292
+ [17, 18], [18, 19], [19, 20], [20, 21],
293
+ [22, 23], [23, 24], [24, 25], [25, 26],
294
+ [27, 28], [28, 29], [29, 30],
295
+ [31, 32], [32, 33], [33, 34], [34, 35],
296
+ [36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
297
+ [42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
298
+ [48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
299
+ [54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
300
+ [60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
301
+ [66, 67], [67, 60]
302
+ ],
303
+ }
304
+
305
+
306
+ class Face(object):
307
+ """
308
+ The OpenPose face landmark detector model.
309
+
310
+ Args:
311
+ inference_size: set the inference image size, suggested:
312
+ 368, 736, 1312, default 736
313
+ gaussian_sigma: blur the heatmaps, default 2.5
314
+ heatmap_peak_thresh: return landmark if over threshold, default 0.1
315
+
316
+ """
317
+ def __init__(self, face_model_path,
318
+ inference_size=None,
319
+ gaussian_sigma=None,
320
+ heatmap_peak_thresh=None):
321
+ self.inference_size = inference_size or params["inference_img_size"]
322
+ self.sigma = gaussian_sigma or params['gaussian_sigma']
323
+ self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
324
+ self.model = FaceNet()
325
+ self.model.load_state_dict(torch.load(face_model_path))
326
+ # if torch.cuda.is_available():
327
+ # self.model = self.model.cuda()
328
+ # print('cuda')
329
+ self.model.eval()
330
+
331
+ def __call__(self, face_img):
332
+ H, W, C = face_img.shape
333
+
334
+ w_size = 384
335
+ x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5
336
+
337
+ x_data = x_data.to(self.cn_device)
338
+
339
+ with torch.no_grad():
340
+ hs = self.model(x_data[None, ...])
341
+ heatmaps = F.interpolate(
342
+ hs[-1],
343
+ (H, W),
344
+ mode='bilinear', align_corners=True).cpu().numpy()[0]
345
+ return heatmaps
346
+
347
+ def compute_peaks_from_heatmaps(self, heatmaps):
348
+ all_peaks = []
349
+ for part in range(heatmaps.shape[0]):
350
+ map_ori = heatmaps[part].copy()
351
+ binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)
352
+
353
+ if np.sum(binary) == 0:
354
+ continue
355
+
356
+ positions = np.where(binary > 0.5)
357
+ intensities = map_ori[positions]
358
+ mi = np.argmax(intensities)
359
+ y, x = positions[0][mi], positions[1][mi]
360
+ all_peaks.append([x, y])
361
+
362
+ return np.array(all_peaks)
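
Face.compute_peaks_from_heatmaps returns peaks in crop coordinates; detect_face and detect_hands in __init__.py then shift them by the crop offset and normalize by the full image size, mapping undetected (near-zero) peaks to -1 before scaling. A hedged sketch of that remapping (crop_peaks_to_normalized is a hypothetical helper, not part of the extension):

import numpy as np

def crop_peaks_to_normalized(peaks: np.ndarray, x: int, y: int, W: int, H: int) -> np.ndarray:
    """Shift peaks found inside a crop at offset (x, y) back into full-image
    coordinates and normalize by the image size, as detect_face/detect_hands do."""
    out = peaks.astype(np.float32).copy()
    out[:, 0] = np.where(out[:, 0] < 1e-6, -1, out[:, 0] + x) / float(W)
    out[:, 1] = np.where(out[:, 1] < 1e-6, -1, out[:, 1] + y) / float(H)
    return out
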
extensions/microsoftexcel-controlnet/annotator/openpose/hand.py ADDED
@@ -0,0 +1,94 @@
1
+ import cv2
2
+ import json
3
+ import numpy as np
4
+ import math
5
+ import time
6
+ from scipy.ndimage.filters import gaussian_filter
7
+ import matplotlib.pyplot as plt
8
+ import matplotlib
9
+ import torch
10
+ from skimage.measure import label
11
+
12
+ from .model import handpose_model
13
+ from . import util
14
+
15
+ class Hand(object):
16
+ def __init__(self, model_path):
17
+ self.model = handpose_model()
18
+ # if torch.cuda.is_available():
19
+ # self.model = self.model.cuda()
20
+ # print('cuda')
21
+ model_dict = util.transfer(self.model, torch.load(model_path))
22
+ self.model.load_state_dict(model_dict)
23
+ self.model.eval()
24
+
25
+ def __call__(self, oriImgRaw):
26
+ scale_search = [0.5, 1.0, 1.5, 2.0]
27
+ # scale_search = [0.5]
28
+ boxsize = 368
29
+ stride = 8
30
+ padValue = 128
31
+ thre = 0.05
32
+ multiplier = [x * boxsize for x in scale_search]
33
+
34
+ wsize = 128
35
+ heatmap_avg = np.zeros((wsize, wsize, 22))
36
+
37
+ Hr, Wr, Cr = oriImgRaw.shape
38
+
39
+ oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)
40
+
41
+ for m in range(len(multiplier)):
42
+ scale = multiplier[m]
43
+ imageToTest = util.smart_resize(oriImg, (scale, scale))
44
+
45
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
46
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
47
+ im = np.ascontiguousarray(im)
48
+
49
+ data = torch.from_numpy(im).float()
50
+ if torch.cuda.is_available():
51
+ data = data.cuda()
52
+
53
+ with torch.no_grad():
54
+ data = data.to(self.cn_device)
55
+ output = self.model(data).cpu().numpy()
56
+
57
+ # extract outputs, resize, and remove padding
58
+ heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps
59
+ heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
60
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
61
+ heatmap = util.smart_resize(heatmap, (wsize, wsize))
62
+
63
+ heatmap_avg += heatmap / len(multiplier)
64
+
65
+ all_peaks = []
66
+ for part in range(21):
67
+ map_ori = heatmap_avg[:, :, part]
68
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
69
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
70
+
71
+ if np.sum(binary) == 0:
72
+ all_peaks.append([0, 0])
73
+ continue
74
+ label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
75
+ max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
76
+ label_img[label_img != max_index] = 0
77
+ map_ori[label_img == 0] = 0
78
+
79
+ y, x = util.npmax(map_ori)
80
+ y = int(float(y) * float(Hr) / float(wsize))
81
+ x = int(float(x) * float(Wr) / float(wsize))
82
+ all_peaks.append([x, y])
83
+ return np.array(all_peaks)
84
+
85
+ if __name__ == "__main__":
86
+ hand_estimation = Hand('../model/hand_pose_model.pth')
87
+
88
+ # test_image = '../images/hand.jpg'
89
+ test_image = '../images/hand.jpg'
90
+ oriImg = cv2.imread(test_image) # B,G,R order
91
+ peaks = hand_estimation(oriImg)
92
+ canvas = util.draw_handpose(oriImg, peaks, True)
93
+ cv2.imshow('', canvas)
94
+ cv2.waitKey(0)
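
For each of the 21 hand part heatmaps, Hand.__call__ above thresholds the map, keeps only the connected component with the largest total response (via skimage.measure.label), and takes the arg-max inside it. A compact illustrative version of that per-part selection (strongest_component_peak is not part of the annotator):

import numpy as np
from skimage.measure import label

def strongest_component_peak(map_ori: np.ndarray, thre: float = 0.05):
    """Threshold the heatmap, keep the connected component with the largest
    total response, and return the (x, y) arg-max inside it, or None if empty."""
    binary = (map_ori > thre).astype(np.uint8)
    if binary.sum() == 0:
        return None
    label_img, n = label(binary, return_num=True, connectivity=binary.ndim)
    sums = [map_ori[label_img == i].sum() for i in range(1, n + 1)]
    keep = int(np.argmax(sums)) + 1
    masked = np.where(label_img == keep, map_ori, 0)
    y, x = np.unravel_index(np.argmax(masked), masked.shape)
    return int(x), int(y)
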
extensions/microsoftexcel-controlnet/annotator/openpose/model.py ADDED
@@ -0,0 +1,218 @@
1
+ import torch
2
+ from collections import OrderedDict
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ def make_layers(block, no_relu_layers):
8
+ layers = []
9
+ for layer_name, v in block.items():
10
+ if 'pool' in layer_name:
11
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
12
+ padding=v[2])
13
+ layers.append((layer_name, layer))
14
+ else:
15
+ conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
16
+ kernel_size=v[2], stride=v[3],
17
+ padding=v[4])
18
+ layers.append((layer_name, conv2d))
19
+ if layer_name not in no_relu_layers:
20
+ layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
21
+
22
+ return nn.Sequential(OrderedDict(layers))
23
+
24
+ class bodypose_model(nn.Module):
25
+ def __init__(self):
26
+ super(bodypose_model, self).__init__()
27
+
28
+ # these layers have no relu layer
29
+ no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
30
+ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
31
+ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
32
+ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1']
33
+ blocks = {}
34
+ block0 = OrderedDict([
35
+ ('conv1_1', [3, 64, 3, 1, 1]),
36
+ ('conv1_2', [64, 64, 3, 1, 1]),
37
+ ('pool1_stage1', [2, 2, 0]),
38
+ ('conv2_1', [64, 128, 3, 1, 1]),
39
+ ('conv2_2', [128, 128, 3, 1, 1]),
40
+ ('pool2_stage1', [2, 2, 0]),
41
+ ('conv3_1', [128, 256, 3, 1, 1]),
42
+ ('conv3_2', [256, 256, 3, 1, 1]),
43
+ ('conv3_3', [256, 256, 3, 1, 1]),
44
+ ('conv3_4', [256, 256, 3, 1, 1]),
45
+ ('pool3_stage1', [2, 2, 0]),
46
+ ('conv4_1', [256, 512, 3, 1, 1]),
47
+ ('conv4_2', [512, 512, 3, 1, 1]),
48
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
49
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])
50
+ ])
51
+
52
+
53
+ # Stage 1
54
+ block1_1 = OrderedDict([
55
+ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
56
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
57
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
58
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
59
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
60
+ ])
61
+
62
+ block1_2 = OrderedDict([
63
+ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
64
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
65
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
66
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
67
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
68
+ ])
69
+ blocks['block1_1'] = block1_1
70
+ blocks['block1_2'] = block1_2
71
+
72
+ self.model0 = make_layers(block0, no_relu_layers)
73
+
74
+ # Stages 2 - 6
75
+ for i in range(2, 7):
76
+ blocks['block%d_1' % i] = OrderedDict([
77
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
78
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
79
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
80
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
81
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
82
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
83
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
84
+ ])
85
+
86
+ blocks['block%d_2' % i] = OrderedDict([
87
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
88
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
89
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
90
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
91
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
92
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
93
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
94
+ ])
95
+
96
+ for k in blocks.keys():
97
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
98
+
99
+ self.model1_1 = blocks['block1_1']
100
+ self.model2_1 = blocks['block2_1']
101
+ self.model3_1 = blocks['block3_1']
102
+ self.model4_1 = blocks['block4_1']
103
+ self.model5_1 = blocks['block5_1']
104
+ self.model6_1 = blocks['block6_1']
105
+
106
+ self.model1_2 = blocks['block1_2']
107
+ self.model2_2 = blocks['block2_2']
108
+ self.model3_2 = blocks['block3_2']
109
+ self.model4_2 = blocks['block4_2']
110
+ self.model5_2 = blocks['block5_2']
111
+ self.model6_2 = blocks['block6_2']
112
+
113
+
114
+ def forward(self, x):
115
+
116
+ out1 = self.model0(x)
117
+
118
+ out1_1 = self.model1_1(out1)
119
+ out1_2 = self.model1_2(out1)
120
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
121
+
122
+ out2_1 = self.model2_1(out2)
123
+ out2_2 = self.model2_2(out2)
124
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
125
+
126
+ out3_1 = self.model3_1(out3)
127
+ out3_2 = self.model3_2(out3)
128
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
129
+
130
+ out4_1 = self.model4_1(out4)
131
+ out4_2 = self.model4_2(out4)
132
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
133
+
134
+ out5_1 = self.model5_1(out5)
135
+ out5_2 = self.model5_2(out5)
136
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
137
+
138
+ out6_1 = self.model6_1(out6)
139
+ out6_2 = self.model6_2(out6)
140
+
141
+ return out6_1, out6_2
142
+
143
+ class handpose_model(nn.Module):
144
+ def __init__(self):
145
+ super(handpose_model, self).__init__()
146
+
147
+ # these layers are not followed by a ReLU
148
+ no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
149
+ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
150
+ # stage 1
151
+ block1_0 = OrderedDict([
152
+ ('conv1_1', [3, 64, 3, 1, 1]),
153
+ ('conv1_2', [64, 64, 3, 1, 1]),
154
+ ('pool1_stage1', [2, 2, 0]),
155
+ ('conv2_1', [64, 128, 3, 1, 1]),
156
+ ('conv2_2', [128, 128, 3, 1, 1]),
157
+ ('pool2_stage1', [2, 2, 0]),
158
+ ('conv3_1', [128, 256, 3, 1, 1]),
159
+ ('conv3_2', [256, 256, 3, 1, 1]),
160
+ ('conv3_3', [256, 256, 3, 1, 1]),
161
+ ('conv3_4', [256, 256, 3, 1, 1]),
162
+ ('pool3_stage1', [2, 2, 0]),
163
+ ('conv4_1', [256, 512, 3, 1, 1]),
164
+ ('conv4_2', [512, 512, 3, 1, 1]),
165
+ ('conv4_3', [512, 512, 3, 1, 1]),
166
+ ('conv4_4', [512, 512, 3, 1, 1]),
167
+ ('conv5_1', [512, 512, 3, 1, 1]),
168
+ ('conv5_2', [512, 512, 3, 1, 1]),
169
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])
170
+ ])
171
+
172
+ block1_1 = OrderedDict([
173
+ ('conv6_1_CPM', [128, 512, 1, 1, 0]),
174
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])
175
+ ])
176
+
177
+ blocks = {}
178
+ blocks['block1_0'] = block1_0
179
+ blocks['block1_1'] = block1_1
180
+
181
+ # stage 2-6
182
+ for i in range(2, 7):
183
+ blocks['block%d' % i] = OrderedDict([
184
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
185
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
186
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
187
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
188
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
189
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
190
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
191
+ ])
192
+
193
+ for k in blocks.keys():
194
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
195
+
196
+ self.model1_0 = blocks['block1_0']
197
+ self.model1_1 = blocks['block1_1']
198
+ self.model2 = blocks['block2']
199
+ self.model3 = blocks['block3']
200
+ self.model4 = blocks['block4']
201
+ self.model5 = blocks['block5']
202
+ self.model6 = blocks['block6']
203
+
204
+ def forward(self, x):
205
+ out1_0 = self.model1_0(x)
206
+ out1_1 = self.model1_1(out1_0)
207
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
208
+ out_stage2 = self.model2(concat_stage2)
209
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
210
+ out_stage3 = self.model3(concat_stage3)
211
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
212
+ out_stage4 = self.model4(concat_stage4)
213
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
214
+ out_stage5 = self.model5(concat_stage5)
215
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
216
+ out_stage6 = self.model6(concat_stage6)
217
+ return out_stage6
218
+
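A quick shape check clarifies the two architectures above: three 2x2 max-poolings give an output stride of 8, the body model's two branches emit 38 and 19 channels (part-affinity fields and keypoint heatmaps), and the hand model emits 22 heatmap channels. A minimal sketch, assuming untrained weights and a 368x368 input are fine for the check:

import torch
from annotator.openpose.model import bodypose_model, handpose_model  # assumed import path inside the WebUI

body = bodypose_model().eval()
hand = handpose_model().eval()
x = torch.zeros(1, 3, 368, 368)
with torch.no_grad():
    pafs, heatmaps = body(x)   # stage-6 outputs of the L1 and L2 branches
    hand_maps = hand(x)
print(pafs.shape)              # torch.Size([1, 38, 46, 46])
print(heatmaps.shape)          # torch.Size([1, 19, 46, 46])
print(hand_maps.shape)         # torch.Size([1, 22, 46, 46])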
extensions/microsoftexcel-controlnet/annotator/openpose/util.py ADDED
@@ -0,0 +1,383 @@
1
+ import math
2
+ import numpy as np
3
+ import matplotlib
4
+ import cv2
5
+ from typing import List, Tuple, Union
6
+
7
+ from .body import BodyResult, Keypoint
8
+
9
+ eps = 0.01
10
+
11
+
12
+ def smart_resize(x, s):
13
+ Ht, Wt = s
14
+ if x.ndim == 2:
15
+ Ho, Wo = x.shape
16
+ Co = 1
17
+ else:
18
+ Ho, Wo, Co = x.shape
19
+ if Co == 3 or Co == 1:
20
+ k = float(Ht + Wt) / float(Ho + Wo)
21
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
22
+ else:
23
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
24
+
25
+
26
+ def smart_resize_k(x, fx, fy):
27
+ if x.ndim == 2:
28
+ Ho, Wo = x.shape
29
+ Co = 1
30
+ else:
31
+ Ho, Wo, Co = x.shape
32
+ Ht, Wt = Ho * fy, Wo * fx
33
+ if Co == 3 or Co == 1:
34
+ k = float(Ht + Wt) / float(Ho + Wo)
35
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
36
+ else:
37
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
38
+
39
+
40
+ def padRightDownCorner(img, stride, padValue):
41
+ h = img.shape[0]
42
+ w = img.shape[1]
43
+
44
+ pad = 4 * [None]
45
+ pad[0] = 0 # up
46
+ pad[1] = 0 # left
47
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
48
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
49
+
50
+ img_padded = img
51
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
52
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
53
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
54
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
55
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
56
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
57
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
58
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
59
+
60
+ return img_padded, pad
61
+
62
+
63
+ def transfer(model, model_weights):
64
+ transfered_model_weights = {}
65
+ for weights_name in model.state_dict().keys():
66
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
67
+ return transfered_model_weights
68
+
69
+
70
+ def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint]) -> np.ndarray:
71
+ """
72
+ Draw keypoints and limbs representing body pose on a given canvas.
73
+
74
+ Args:
75
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose.
76
+ keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn.
77
+
78
+ Returns:
79
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose.
80
+
81
+ Note:
82
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
83
+ """
84
+ H, W, C = canvas.shape
85
+ stickwidth = 4
86
+
87
+ limbSeq = [
88
+ [2, 3], [2, 6], [3, 4], [4, 5],
89
+ [6, 7], [7, 8], [2, 9], [9, 10],
90
+ [10, 11], [2, 12], [12, 13], [13, 14],
91
+ [2, 1], [1, 15], [15, 17], [1, 16],
92
+ [16, 18],
93
+ ]
94
+
95
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
96
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
97
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
98
+
99
+ for (k1_index, k2_index), color in zip(limbSeq, colors):
100
+ keypoint1 = keypoints[k1_index - 1]
101
+ keypoint2 = keypoints[k2_index - 1]
102
+
103
+ if keypoint1 is None or keypoint2 is None:
104
+ continue
105
+
106
+ Y = np.array([keypoint1.x, keypoint2.x]) * float(W)
107
+ X = np.array([keypoint1.y, keypoint2.y]) * float(H)
108
+ mX = np.mean(X)
109
+ mY = np.mean(Y)
110
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
111
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
112
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
113
+ cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color])
114
+
115
+ for keypoint, color in zip(keypoints, colors):
116
+ if keypoint is None:
117
+ continue
118
+
119
+ x, y = keypoint.x, keypoint.y
120
+ x = int(x * W)
121
+ y = int(y * H)
122
+ cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1)
123
+
124
+ return canvas
125
+
126
+
127
+ def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
128
+ """
129
+ Draw keypoints and connections representing hand pose on a given canvas.
130
+
131
+ Args:
132
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose.
133
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn
134
+ or None if no keypoints are present.
135
+
136
+ Returns:
137
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose.
138
+
139
+ Note:
140
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
141
+ """
142
+ if not keypoints:
143
+ return canvas
144
+
145
+ H, W, C = canvas.shape
146
+
147
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
148
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
149
+
150
+ for ie, (e1, e2) in enumerate(edges):
151
+ k1 = keypoints[e1]
152
+ k2 = keypoints[e2]
153
+ if k1 is None or k2 is None:
154
+ continue
155
+
156
+ x1 = int(k1.x * W)
157
+ y1 = int(k1.y * H)
158
+ x2 = int(k2.x * W)
159
+ y2 = int(k2.y * H)
160
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
161
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
162
+
163
+ for keypoint in keypoints:
164
+ x, y = keypoint.x, keypoint.y
165
+ x = int(x * W)
166
+ y = int(y * H)
167
+ if x > eps and y > eps:
168
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
169
+ return canvas
170
+
171
+
172
+ def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
173
+ """
174
+ Draw keypoints representing face pose on a given canvas.
175
+
176
+ Args:
177
+ canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose.
178
+ keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn
179
+ or None if no keypoints are present.
180
+
181
+ Returns:
182
+ np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose.
183
+
184
+ Note:
185
+ The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
186
+ """
187
+ if not keypoints:
188
+ return canvas
189
+
190
+ H, W, C = canvas.shape
191
+ for keypoint in keypoints:
192
+ x, y = keypoint.x, keypoint.y
193
+ x = int(x * W)
194
+ y = int(y * H)
195
+ if x > eps and y > eps:
196
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
197
+ return canvas
198
+
199
+
200
+ # detect hand according to body pose keypoints
201
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
202
+ def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]:
203
+ """
204
+ Detect hands in the input body pose keypoints and calculate the bounding box for each hand.
205
+
206
+ Args:
207
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
208
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
209
+
210
+ Returns:
211
+ List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left
212
+ corner of the bounding box, the width (height) of the bounding box, and
213
+ a boolean flag indicating whether the hand is a left hand (True) or a
214
+ right hand (False).
215
+
216
+ Notes:
217
+ - The width and height of the bounding boxes are equal since the network requires squared input.
218
+ - The minimum bounding box size is 20 pixels.
219
+ """
220
+ ratioWristElbow = 0.33
221
+ detect_result = []
222
+ image_height, image_width = oriImg.shape[0:2]
223
+
224
+ keypoints = body.keypoints
225
+ # right hand: wrist 4, elbow 3, shoulder 2
226
+ # left hand: wrist 7, elbow 6, shoulder 5
227
+ left_shoulder = keypoints[5]
228
+ left_elbow = keypoints[6]
229
+ left_wrist = keypoints[7]
230
+ right_shoulder = keypoints[2]
231
+ right_elbow = keypoints[3]
232
+ right_wrist = keypoints[4]
233
+
234
+ # if any of three not detected
235
+ has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist))
236
+ has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist))
237
+ if not (has_left or has_right):
238
+ return []
239
+
240
+ hands = []
241
+ #left hand
242
+ if has_left:
243
+ hands.append([
244
+ left_shoulder.x, left_shoulder.y,
245
+ left_elbow.x, left_elbow.y,
246
+ left_wrist.x, left_wrist.y,
247
+ True
248
+ ])
249
+ # right hand
250
+ if has_right:
251
+ hands.append([
252
+ right_shoulder.x, right_shoulder.y,
253
+ right_elbow.x, right_elbow.y,
254
+ right_wrist.x, right_wrist.y,
255
+ False
256
+ ])
257
+
258
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
259
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
260
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
261
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
262
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
263
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
264
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
265
+ x = x3 + ratioWristElbow * (x3 - x2)
266
+ y = y3 + ratioWristElbow * (y3 - y2)
267
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
268
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
269
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
270
+ # x-y refers to the center --> offset to topLeft point
271
+ # handRectangle.x -= handRectangle.width / 2.f;
272
+ # handRectangle.y -= handRectangle.height / 2.f;
273
+ x -= width / 2
274
+ y -= width / 2 # width = height
275
+ # overflow the image
276
+ if x < 0: x = 0
277
+ if y < 0: y = 0
278
+ width1 = width
279
+ width2 = width
280
+ if x + width > image_width: width1 = image_width - x
281
+ if y + width > image_height: width2 = image_height - y
282
+ width = min(width1, width2)
283
+ # keep only hand boxes that are at least 20 pixels wide
284
+ if width >= 20:
285
+ detect_result.append((int(x), int(y), int(width), is_left))
286
+
287
+ '''
288
+ return value: [[x, y, w, True if left hand else False]].
289
+ width=height since the network require squared input.
290
+ x, y is the coordinate of top left
291
+ '''
292
+ return detect_result
293
+
294
+
295
+ # Written by Lvmin
296
+ def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]:
297
+ """
298
+ Detect the face in the input body pose keypoints and calculate the bounding box for the face.
299
+
300
+ Args:
301
+ body (BodyResult): A BodyResult object containing the detected body pose keypoints.
302
+ oriImg (numpy.ndarray): A 3D numpy array representing the original input image.
303
+
304
+ Returns:
305
+ Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the
306
+ bounding box and the width (height) of the bounding box, or None if the
307
+ face is not detected or the bounding box width is less than 20 pixels.
308
+
309
+ Notes:
310
+ - The width and height of the bounding box are equal.
311
+ - The minimum bounding box size is 20 pixels.
312
+ """
313
+ # left right eye ear 14 15 16 17
314
+ image_height, image_width = oriImg.shape[0:2]
315
+
316
+ keypoints = body.keypoints
317
+ head = keypoints[0]
318
+ left_eye = keypoints[14]
319
+ right_eye = keypoints[15]
320
+ left_ear = keypoints[16]
321
+ right_ear = keypoints[17]
322
+
323
+ if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)):
324
+ return None
325
+
326
+ width = 0.0
327
+ x0, y0 = head.x, head.y
328
+
329
+ if left_eye is not None:
330
+ x1, y1 = left_eye.x, left_eye.y
331
+ d = max(abs(x0 - x1), abs(y0 - y1))
332
+ width = max(width, d * 3.0)
333
+
334
+ if right_eye is not None:
335
+ x1, y1 = right_eye.x, right_eye.y
336
+ d = max(abs(x0 - x1), abs(y0 - y1))
337
+ width = max(width, d * 3.0)
338
+
339
+ if left_ear is not None:
340
+ x1, y1 = left_ear.x, left_ear.y
341
+ d = max(abs(x0 - x1), abs(y0 - y1))
342
+ width = max(width, d * 1.5)
343
+
344
+ if right_ear is not None:
345
+ x1, y1 = right_ear.x, right_ear.y
346
+ d = max(abs(x0 - x1), abs(y0 - y1))
347
+ width = max(width, d * 1.5)
348
+
349
+ x, y = x0, y0
350
+
351
+ x -= width
352
+ y -= width
353
+
354
+ if x < 0:
355
+ x = 0
356
+
357
+ if y < 0:
358
+ y = 0
359
+
360
+ width1 = width * 2
361
+ width2 = width * 2
362
+
363
+ if x + width > image_width:
364
+ width1 = image_width - x
365
+
366
+ if y + width > image_height:
367
+ width2 = image_height - y
368
+
369
+ width = min(width1, width2)
370
+
371
+ if width >= 20:
372
+ return int(x), int(y), int(width)
373
+ else:
374
+ return None
375
+
376
+
377
+ # return the (row, column) index of the maximum value of a 2D array
378
+ def npmax(array):
379
+ arrayindex = array.argmax(1)
380
+ arrayvalue = array.max(1)
381
+ i = arrayvalue.argmax()
382
+ j = arrayindex[i]
383
+ return i, j
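Two of the helpers above are easy to sanity-check in isolation: padRightDownCorner only pads the bottom and right edges up to the next multiple of stride, and npmax returns the (row, column) index of a 2D array's maximum. A small sketch (the import path is an assumption about how the module resolves inside the WebUI):

import numpy as np
from annotator.openpose.util import padRightDownCorner, npmax  # assumed import path

img = np.zeros((100, 150, 3), dtype=np.uint8)
padded, pad = padRightDownCorner(img, stride=8, padValue=128)
print(padded.shape, pad)   # (104, 152, 3) [0, 0, 4, 2]

heat = np.zeros((46, 46), dtype=np.float32)
heat[12, 30] = 1.0
print(npmax(heat))         # (12, 30)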
extensions/microsoftexcel-controlnet/annotator/pidinet/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ It is just for research purpose, and commercial use should be contacted with authors first.
2
+
3
+ Copyright (c) 2021 Zhuo Su
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
extensions/microsoftexcel-controlnet/annotator/pidinet/__init__.py ADDED
@@ -0,0 +1,51 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from einops import rearrange
5
+ from annotator.pidinet.model import pidinet
6
+ from annotator.util import safe_step
7
+ from modules import devices
8
+ from annotator.annotator_path import models_path
9
+ from scripts.utils import load_state_dict
10
+
11
+ netNetwork = None
12
+ remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/table5_pidinet.pth"
13
+ modeldir = os.path.join(models_path, "pidinet")
14
+ old_modeldir = os.path.dirname(os.path.realpath(__file__))
15
+
16
+ def apply_pidinet(input_image, is_safe=False, apply_fliter=False):
17
+ global netNetwork
18
+ if netNetwork is None:
19
+ modelpath = os.path.join(modeldir, "table5_pidinet.pth")
20
+ old_modelpath = os.path.join(old_modeldir, "table5_pidinet.pth")
21
+ if os.path.exists(old_modelpath):
22
+ modelpath = old_modelpath
23
+ elif not os.path.exists(modelpath):
24
+ from basicsr.utils.download_util import load_file_from_url
25
+ load_file_from_url(remote_model_path, model_dir=modeldir)
26
+ netNetwork = pidinet()
27
+ ckp = load_state_dict(modelpath)
28
+ netNetwork.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()})
29
+
30
+ netNetwork = netNetwork.to(devices.get_device_for("controlnet"))
31
+ netNetwork.eval()
32
+ assert input_image.ndim == 3
33
+ input_image = input_image[:, :, ::-1].copy()
34
+ with torch.no_grad():
35
+ image_pidi = torch.from_numpy(input_image).float().to(devices.get_device_for("controlnet"))
36
+ image_pidi = image_pidi / 255.0
37
+ image_pidi = rearrange(image_pidi, 'h w c -> 1 c h w')
38
+ edge = netNetwork(image_pidi)[-1]
39
+ edge = edge.cpu().numpy()
40
+ if apply_fliter:
41
+ edge = edge > 0.5
42
+ if is_safe:
43
+ edge = safe_step(edge)
44
+ edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
45
+
46
+ return edge[0][0]
47
+
48
+ def unload_pid_model():
49
+ global netNetwork
50
+ if netNetwork is not None:
51
+ netNetwork.cpu()
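A usage sketch for apply_pidinet; this is an assumption-laden example, since the module only imports inside the WebUI process (it needs modules.devices and scripts.utils). The input is an H x W x 3 uint8 image and the result is a single-channel uint8 edge map with the same spatial size; note the keyword is spelled apply_fliter in this commit:

import numpy as np
from annotator.pidinet import apply_pidinet, unload_pid_model  # assumed import path inside the WebUI

img = np.zeros((512, 512, 3), dtype=np.uint8)   # placeholder input image
edge = apply_pidinet(img, is_safe=False, apply_fliter=False)
print(edge.shape, edge.dtype)                   # (512, 512) uint8
unload_pid_model()                              # moves the cached network back to the CPU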
extensions/microsoftexcel-controlnet/annotator/pidinet/model.py ADDED
@@ -0,0 +1,653 @@
1
+ """
2
+ Author: Zhuo Su, Wenzhe Liu
3
+ Date: Feb 18, 2021
4
+ """
5
+
6
+ import math
7
+
8
+ import cv2
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from basicsr.utils import img2tensor
14
+
15
+ nets = {
16
+ 'baseline': {
17
+ 'layer0': 'cv',
18
+ 'layer1': 'cv',
19
+ 'layer2': 'cv',
20
+ 'layer3': 'cv',
21
+ 'layer4': 'cv',
22
+ 'layer5': 'cv',
23
+ 'layer6': 'cv',
24
+ 'layer7': 'cv',
25
+ 'layer8': 'cv',
26
+ 'layer9': 'cv',
27
+ 'layer10': 'cv',
28
+ 'layer11': 'cv',
29
+ 'layer12': 'cv',
30
+ 'layer13': 'cv',
31
+ 'layer14': 'cv',
32
+ 'layer15': 'cv',
33
+ },
34
+ 'c-v15': {
35
+ 'layer0': 'cd',
36
+ 'layer1': 'cv',
37
+ 'layer2': 'cv',
38
+ 'layer3': 'cv',
39
+ 'layer4': 'cv',
40
+ 'layer5': 'cv',
41
+ 'layer6': 'cv',
42
+ 'layer7': 'cv',
43
+ 'layer8': 'cv',
44
+ 'layer9': 'cv',
45
+ 'layer10': 'cv',
46
+ 'layer11': 'cv',
47
+ 'layer12': 'cv',
48
+ 'layer13': 'cv',
49
+ 'layer14': 'cv',
50
+ 'layer15': 'cv',
51
+ },
52
+ 'a-v15': {
53
+ 'layer0': 'ad',
54
+ 'layer1': 'cv',
55
+ 'layer2': 'cv',
56
+ 'layer3': 'cv',
57
+ 'layer4': 'cv',
58
+ 'layer5': 'cv',
59
+ 'layer6': 'cv',
60
+ 'layer7': 'cv',
61
+ 'layer8': 'cv',
62
+ 'layer9': 'cv',
63
+ 'layer10': 'cv',
64
+ 'layer11': 'cv',
65
+ 'layer12': 'cv',
66
+ 'layer13': 'cv',
67
+ 'layer14': 'cv',
68
+ 'layer15': 'cv',
69
+ },
70
+ 'r-v15': {
71
+ 'layer0': 'rd',
72
+ 'layer1': 'cv',
73
+ 'layer2': 'cv',
74
+ 'layer3': 'cv',
75
+ 'layer4': 'cv',
76
+ 'layer5': 'cv',
77
+ 'layer6': 'cv',
78
+ 'layer7': 'cv',
79
+ 'layer8': 'cv',
80
+ 'layer9': 'cv',
81
+ 'layer10': 'cv',
82
+ 'layer11': 'cv',
83
+ 'layer12': 'cv',
84
+ 'layer13': 'cv',
85
+ 'layer14': 'cv',
86
+ 'layer15': 'cv',
87
+ },
88
+ 'cvvv4': {
89
+ 'layer0': 'cd',
90
+ 'layer1': 'cv',
91
+ 'layer2': 'cv',
92
+ 'layer3': 'cv',
93
+ 'layer4': 'cd',
94
+ 'layer5': 'cv',
95
+ 'layer6': 'cv',
96
+ 'layer7': 'cv',
97
+ 'layer8': 'cd',
98
+ 'layer9': 'cv',
99
+ 'layer10': 'cv',
100
+ 'layer11': 'cv',
101
+ 'layer12': 'cd',
102
+ 'layer13': 'cv',
103
+ 'layer14': 'cv',
104
+ 'layer15': 'cv',
105
+ },
106
+ 'avvv4': {
107
+ 'layer0': 'ad',
108
+ 'layer1': 'cv',
109
+ 'layer2': 'cv',
110
+ 'layer3': 'cv',
111
+ 'layer4': 'ad',
112
+ 'layer5': 'cv',
113
+ 'layer6': 'cv',
114
+ 'layer7': 'cv',
115
+ 'layer8': 'ad',
116
+ 'layer9': 'cv',
117
+ 'layer10': 'cv',
118
+ 'layer11': 'cv',
119
+ 'layer12': 'ad',
120
+ 'layer13': 'cv',
121
+ 'layer14': 'cv',
122
+ 'layer15': 'cv',
123
+ },
124
+ 'rvvv4': {
125
+ 'layer0': 'rd',
126
+ 'layer1': 'cv',
127
+ 'layer2': 'cv',
128
+ 'layer3': 'cv',
129
+ 'layer4': 'rd',
130
+ 'layer5': 'cv',
131
+ 'layer6': 'cv',
132
+ 'layer7': 'cv',
133
+ 'layer8': 'rd',
134
+ 'layer9': 'cv',
135
+ 'layer10': 'cv',
136
+ 'layer11': 'cv',
137
+ 'layer12': 'rd',
138
+ 'layer13': 'cv',
139
+ 'layer14': 'cv',
140
+ 'layer15': 'cv',
141
+ },
142
+ 'cccv4': {
143
+ 'layer0': 'cd',
144
+ 'layer1': 'cd',
145
+ 'layer2': 'cd',
146
+ 'layer3': 'cv',
147
+ 'layer4': 'cd',
148
+ 'layer5': 'cd',
149
+ 'layer6': 'cd',
150
+ 'layer7': 'cv',
151
+ 'layer8': 'cd',
152
+ 'layer9': 'cd',
153
+ 'layer10': 'cd',
154
+ 'layer11': 'cv',
155
+ 'layer12': 'cd',
156
+ 'layer13': 'cd',
157
+ 'layer14': 'cd',
158
+ 'layer15': 'cv',
159
+ },
160
+ 'aaav4': {
161
+ 'layer0': 'ad',
162
+ 'layer1': 'ad',
163
+ 'layer2': 'ad',
164
+ 'layer3': 'cv',
165
+ 'layer4': 'ad',
166
+ 'layer5': 'ad',
167
+ 'layer6': 'ad',
168
+ 'layer7': 'cv',
169
+ 'layer8': 'ad',
170
+ 'layer9': 'ad',
171
+ 'layer10': 'ad',
172
+ 'layer11': 'cv',
173
+ 'layer12': 'ad',
174
+ 'layer13': 'ad',
175
+ 'layer14': 'ad',
176
+ 'layer15': 'cv',
177
+ },
178
+ 'rrrv4': {
179
+ 'layer0': 'rd',
180
+ 'layer1': 'rd',
181
+ 'layer2': 'rd',
182
+ 'layer3': 'cv',
183
+ 'layer4': 'rd',
184
+ 'layer5': 'rd',
185
+ 'layer6': 'rd',
186
+ 'layer7': 'cv',
187
+ 'layer8': 'rd',
188
+ 'layer9': 'rd',
189
+ 'layer10': 'rd',
190
+ 'layer11': 'cv',
191
+ 'layer12': 'rd',
192
+ 'layer13': 'rd',
193
+ 'layer14': 'rd',
194
+ 'layer15': 'cv',
195
+ },
196
+ 'c16': {
197
+ 'layer0': 'cd',
198
+ 'layer1': 'cd',
199
+ 'layer2': 'cd',
200
+ 'layer3': 'cd',
201
+ 'layer4': 'cd',
202
+ 'layer5': 'cd',
203
+ 'layer6': 'cd',
204
+ 'layer7': 'cd',
205
+ 'layer8': 'cd',
206
+ 'layer9': 'cd',
207
+ 'layer10': 'cd',
208
+ 'layer11': 'cd',
209
+ 'layer12': 'cd',
210
+ 'layer13': 'cd',
211
+ 'layer14': 'cd',
212
+ 'layer15': 'cd',
213
+ },
214
+ 'a16': {
215
+ 'layer0': 'ad',
216
+ 'layer1': 'ad',
217
+ 'layer2': 'ad',
218
+ 'layer3': 'ad',
219
+ 'layer4': 'ad',
220
+ 'layer5': 'ad',
221
+ 'layer6': 'ad',
222
+ 'layer7': 'ad',
223
+ 'layer8': 'ad',
224
+ 'layer9': 'ad',
225
+ 'layer10': 'ad',
226
+ 'layer11': 'ad',
227
+ 'layer12': 'ad',
228
+ 'layer13': 'ad',
229
+ 'layer14': 'ad',
230
+ 'layer15': 'ad',
231
+ },
232
+ 'r16': {
233
+ 'layer0': 'rd',
234
+ 'layer1': 'rd',
235
+ 'layer2': 'rd',
236
+ 'layer3': 'rd',
237
+ 'layer4': 'rd',
238
+ 'layer5': 'rd',
239
+ 'layer6': 'rd',
240
+ 'layer7': 'rd',
241
+ 'layer8': 'rd',
242
+ 'layer9': 'rd',
243
+ 'layer10': 'rd',
244
+ 'layer11': 'rd',
245
+ 'layer12': 'rd',
246
+ 'layer13': 'rd',
247
+ 'layer14': 'rd',
248
+ 'layer15': 'rd',
249
+ },
250
+ 'carv4': {
251
+ 'layer0': 'cd',
252
+ 'layer1': 'ad',
253
+ 'layer2': 'rd',
254
+ 'layer3': 'cv',
255
+ 'layer4': 'cd',
256
+ 'layer5': 'ad',
257
+ 'layer6': 'rd',
258
+ 'layer7': 'cv',
259
+ 'layer8': 'cd',
260
+ 'layer9': 'ad',
261
+ 'layer10': 'rd',
262
+ 'layer11': 'cv',
263
+ 'layer12': 'cd',
264
+ 'layer13': 'ad',
265
+ 'layer14': 'rd',
266
+ 'layer15': 'cv',
267
+ },
268
+ }
269
+
270
+ def createConvFunc(op_type):
271
+ assert op_type in ['cv', 'cd', 'ad', 'rd'], 'unknown op type: %s' % str(op_type)
272
+ if op_type == 'cv':
273
+ return F.conv2d
274
+
275
+ if op_type == 'cd':
276
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
277
+ assert dilation in [1, 2], 'dilation for cd_conv should be in 1 or 2'
278
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for cd_conv should be 3x3'
279
+ assert padding == dilation, 'padding for cd_conv set wrong'
280
+
281
+ weights_c = weights.sum(dim=[2, 3], keepdim=True)
282
+ yc = F.conv2d(x, weights_c, stride=stride, padding=0, groups=groups)
283
+ y = F.conv2d(x, weights, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
284
+ return y - yc
285
+ return func
286
+ elif op_type == 'ad':
287
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
288
+ assert dilation in [1, 2], 'dilation for ad_conv should be in 1 or 2'
289
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for ad_conv should be 3x3'
290
+ assert padding == dilation, 'padding for ad_conv set wrong'
291
+
292
+ shape = weights.shape
293
+ weights = weights.view(shape[0], shape[1], -1)
294
+ weights_conv = (weights - weights[:, :, [3, 0, 1, 6, 4, 2, 7, 8, 5]]).view(shape) # clock-wise
295
+ y = F.conv2d(x, weights_conv, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
296
+ return y
297
+ return func
298
+ elif op_type == 'rd':
299
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
300
+ assert dilation in [1, 2], 'dilation for rd_conv should be in 1 or 2'
301
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for rd_conv should be 3x3'
302
+ padding = 2 * dilation
303
+
304
+ shape = weights.shape
305
+ if weights.is_cuda:
306
+ buffer = torch.cuda.FloatTensor(shape[0], shape[1], 5 * 5).fill_(0)
307
+ else:
308
+ buffer = torch.zeros(shape[0], shape[1], 5 * 5)
309
+ weights = weights.view(shape[0], shape[1], -1)
310
+ buffer[:, :, [0, 2, 4, 10, 14, 20, 22, 24]] = weights[:, :, 1:]
311
+ buffer[:, :, [6, 7, 8, 11, 13, 16, 17, 18]] = -weights[:, :, 1:]
312
+ buffer[:, :, 12] = 0
313
+ buffer = buffer.view(shape[0], shape[1], 5, 5)
314
+ y = F.conv2d(x, buffer, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
315
+ return y
316
+ return func
317
+ else:
318
+ print('impossible to be here unless you force that')
319
+ return None
320
+
321
+ class Conv2d(nn.Module):
322
+ def __init__(self, pdc, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False):
323
+ super(Conv2d, self).__init__()
324
+ if in_channels % groups != 0:
325
+ raise ValueError('in_channels must be divisible by groups')
326
+ if out_channels % groups != 0:
327
+ raise ValueError('out_channels must be divisible by groups')
328
+ self.in_channels = in_channels
329
+ self.out_channels = out_channels
330
+ self.kernel_size = kernel_size
331
+ self.stride = stride
332
+ self.padding = padding
333
+ self.dilation = dilation
334
+ self.groups = groups
335
+ self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size))
336
+ if bias:
337
+ self.bias = nn.Parameter(torch.Tensor(out_channels))
338
+ else:
339
+ self.register_parameter('bias', None)
340
+ self.reset_parameters()
341
+ self.pdc = pdc
342
+
343
+ def reset_parameters(self):
344
+ nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
345
+ if self.bias is not None:
346
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
347
+ bound = 1 / math.sqrt(fan_in)
348
+ nn.init.uniform_(self.bias, -bound, bound)
349
+
350
+ def forward(self, input):
351
+
352
+ return self.pdc(input, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
353
+
354
+ class CSAM(nn.Module):
355
+ """
356
+ Compact Spatial Attention Module
357
+ """
358
+ def __init__(self, channels):
359
+ super(CSAM, self).__init__()
360
+
361
+ mid_channels = 4
362
+ self.relu1 = nn.ReLU()
363
+ self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, padding=0)
364
+ self.conv2 = nn.Conv2d(mid_channels, 1, kernel_size=3, padding=1, bias=False)
365
+ self.sigmoid = nn.Sigmoid()
366
+ nn.init.constant_(self.conv1.bias, 0)
367
+
368
+ def forward(self, x):
369
+ y = self.relu1(x)
370
+ y = self.conv1(y)
371
+ y = self.conv2(y)
372
+ y = self.sigmoid(y)
373
+
374
+ return x * y
375
+
376
+ class CDCM(nn.Module):
377
+ """
378
+ Compact Dilation Convolution based Module
379
+ """
380
+ def __init__(self, in_channels, out_channels):
381
+ super(CDCM, self).__init__()
382
+
383
+ self.relu1 = nn.ReLU()
384
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)
385
+ self.conv2_1 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=5, padding=5, bias=False)
386
+ self.conv2_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=7, padding=7, bias=False)
387
+ self.conv2_3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=9, padding=9, bias=False)
388
+ self.conv2_4 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=11, padding=11, bias=False)
389
+ nn.init.constant_(self.conv1.bias, 0)
390
+
391
+ def forward(self, x):
392
+ x = self.relu1(x)
393
+ x = self.conv1(x)
394
+ x1 = self.conv2_1(x)
395
+ x2 = self.conv2_2(x)
396
+ x3 = self.conv2_3(x)
397
+ x4 = self.conv2_4(x)
398
+ return x1 + x2 + x3 + x4
399
+
400
+
401
+ class MapReduce(nn.Module):
402
+ """
403
+ Reduce feature maps into a single edge map
404
+ """
405
+ def __init__(self, channels):
406
+ super(MapReduce, self).__init__()
407
+ self.conv = nn.Conv2d(channels, 1, kernel_size=1, padding=0)
408
+ nn.init.constant_(self.conv.bias, 0)
409
+
410
+ def forward(self, x):
411
+ return self.conv(x)
412
+
413
+
414
+ class PDCBlock(nn.Module):
415
+ def __init__(self, pdc, inplane, ouplane, stride=1):
416
+ super(PDCBlock, self).__init__()
417
+ self.stride=stride
418
+
419
+ self.stride=stride
420
+ if self.stride > 1:
421
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
422
+ self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
423
+ self.conv1 = Conv2d(pdc, inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
424
+ self.relu2 = nn.ReLU()
425
+ self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)
426
+
427
+ def forward(self, x):
428
+ if self.stride > 1:
429
+ x = self.pool(x)
430
+ y = self.conv1(x)
431
+ y = self.relu2(y)
432
+ y = self.conv2(y)
433
+ if self.stride > 1:
434
+ x = self.shortcut(x)
435
+ y = y + x
436
+ return y
437
+
438
+ class PDCBlock_converted(nn.Module):
439
+ """
440
+ CPDC, APDC can be converted to vanilla 3x3 convolution
441
+ RPDC can be converted to vanilla 5x5 convolution
442
+ """
443
+ def __init__(self, pdc, inplane, ouplane, stride=1):
444
+ super(PDCBlock_converted, self).__init__()
445
+ self.stride=stride
446
+
447
+ if self.stride > 1:
448
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
449
+ self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
450
+ if pdc == 'rd':
451
+ self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=5, padding=2, groups=inplane, bias=False)
452
+ else:
453
+ self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
454
+ self.relu2 = nn.ReLU()
455
+ self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)
456
+
457
+ def forward(self, x):
458
+ if self.stride > 1:
459
+ x = self.pool(x)
460
+ y = self.conv1(x)
461
+ y = self.relu2(y)
462
+ y = self.conv2(y)
463
+ if self.stride > 1:
464
+ x = self.shortcut(x)
465
+ y = y + x
466
+ return y
467
+
468
+ class PiDiNet(nn.Module):
469
+ def __init__(self, inplane, pdcs, dil=None, sa=False, convert=False):
470
+ super(PiDiNet, self).__init__()
471
+ self.sa = sa
472
+ if dil is not None:
473
+ assert isinstance(dil, int), 'dil should be an int'
474
+ self.dil = dil
475
+
476
+ self.fuseplanes = []
477
+
478
+ self.inplane = inplane
479
+ if convert:
480
+ if pdcs[0] == 'rd':
481
+ init_kernel_size = 5
482
+ init_padding = 2
483
+ else:
484
+ init_kernel_size = 3
485
+ init_padding = 1
486
+ self.init_block = nn.Conv2d(3, self.inplane,
487
+ kernel_size=init_kernel_size, padding=init_padding, bias=False)
488
+ block_class = PDCBlock_converted
489
+ else:
490
+ self.init_block = Conv2d(pdcs[0], 3, self.inplane, kernel_size=3, padding=1)
491
+ block_class = PDCBlock
492
+
493
+ self.block1_1 = block_class(pdcs[1], self.inplane, self.inplane)
494
+ self.block1_2 = block_class(pdcs[2], self.inplane, self.inplane)
495
+ self.block1_3 = block_class(pdcs[3], self.inplane, self.inplane)
496
+ self.fuseplanes.append(self.inplane) # C
497
+
498
+ inplane = self.inplane
499
+ self.inplane = self.inplane * 2
500
+ self.block2_1 = block_class(pdcs[4], inplane, self.inplane, stride=2)
501
+ self.block2_2 = block_class(pdcs[5], self.inplane, self.inplane)
502
+ self.block2_3 = block_class(pdcs[6], self.inplane, self.inplane)
503
+ self.block2_4 = block_class(pdcs[7], self.inplane, self.inplane)
504
+ self.fuseplanes.append(self.inplane) # 2C
505
+
506
+ inplane = self.inplane
507
+ self.inplane = self.inplane * 2
508
+ self.block3_1 = block_class(pdcs[8], inplane, self.inplane, stride=2)
509
+ self.block3_2 = block_class(pdcs[9], self.inplane, self.inplane)
510
+ self.block3_3 = block_class(pdcs[10], self.inplane, self.inplane)
511
+ self.block3_4 = block_class(pdcs[11], self.inplane, self.inplane)
512
+ self.fuseplanes.append(self.inplane) # 4C
513
+
514
+ self.block4_1 = block_class(pdcs[12], self.inplane, self.inplane, stride=2)
515
+ self.block4_2 = block_class(pdcs[13], self.inplane, self.inplane)
516
+ self.block4_3 = block_class(pdcs[14], self.inplane, self.inplane)
517
+ self.block4_4 = block_class(pdcs[15], self.inplane, self.inplane)
518
+ self.fuseplanes.append(self.inplane) # 4C
519
+
520
+ self.conv_reduces = nn.ModuleList()
521
+ if self.sa and self.dil is not None:
522
+ self.attentions = nn.ModuleList()
523
+ self.dilations = nn.ModuleList()
524
+ for i in range(4):
525
+ self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
526
+ self.attentions.append(CSAM(self.dil))
527
+ self.conv_reduces.append(MapReduce(self.dil))
528
+ elif self.sa:
529
+ self.attentions = nn.ModuleList()
530
+ for i in range(4):
531
+ self.attentions.append(CSAM(self.fuseplanes[i]))
532
+ self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
533
+ elif self.dil is not None:
534
+ self.dilations = nn.ModuleList()
535
+ for i in range(4):
536
+ self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
537
+ self.conv_reduces.append(MapReduce(self.dil))
538
+ else:
539
+ for i in range(4):
540
+ self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
541
+
542
+ self.classifier = nn.Conv2d(4, 1, kernel_size=1) # has bias
543
+ nn.init.constant_(self.classifier.weight, 0.25)
544
+ nn.init.constant_(self.classifier.bias, 0)
545
+
546
+ # print('initialization done')
547
+
548
+ def get_weights(self):
549
+ conv_weights = []
550
+ bn_weights = []
551
+ relu_weights = []
552
+ for pname, p in self.named_parameters():
553
+ if 'bn' in pname:
554
+ bn_weights.append(p)
555
+ elif 'relu' in pname:
556
+ relu_weights.append(p)
557
+ else:
558
+ conv_weights.append(p)
559
+
560
+ return conv_weights, bn_weights, relu_weights
561
+
562
+ def forward(self, x):
563
+ H, W = x.size()[2:]
564
+
565
+ x = self.init_block(x)
566
+
567
+ x1 = self.block1_1(x)
568
+ x1 = self.block1_2(x1)
569
+ x1 = self.block1_3(x1)
570
+
571
+ x2 = self.block2_1(x1)
572
+ x2 = self.block2_2(x2)
573
+ x2 = self.block2_3(x2)
574
+ x2 = self.block2_4(x2)
575
+
576
+ x3 = self.block3_1(x2)
577
+ x3 = self.block3_2(x3)
578
+ x3 = self.block3_3(x3)
579
+ x3 = self.block3_4(x3)
580
+
581
+ x4 = self.block4_1(x3)
582
+ x4 = self.block4_2(x4)
583
+ x4 = self.block4_3(x4)
584
+ x4 = self.block4_4(x4)
585
+
586
+ x_fuses = []
587
+ if self.sa and self.dil is not None:
588
+ for i, xi in enumerate([x1, x2, x3, x4]):
589
+ x_fuses.append(self.attentions[i](self.dilations[i](xi)))
590
+ elif self.sa:
591
+ for i, xi in enumerate([x1, x2, x3, x4]):
592
+ x_fuses.append(self.attentions[i](xi))
593
+ elif self.dil is not None:
594
+ for i, xi in enumerate([x1, x2, x3, x4]):
595
+ x_fuses.append(self.dilations[i](xi))
596
+ else:
597
+ x_fuses = [x1, x2, x3, x4]
598
+
599
+ e1 = self.conv_reduces[0](x_fuses[0])
600
+ e1 = F.interpolate(e1, (H, W), mode="bilinear", align_corners=False)
601
+
602
+ e2 = self.conv_reduces[1](x_fuses[1])
603
+ e2 = F.interpolate(e2, (H, W), mode="bilinear", align_corners=False)
604
+
605
+ e3 = self.conv_reduces[2](x_fuses[2])
606
+ e3 = F.interpolate(e3, (H, W), mode="bilinear", align_corners=False)
607
+
608
+ e4 = self.conv_reduces[3](x_fuses[3])
609
+ e4 = F.interpolate(e4, (H, W), mode="bilinear", align_corners=False)
610
+
611
+ outputs = [e1, e2, e3, e4]
612
+
613
+ output = self.classifier(torch.cat(outputs, dim=1))
614
+ #if not self.training:
615
+ # return torch.sigmoid(output)
616
+
617
+ outputs.append(output)
618
+ outputs = [torch.sigmoid(r) for r in outputs]
619
+ return outputs
620
+
621
+ def config_model(model):
622
+ model_options = list(nets.keys())
623
+ assert model in model_options, \
624
+ 'unrecognized model, please choose from %s' % str(model_options)
625
+
626
+ # print(str(nets[model]))
627
+
628
+ pdcs = []
629
+ for i in range(16):
630
+ layer_name = 'layer%d' % i
631
+ op = nets[model][layer_name]
632
+ pdcs.append(createConvFunc(op))
633
+
634
+ return pdcs
635
+
636
+ def pidinet():
637
+ pdcs = config_model('carv4')
638
+ dil = 24 #if args.dil else None
639
+ return PiDiNet(60, pdcs, dil=dil, sa=True)
640
+
641
+
642
+ if __name__ == '__main__':
643
+ model = pidinet()
644
+ ckp = torch.load('table5_pidinet.pth')['state_dict']
645
+ model.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()})
646
+ im = cv2.imread('examples/test_my/cat_v4.png')
647
+ im = img2tensor(im).unsqueeze(0)/255.
648
+ res = model(im)[-1]
649
+ res = res>0.5
650
+ res = res.float()
651
+ res = (res[0,0].cpu().data.numpy()*255.).astype(np.uint8)
652
+ print(res.shape)
653
+ cv2.imwrite('edge.png', res)
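For clarity on the factory above: pidinet() builds the 'carv4' pixel-difference configuration with 60 base channels, CDCM dilation modules (dil=24) and CSAM attention, and the forward pass returns five sigmoid maps, the four per-stage edge maps plus the fused map that the annotator consumes as [-1]. A minimal sketch with randomly initialized weights (any input size works, since every stage is resized back to the input resolution):

import torch
from annotator.pidinet.model import pidinet  # assumed import path

net = pidinet().eval()
with torch.no_grad():
    maps = net(torch.zeros(1, 3, 256, 256))
print(len(maps), maps[-1].shape)   # 5 torch.Size([1, 1, 256, 256])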
extensions/microsoftexcel-controlnet/annotator/shuffle/__init__.py ADDED
@@ -0,0 +1,74 @@
1
+ import random
2
+
3
+ import cv2
4
+ import numpy as np
5
+ from annotator.util import make_noise_disk, img2mask
6
+
7
+
8
+ class ContentShuffleDetector:
9
+ def __call__(self, img, h=None, w=None, f=None):
10
+ H, W, C = img.shape
11
+ if h is None:
12
+ h = H
13
+ if w is None:
14
+ w = W
15
+ if f is None:
16
+ f = 256
17
+ x = make_noise_disk(h, w, 1, f) * float(W - 1)
18
+ y = make_noise_disk(h, w, 1, f) * float(H - 1)
19
+ flow = np.concatenate([x, y], axis=2).astype(np.float32)
20
+ return cv2.remap(img, flow, None, cv2.INTER_LINEAR)
21
+
22
+
23
+ class ColorShuffleDetector:
24
+ def __call__(self, img):
25
+ H, W, C = img.shape
26
+ F = np.random.randint(64, 384)
27
+ A = make_noise_disk(H, W, 3, F)
28
+ B = make_noise_disk(H, W, 3, F)
29
+ C = (A + B) / 2.0
30
+ A = (C + (A - C) * 3.0).clip(0, 1)
31
+ B = (C + (B - C) * 3.0).clip(0, 1)
32
+ L = img.astype(np.float32) / 255.0
33
+ Y = A * L + B * (1 - L)
34
+ Y -= np.min(Y, axis=(0, 1), keepdims=True)
35
+ Y /= np.maximum(np.max(Y, axis=(0, 1), keepdims=True), 1e-5)
36
+ Y *= 255.0
37
+ return Y.clip(0, 255).astype(np.uint8)
38
+
39
+
40
+ class GrayDetector:
41
+ def __call__(self, img):
42
+ eps = 1e-5
43
+ X = img.astype(np.float32)
44
+ r, g, b = X[:, :, 0], X[:, :, 1], X[:, :, 2]
45
+ kr, kg, kb = [random.random() + eps for _ in range(3)]
46
+ ks = kr + kg + kb
47
+ kr /= ks
48
+ kg /= ks
49
+ kb /= ks
50
+ Y = r * kr + g * kg + b * kb
51
+ Y = np.stack([Y] * 3, axis=2)
52
+ return Y.clip(0, 255).astype(np.uint8)
53
+
54
+
55
+ class DownSampleDetector:
56
+ def __call__(self, img, level=3, k=16.0):
57
+ h = img.astype(np.float32)
58
+ for _ in range(level):
59
+ h += np.random.normal(loc=0.0, scale=k, size=h.shape)
60
+ h = cv2.pyrDown(h)
61
+ for _ in range(level):
62
+ h = cv2.pyrUp(h)
63
+ h += np.random.normal(loc=0.0, scale=k, size=h.shape)
64
+ return h.clip(0, 255).astype(np.uint8)
65
+
66
+
67
+ class Image2MaskShuffleDetector:
68
+ def __init__(self, resolution=(640, 512)):
69
+ self.H, self.W = resolution
70
+
71
+ def __call__(self, img):
72
+ m = img2mask(img, self.H, self.W)
73
+ m *= 255.0
74
+ return m.clip(0, 255).astype(np.uint8)
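A usage sketch for the detectors above (assuming annotator.util provides make_noise_disk and img2mask, as imported at the top of the file); each detector maps an H x W x 3 uint8 image to an image of the same dtype:

import numpy as np
from annotator.shuffle import ContentShuffleDetector, ColorShuffleDetector  # assumed import path

img = np.random.randint(0, 256, size=(512, 512, 3), dtype=np.uint8)
shuffled = ContentShuffleDetector()(img, f=256)   # f sets the noise-disk frequency
recolored = ColorShuffleDetector()(img)
print(shuffled.shape, recolored.shape)            # (512, 512, 3) (512, 512, 3)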
extensions/microsoftexcel-controlnet/annotator/uniformer/LICENSE ADDED
@@ -0,0 +1,203 @@
1
+ Copyright 2022 SenseTime X-Lab. All rights reserved.
2
+
3
+ Apache License
4
+ Version 2.0, January 2004
5
+ http://www.apache.org/licenses/
6
+
7
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8
+
9
+ 1. Definitions.
10
+
11
+ "License" shall mean the terms and conditions for use, reproduction,
12
+ and distribution as defined by Sections 1 through 9 of this document.
13
+
14
+ "Licensor" shall mean the copyright owner or entity authorized by
15
+ the copyright owner that is granting the License.
16
+
17
+ "Legal Entity" shall mean the union of the acting entity and all
18
+ other entities that control, are controlled by, or are under common
19
+ control with that entity. For the purposes of this definition,
20
+ "control" means (i) the power, direct or indirect, to cause the
21
+ direction or management of such entity, whether by contract or
22
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
23
+ outstanding shares, or (iii) beneficial ownership of such entity.
24
+
25
+ "You" (or "Your") shall mean an individual or Legal Entity
26
+ exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications,
29
+ including but not limited to software source code, documentation
30
+ source, and configuration files.
31
+
32
+ "Object" form shall mean any form resulting from mechanical
33
+ transformation or translation of a Source form, including but
34
+ not limited to compiled object code, generated documentation,
35
+ and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or
38
+ Object form, made available under the License, as indicated by a
39
+ copyright notice that is included in or attached to the work
40
+ (an example is provided in the Appendix below).
41
+
42
+ "Derivative Works" shall mean any work, whether in Source or Object
43
+ form, that is based on (or derived from) the Work and for which the
44
+ editorial revisions, annotations, elaborations, or other modifications
45
+ represent, as a whole, an original work of authorship. For the purposes
46
+ of this License, Derivative Works shall not include works that remain
47
+ separable from, or merely link (or bind by name) to the interfaces of,
48
+ the Work and Derivative Works thereof.
49
+
50
+ "Contribution" shall mean any work of authorship, including
51
+ the original version of the Work and any modifications or additions
52
+ to that Work or Derivative Works thereof, that is intentionally
53
+ submitted to Licensor for inclusion in the Work by the copyright owner
54
+ or by an individual or Legal Entity authorized to submit on behalf of
55
+ the copyright owner. For the purposes of this definition, "submitted"
56
+ means any form of electronic, verbal, or written communication sent
57
+ to the Licensor or its representatives, including but not limited to
58
+ communication on electronic mailing lists, source code control systems,
59
+ and issue tracking systems that are managed by, or on behalf of, the
60
+ Licensor for the purpose of discussing and improving the Work, but
61
+ excluding communication that is conspicuously marked or otherwise
62
+ designated in writing by the copyright owner as "Not a Contribution."
63
+
64
+ "Contributor" shall mean Licensor and any individual or Legal Entity
65
+ on behalf of whom a Contribution has been received by Licensor and
66
+ subsequently incorporated within the Work.
67
+
68
+ 2. Grant of Copyright License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ copyright license to reproduce, prepare Derivative Works of,
72
+ publicly display, publicly perform, sublicense, and distribute the
73
+ Work and such Derivative Works in Source or Object form.
74
+
75
+ 3. Grant of Patent License. Subject to the terms and conditions of
76
+ this License, each Contributor hereby grants to You a perpetual,
77
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78
+ (except as stated in this section) patent license to make, have made,
79
+ use, offer to sell, sell, import, and otherwise transfer the Work,
80
+ where such license applies only to those patent claims licensable
81
+ by such Contributor that are necessarily infringed by their
82
+ Contribution(s) alone or by combination of their Contribution(s)
83
+ with the Work to which such Contribution(s) was submitted. If You
84
+ institute patent litigation against any entity (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
86
+ or a Contribution incorporated within the Work constitutes direct
87
+ or contributory patent infringement, then any patent licenses
88
+ granted to You under this License for that Work shall terminate
89
+ as of the date such litigation is filed.
90
+
91
+ 4. Redistribution. You may reproduce and distribute copies of the
92
+ Work or Derivative Works thereof in any medium, with or without
93
+ modifications, and in Source or Object form, provided that You
94
+ meet the following conditions:
95
+
96
+ (a) You must give any other recipients of the Work or
97
+ Derivative Works a copy of this License; and
98
+
99
+ (b) You must cause any modified files to carry prominent notices
100
+ stating that You changed the files; and
101
+
102
+ (c) You must retain, in the Source form of any Derivative Works
103
+ that You distribute, all copyright, patent, trademark, and
104
+ attribution notices from the Source form of the Work,
105
+ excluding those notices that do not pertain to any part of
106
+ the Derivative Works; and
107
+
108
+ (d) If the Work includes a "NOTICE" text file as part of its
109
+ distribution, then any Derivative Works that You distribute must
110
+ include a readable copy of the attribution notices contained
111
+ within such NOTICE file, excluding those notices that do not
112
+ pertain to any part of the Derivative Works, in at least one
113
+ of the following places: within a NOTICE text file distributed
114
+ as part of the Derivative Works; within the Source form or
115
+ documentation, if provided along with the Derivative Works; or,
116
+ within a display generated by the Derivative Works, if and
117
+ wherever such third-party notices normally appear. The contents
118
+ of the NOTICE file are for informational purposes only and
119
+ do not modify the License. You may add Your own attribution
120
+ notices within Derivative Works that You distribute, alongside
121
+ or as an addendum to the NOTICE text from the Work, provided
122
+ that such additional attribution notices cannot be construed
123
+ as modifying the License.
124
+
125
+ You may add Your own copyright statement to Your modifications and
126
+ may provide additional or different license terms and conditions
127
+ for use, reproduction, or distribution of Your modifications, or
128
+ for any such Derivative Works as a whole, provided Your use,
129
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright 2022 SenseTime X-Lab.
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
extensions/microsoftexcel-controlnet/annotator/uniformer/__init__.py ADDED
@@ -0,0 +1,56 @@
1
+ import os
2
+ from annotator.annotator_path import models_path
3
+ from modules import devices
4
+ from annotator.uniformer.inference import init_segmentor, inference_segmentor, show_result_pyplot
5
+
6
+ try:
7
+ from mmseg.core.evaluation import get_palette
8
+ except ImportError:
9
+ from annotator.mmpkg.mmseg.core.evaluation import get_palette
10
+
11
+ modeldir = os.path.join(models_path, "uniformer")
12
+ checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth"
13
+ config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "upernet_global_small.py")
14
+ old_modeldir = os.path.dirname(os.path.realpath(__file__))
15
+ model = None
16
+
17
+ def unload_uniformer_model():
18
+ global model
19
+ if model is not None:
20
+ model = model.cpu()
21
+
22
+ def apply_uniformer(img):
23
+ global model
24
+ if model is None:
25
+ modelpath = os.path.join(modeldir, "upernet_global_small.pth")
26
+ old_modelpath = os.path.join(old_modeldir, "upernet_global_small.pth")
27
+ if os.path.exists(old_modelpath):
28
+ modelpath = old_modelpath
29
+ elif not os.path.exists(modelpath):
30
+ from basicsr.utils.download_util import load_file_from_url
31
+ load_file_from_url(checkpoint_file, model_dir=modeldir)
32
+
33
+ model = init_segmentor(config_file, modelpath, device=devices.get_device_for("controlnet"))
34
+ model = model.to(devices.get_device_for("controlnet"))
35
+
36
+ if devices.get_device_for("controlnet").type == 'mps':
37
+ # adaptive_avg_pool2d can fail on MPS, workaround with CPU
38
+ import torch.nn.functional
39
+
40
+ orig_adaptive_avg_pool2d = torch.nn.functional.adaptive_avg_pool2d
41
+ def cpu_if_exception(input, *args, **kwargs):
42
+ try:
43
+ return orig_adaptive_avg_pool2d(input, *args, **kwargs)
44
+ except Exception:
45
+ return orig_adaptive_avg_pool2d(input.cpu(), *args, **kwargs).to(input.device)
46
+
47
+ try:
48
+ torch.nn.functional.adaptive_avg_pool2d = cpu_if_exception
49
+ result = inference_segmentor(model, img)
50
+ finally:
51
+ torch.nn.functional.adaptive_avg_pool2d = orig_adaptive_avg_pool2d
52
+ else:
53
+ result = inference_segmentor(model, img)
54
+
55
+ res_img = show_result_pyplot(model, img, result, get_palette('ade'), opacity=1)
56
+ return res_img
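Note: a minimal sketch of how the annotator defined above is typically invoked from the extension side. The input image and the import path are assumptions for illustration; only apply_uniformer and unload_uniformer_model come from this file.

# Hypothetical usage of the uniformer annotator (not part of this commit).
import numpy as np
from annotator.uniformer import apply_uniformer, unload_uniformer_model

img = np.zeros((512, 512, 3), dtype=np.uint8)  # placeholder HxWx3 uint8 image
seg = apply_uniformer(img)                     # colorized ADE20K segmentation, same size as img
unload_uniformer_model()                       # move the cached model back to CPU to free VRAM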
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/ade20k.py ADDED
@@ -0,0 +1,54 @@
1
+ # dataset settings
2
+ dataset_type = 'ADE20KDataset'
3
+ data_root = 'data/ade/ADEChallengeData2016'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2048, 512),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='images/training',
41
+ ann_dir='annotations/training',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='images/validation',
47
+ ann_dir='annotations/validation',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='images/validation',
53
+ ann_dir='annotations/validation',
54
+ pipeline=test_pipeline))
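Note: these _base_ dataset files are MMSegmentation-style Python configs; they are merged by the mmcv config loader rather than imported directly. A minimal sketch of how a fragment like ade20k.py would normally be consumed (mmcv/mmseg availability and the path are assumptions):

# Illustrative only: build the ADE20K training set from the config fragment above.
from mmcv import Config
from mmseg.datasets import build_dataset

cfg = Config.fromfile('configs/_base_/datasets/ade20k.py')  # hypothetical path
train_set = build_dataset(cfg.data.train)  # each sample passes through train_pipeline
print(len(train_set), cfg.crop_size)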
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/chase_db1.py ADDED
@@ -0,0 +1,59 @@
1
+ # dataset settings
2
+ dataset_type = 'ChaseDB1Dataset'
3
+ data_root = 'data/CHASE_DB1'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ img_scale = (960, 999)
7
+ crop_size = (128, 128)
8
+ train_pipeline = [
9
+ dict(type='LoadImageFromFile'),
10
+ dict(type='LoadAnnotations'),
11
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
12
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
13
+ dict(type='RandomFlip', prob=0.5),
14
+ dict(type='PhotoMetricDistortion'),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=img_scale,
25
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
26
+ flip=False,
27
+ transforms=[
28
+ dict(type='Resize', keep_ratio=True),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img'])
33
+ ])
34
+ ]
35
+
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type='RepeatDataset',
41
+ times=40000,
42
+ dataset=dict(
43
+ type=dataset_type,
44
+ data_root=data_root,
45
+ img_dir='images/training',
46
+ ann_dir='annotations/training',
47
+ pipeline=train_pipeline)),
48
+ val=dict(
49
+ type=dataset_type,
50
+ data_root=data_root,
51
+ img_dir='images/validation',
52
+ ann_dir='annotations/validation',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='images/validation',
58
+ ann_dir='annotations/validation',
59
+ pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes.py ADDED
@@ -0,0 +1,54 @@
1
+ # dataset settings
2
+ dataset_type = 'CityscapesDataset'
3
+ data_root = 'data/cityscapes/'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 1024)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations'),
10
+ dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2048, 1024),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=2,
36
+ workers_per_gpu=2,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='leftImg8bit/train',
41
+ ann_dir='gtFine/train',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='leftImg8bit/val',
47
+ ann_dir='gtFine/val',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='leftImg8bit/val',
53
+ ann_dir='gtFine/val',
54
+ pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes_769x769.py ADDED
@@ -0,0 +1,35 @@
1
+ _base_ = './cityscapes.py'
2
+ img_norm_cfg = dict(
3
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
4
+ crop_size = (769, 769)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(type='LoadAnnotations'),
8
+ dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
9
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
10
+ dict(type='RandomFlip', prob=0.5),
11
+ dict(type='PhotoMetricDistortion'),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
14
+ dict(type='DefaultFormatBundle'),
15
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
16
+ ]
17
+ test_pipeline = [
18
+ dict(type='LoadImageFromFile'),
19
+ dict(
20
+ type='MultiScaleFlipAug',
21
+ img_scale=(2049, 1025),
22
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
23
+ flip=False,
24
+ transforms=[
25
+ dict(type='Resize', keep_ratio=True),
26
+ dict(type='RandomFlip'),
27
+ dict(type='Normalize', **img_norm_cfg),
28
+ dict(type='ImageToTensor', keys=['img']),
29
+ dict(type='Collect', keys=['img']),
30
+ ])
31
+ ]
32
+ data = dict(
33
+ train=dict(pipeline=train_pipeline),
34
+ val=dict(pipeline=test_pipeline),
35
+ test=dict(pipeline=test_pipeline))
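Note: this file restates only what it overrides (crop size, image scales, and the two pipelines); everything else, such as data_root and samples_per_gpu, is inherited from './cityscapes.py' through the _base_ mechanism. A sketch of how the merge resolves (the path is illustrative):

# Illustrative only: _base_ inheritance resolved by the mmcv config loader.
from mmcv import Config

cfg = Config.fromfile('configs/_base_/datasets/cityscapes_769x769.py')
print(cfg.crop_size)               # (769, 769), defined here
print(cfg.data.samples_per_gpu)    # 2, inherited from ./cityscapes.py
print(cfg.data.train.pipeline[3])  # RandomCrop carrying the 769x769 crop size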
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/drive.py ADDED
@@ -0,0 +1,59 @@
1
+ # dataset settings
2
+ dataset_type = 'DRIVEDataset'
3
+ data_root = 'data/DRIVE'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ img_scale = (584, 565)
7
+ crop_size = (64, 64)
8
+ train_pipeline = [
9
+ dict(type='LoadImageFromFile'),
10
+ dict(type='LoadAnnotations'),
11
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
12
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
13
+ dict(type='RandomFlip', prob=0.5),
14
+ dict(type='PhotoMetricDistortion'),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=img_scale,
25
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
26
+ flip=False,
27
+ transforms=[
28
+ dict(type='Resize', keep_ratio=True),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img'])
33
+ ])
34
+ ]
35
+
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type='RepeatDataset',
41
+ times=40000,
42
+ dataset=dict(
43
+ type=dataset_type,
44
+ data_root=data_root,
45
+ img_dir='images/training',
46
+ ann_dir='annotations/training',
47
+ pipeline=train_pipeline)),
48
+ val=dict(
49
+ type=dataset_type,
50
+ data_root=data_root,
51
+ img_dir='images/validation',
52
+ ann_dir='annotations/validation',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='images/validation',
58
+ ann_dir='annotations/validation',
59
+ pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/hrf.py ADDED
@@ -0,0 +1,59 @@
1
+ # dataset settings
2
+ dataset_type = 'HRFDataset'
3
+ data_root = 'data/HRF'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ img_scale = (2336, 3504)
7
+ crop_size = (256, 256)
8
+ train_pipeline = [
9
+ dict(type='LoadImageFromFile'),
10
+ dict(type='LoadAnnotations'),
11
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
12
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
13
+ dict(type='RandomFlip', prob=0.5),
14
+ dict(type='PhotoMetricDistortion'),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=img_scale,
25
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
26
+ flip=False,
27
+ transforms=[
28
+ dict(type='Resize', keep_ratio=True),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img'])
33
+ ])
34
+ ]
35
+
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type='RepeatDataset',
41
+ times=40000,
42
+ dataset=dict(
43
+ type=dataset_type,
44
+ data_root=data_root,
45
+ img_dir='images/training',
46
+ ann_dir='annotations/training',
47
+ pipeline=train_pipeline)),
48
+ val=dict(
49
+ type=dataset_type,
50
+ data_root=data_root,
51
+ img_dir='images/validation',
52
+ ann_dir='annotations/validation',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='images/validation',
58
+ ann_dir='annotations/validation',
59
+ pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context.py ADDED
@@ -0,0 +1,60 @@
1
+ # dataset settings
2
+ dataset_type = 'PascalContextDataset'
3
+ data_root = 'data/VOCdevkit/VOC2010/'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+
7
+ img_scale = (520, 520)
8
+ crop_size = (480, 480)
9
+
10
+ train_pipeline = [
11
+ dict(type='LoadImageFromFile'),
12
+ dict(type='LoadAnnotations'),
13
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
14
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
15
+ dict(type='RandomFlip', prob=0.5),
16
+ dict(type='PhotoMetricDistortion'),
17
+ dict(type='Normalize', **img_norm_cfg),
18
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
19
+ dict(type='DefaultFormatBundle'),
20
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
21
+ ]
22
+ test_pipeline = [
23
+ dict(type='LoadImageFromFile'),
24
+ dict(
25
+ type='MultiScaleFlipAug',
26
+ img_scale=img_scale,
27
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
28
+ flip=False,
29
+ transforms=[
30
+ dict(type='Resize', keep_ratio=True),
31
+ dict(type='RandomFlip'),
32
+ dict(type='Normalize', **img_norm_cfg),
33
+ dict(type='ImageToTensor', keys=['img']),
34
+ dict(type='Collect', keys=['img']),
35
+ ])
36
+ ]
37
+ data = dict(
38
+ samples_per_gpu=4,
39
+ workers_per_gpu=4,
40
+ train=dict(
41
+ type=dataset_type,
42
+ data_root=data_root,
43
+ img_dir='JPEGImages',
44
+ ann_dir='SegmentationClassContext',
45
+ split='ImageSets/SegmentationContext/train.txt',
46
+ pipeline=train_pipeline),
47
+ val=dict(
48
+ type=dataset_type,
49
+ data_root=data_root,
50
+ img_dir='JPEGImages',
51
+ ann_dir='SegmentationClassContext',
52
+ split='ImageSets/SegmentationContext/val.txt',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='JPEGImages',
58
+ ann_dir='SegmentationClassContext',
59
+ split='ImageSets/SegmentationContext/val.txt',
60
+ pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context_59.py ADDED
@@ -0,0 +1,60 @@
1
+ # dataset settings
2
+ dataset_type = 'PascalContextDataset59'
3
+ data_root = 'data/VOCdevkit/VOC2010/'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+
7
+ img_scale = (520, 520)
8
+ crop_size = (480, 480)
9
+
10
+ train_pipeline = [
11
+ dict(type='LoadImageFromFile'),
12
+ dict(type='LoadAnnotations', reduce_zero_label=True),
13
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
14
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
15
+ dict(type='RandomFlip', prob=0.5),
16
+ dict(type='PhotoMetricDistortion'),
17
+ dict(type='Normalize', **img_norm_cfg),
18
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
19
+ dict(type='DefaultFormatBundle'),
20
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
21
+ ]
22
+ test_pipeline = [
23
+ dict(type='LoadImageFromFile'),
24
+ dict(
25
+ type='MultiScaleFlipAug',
26
+ img_scale=img_scale,
27
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
28
+ flip=False,
29
+ transforms=[
30
+ dict(type='Resize', keep_ratio=True),
31
+ dict(type='RandomFlip'),
32
+ dict(type='Normalize', **img_norm_cfg),
33
+ dict(type='ImageToTensor', keys=['img']),
34
+ dict(type='Collect', keys=['img']),
35
+ ])
36
+ ]
37
+ data = dict(
38
+ samples_per_gpu=4,
39
+ workers_per_gpu=4,
40
+ train=dict(
41
+ type=dataset_type,
42
+ data_root=data_root,
43
+ img_dir='JPEGImages',
44
+ ann_dir='SegmentationClassContext',
45
+ split='ImageSets/SegmentationContext/train.txt',
46
+ pipeline=train_pipeline),
47
+ val=dict(
48
+ type=dataset_type,
49
+ data_root=data_root,
50
+ img_dir='JPEGImages',
51
+ ann_dir='SegmentationClassContext',
52
+ split='ImageSets/SegmentationContext/val.txt',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='JPEGImages',
58
+ ann_dir='SegmentationClassContext',
59
+ split='ImageSets/SegmentationContext/val.txt',
60
+ pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12.py ADDED
@@ -0,0 +1,57 @@
1
+ # dataset settings
2
+ dataset_type = 'PascalVOCDataset'
3
+ data_root = 'data/VOCdevkit/VOC2012'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations'),
10
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2048, 512),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='JPEGImages',
41
+ ann_dir='SegmentationClass',
42
+ split='ImageSets/Segmentation/train.txt',
43
+ pipeline=train_pipeline),
44
+ val=dict(
45
+ type=dataset_type,
46
+ data_root=data_root,
47
+ img_dir='JPEGImages',
48
+ ann_dir='SegmentationClass',
49
+ split='ImageSets/Segmentation/val.txt',
50
+ pipeline=test_pipeline),
51
+ test=dict(
52
+ type=dataset_type,
53
+ data_root=data_root,
54
+ img_dir='JPEGImages',
55
+ ann_dir='SegmentationClass',
56
+ split='ImageSets/Segmentation/val.txt',
57
+ pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py ADDED
@@ -0,0 +1,9 @@
1
+ _base_ = './pascal_voc12.py'
2
+ # dataset settings
3
+ data = dict(
4
+ train=dict(
5
+ ann_dir=['SegmentationClass', 'SegmentationClassAug'],
6
+ split=[
7
+ 'ImageSets/Segmentation/train.txt',
8
+ 'ImageSets/Segmentation/aug.txt'
9
+ ]))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/stare.py ADDED
@@ -0,0 +1,59 @@
1
+ # dataset settings
2
+ dataset_type = 'STAREDataset'
3
+ data_root = 'data/STARE'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ img_scale = (605, 700)
7
+ crop_size = (128, 128)
8
+ train_pipeline = [
9
+ dict(type='LoadImageFromFile'),
10
+ dict(type='LoadAnnotations'),
11
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
12
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
13
+ dict(type='RandomFlip', prob=0.5),
14
+ dict(type='PhotoMetricDistortion'),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=img_scale,
25
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
26
+ flip=False,
27
+ transforms=[
28
+ dict(type='Resize', keep_ratio=True),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img'])
33
+ ])
34
+ ]
35
+
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type='RepeatDataset',
41
+ times=40000,
42
+ dataset=dict(
43
+ type=dataset_type,
44
+ data_root=data_root,
45
+ img_dir='images/training',
46
+ ann_dir='annotations/training',
47
+ pipeline=train_pipeline)),
48
+ val=dict(
49
+ type=dataset_type,
50
+ data_root=data_root,
51
+ img_dir='images/validation',
52
+ ann_dir='annotations/validation',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='images/validation',
58
+ ann_dir='annotations/validation',
59
+ pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,14 @@
1
+ # yapf:disable
2
+ log_config = dict(
3
+ interval=50,
4
+ hooks=[
5
+ dict(type='TextLoggerHook', by_epoch=False),
6
+ # dict(type='TensorboardLoggerHook')
7
+ ])
8
+ # yapf:enable
9
+ dist_params = dict(backend='nccl')
10
+ log_level = 'INFO'
11
+ load_from = None
12
+ resume_from = None
13
+ workflow = [('train', 1)]
14
+ cudnn_benchmark = True
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py ADDED
@@ -0,0 +1,46 @@
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='ANNHead',
19
+ in_channels=[1024, 2048],
20
+ in_index=[2, 3],
21
+ channels=512,
22
+ project_channels=256,
23
+ query_scales=(1, ),
24
+ key_pool_scales=(1, 3, 6, 8),
25
+ dropout_ratio=0.1,
26
+ num_classes=19,
27
+ norm_cfg=norm_cfg,
28
+ align_corners=False,
29
+ loss_decode=dict(
30
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
31
+ auxiliary_head=dict(
32
+ type='FCNHead',
33
+ in_channels=1024,
34
+ in_index=2,
35
+ channels=256,
36
+ num_convs=1,
37
+ concat_input=False,
38
+ dropout_ratio=0.1,
39
+ num_classes=19,
40
+ norm_cfg=norm_cfg,
41
+ align_corners=False,
42
+ loss_decode=dict(
43
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
44
+ # model training and testing settings
45
+ train_cfg=dict(),
46
+ test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py ADDED
@@ -0,0 +1,44 @@
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='APCHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ pool_scales=(1, 2, 3, 6),
23
+ dropout_ratio=0.1,
24
+ num_classes=19,
25
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
26
+ align_corners=False,
27
+ loss_decode=dict(
28
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
29
+ auxiliary_head=dict(
30
+ type='FCNHead',
31
+ in_channels=1024,
32
+ in_index=2,
33
+ channels=256,
34
+ num_convs=1,
35
+ concat_input=False,
36
+ dropout_ratio=0.1,
37
+ num_classes=19,
38
+ norm_cfg=norm_cfg,
39
+ align_corners=False,
40
+ loss_decode=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
42
+ # model training and testing settings
43
+ train_cfg=dict(),
44
+ test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py ADDED
@@ -0,0 +1,44 @@
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='CCHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ recurrence=2,
23
+ dropout_ratio=0.1,
24
+ num_classes=19,
25
+ norm_cfg=norm_cfg,
26
+ align_corners=False,
27
+ loss_decode=dict(
28
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
29
+ auxiliary_head=dict(
30
+ type='FCNHead',
31
+ in_channels=1024,
32
+ in_index=2,
33
+ channels=256,
34
+ num_convs=1,
35
+ concat_input=False,
36
+ dropout_ratio=0.1,
37
+ num_classes=19,
38
+ norm_cfg=norm_cfg,
39
+ align_corners=False,
40
+ loss_decode=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
42
+ # model training and testing settings
43
+ train_cfg=dict(),
44
+ test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/cgnet.py ADDED
@@ -0,0 +1,35 @@
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ backbone=dict(
6
+ type='CGNet',
7
+ norm_cfg=norm_cfg,
8
+ in_channels=3,
9
+ num_channels=(32, 64, 128),
10
+ num_blocks=(3, 21),
11
+ dilations=(2, 4),
12
+ reductions=(8, 16)),
13
+ decode_head=dict(
14
+ type='FCNHead',
15
+ in_channels=256,
16
+ in_index=2,
17
+ channels=256,
18
+ num_convs=0,
19
+ concat_input=False,
20
+ dropout_ratio=0,
21
+ num_classes=19,
22
+ norm_cfg=norm_cfg,
23
+ loss_decode=dict(
24
+ type='CrossEntropyLoss',
25
+ use_sigmoid=False,
26
+ loss_weight=1.0,
27
+ class_weight=[
28
+ 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
29
+ 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
30
+ 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
31
+ 10.396974, 10.055647
32
+ ])),
33
+ # model training and testing settings
34
+ train_cfg=dict(sampler=None),
35
+ test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py ADDED
@@ -0,0 +1,44 @@
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='DAHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ pam_channels=64,
23
+ dropout_ratio=0.1,
24
+ num_classes=19,
25
+ norm_cfg=norm_cfg,
26
+ align_corners=False,
27
+ loss_decode=dict(
28
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
29
+ auxiliary_head=dict(
30
+ type='FCNHead',
31
+ in_channels=1024,
32
+ in_index=2,
33
+ channels=256,
34
+ num_convs=1,
35
+ concat_input=False,
36
+ dropout_ratio=0.1,
37
+ num_classes=19,
38
+ norm_cfg=norm_cfg,
39
+ align_corners=False,
40
+ loss_decode=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
42
+ # model training and testing settings
43
+ train_cfg=dict(),
44
+ test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py ADDED
@@ -0,0 +1,44 @@
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='ASPPHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ dilations=(1, 12, 24, 36),
23
+ dropout_ratio=0.1,
24
+ num_classes=19,
25
+ norm_cfg=norm_cfg,
26
+ align_corners=False,
27
+ loss_decode=dict(
28
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
29
+ auxiliary_head=dict(
30
+ type='FCNHead',
31
+ in_channels=1024,
32
+ in_index=2,
33
+ channels=256,
34
+ num_convs=1,
35
+ concat_input=False,
36
+ dropout_ratio=0.1,
37
+ num_classes=19,
38
+ norm_cfg=norm_cfg,
39
+ align_corners=False,
40
+ loss_decode=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
42
+ # model training and testing settings
43
+ train_cfg=dict(),
44
+ test_cfg=dict(mode='whole'))
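Note: the _base_/models fragments above define only the network; a runnable experiment config composes one of them with a dataset fragment and the default runtime via _base_, then overrides what differs. A hedged sketch (the file names are assumptions, and the schedule fragment is omitted because it is not part of this section):

# Illustrative top-level config composing the fragments shown above.
_base_ = [
    '../_base_/models/deeplabv3_r50-d8.py',
    '../_base_/datasets/cityscapes.py',
    '../_base_/default_runtime.py',
]
# Per-experiment overrides are merged on top of the inherited dicts:
model = dict(decode_head=dict(num_classes=19), auxiliary_head=dict(num_classes=19))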