676e2c59aa5f2e27514c3795349ecbd4b2cb266b3b631e4a273b82cd4c9536a9
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/position_encoding.py +67 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/text_transformer.py +257 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/transformer.py +376 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/oneformer_model.py +470 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/__init__.py +2 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/box_ops.py +133 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/events.py +120 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/misc.py +197 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/pos_embed.py +122 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/__init__.py +1 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/coco.py +444 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/cocoeval.py +534 -0
- extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/mask.py +107 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/LICENSE +108 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/__init__.py +262 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/__init__.cpython-310.pyc +0 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/body.cpython-310.pyc +0 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/face.cpython-310.pyc +0 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/hand.cpython-310.pyc +0 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/model.cpython-310.pyc +0 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/util.cpython-310.pyc +0 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/body.py +278 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/face.py +362 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/hand.py +94 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/model.py +218 -0
- extensions/microsoftexcel-controlnet/annotator/openpose/util.py +383 -0
- extensions/microsoftexcel-controlnet/annotator/pidinet/LICENSE +21 -0
- extensions/microsoftexcel-controlnet/annotator/pidinet/__init__.py +51 -0
- extensions/microsoftexcel-controlnet/annotator/pidinet/model.py +653 -0
- extensions/microsoftexcel-controlnet/annotator/shuffle/__init__.py +74 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/LICENSE +203 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/__init__.py +56 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/ade20k.py +54 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/chase_db1.py +59 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes.py +54 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes_769x769.py +35 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/drive.py +59 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/hrf.py +59 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context.py +60 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context_59.py +60 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12.py +57 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py +9 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/stare.py +59 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/default_runtime.py +14 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py +46 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py +44 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py +44 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/cgnet.py +35 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py +44 -0
- extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py +44 -0
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/position_encoding.py
ADDED
@@ -0,0 +1,67 @@
# ------------------------------------------------------------------------------
# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/position_encoding.py
# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
# ------------------------------------------------------------------------------

"""
Various positional encodings for the transformer.
"""
import math

import torch
from torch import nn


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """

    def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, x, mask=None):
        if mask is None:
            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
        not_mask = ~mask
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos

    def __repr__(self, _repr_indent=4):
        head = "Positional encoding " + self.__class__.__name__
        body = [
            "num_pos_feats: {}".format(self.num_pos_feats),
            "temperature: {}".format(self.temperature),
            "normalize: {}".format(self.normalize),
            "scale: {}".format(self.scale),
        ]
        # _repr_indent = 4
        lines = [head] + [" " * _repr_indent + line for line in body]
        return "\n".join(lines)
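As a quick sanity check on the module above, here is a minimal usage sketch of PositionEmbeddingSine on a dummy feature map. It is not part of the commit; the batch size, channel count, and spatial size are illustrative assumptions.

# Usage sketch (illustrative, not from the commit): the encoding only depends on the
# input's batch size and spatial size, and returns 2 * num_pos_feats channels.
import torch

pos_enc = PositionEmbeddingSine(num_pos_feats=128, normalize=True)
feats = torch.randn(2, 256, 32, 32)   # (batch, channels, H, W) dummy backbone features
pos = pos_enc(feats)                  # (batch, 2 * num_pos_feats, H, W)
assert pos.shape == (2, 256, 32, 32)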
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/text_transformer.py
ADDED
@@ -0,0 +1,257 @@
# -------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2021 OpenAI
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# -------------------------------------------------------------------------

import torch
import torch.utils.checkpoint as checkpoint
from torch import nn
from collections import OrderedDict
from timm.models.layers import trunc_normal_


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, q, k, v):
        B, N, C = q.shape
        assert k.shape == v.shape
        B, M, C = k.shape
        q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads)
        k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads)
        v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads)

        attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale

        attn = attn.softmax(dim=-1)

        x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class TransformerDecoderLayer(nn.Module):
    def __init__(
        self,
        d_model,
        nhead,
        dropout=0.1,
    ):
        super().__init__()
        self.self_attn = Attention(d_model, nhead, proj_drop=dropout)
        self.cross_attn = Attention(d_model, nhead, proj_drop=dropout)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        self.mlp = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model * 4, d_model)
        )

    def forward(self, x, mem):
        q = k = v = self.norm1(x)
        x = x + self.self_attn(q, k, v)
        q = self.norm2(x)
        x = x + self.cross_attn(q, mem, mem)
        x = x + self.dropout(self.mlp(self.norm3(x)))
        return x


class ContextDecoder(nn.Module):
    def __init__(self,
                 transformer_width=256,
                 transformer_heads=4,
                 transformer_layers=6,
                 visual_dim=1024,
                 dropout=0.1,
                 **kwargs):
        super().__init__()

        self.memory_proj = nn.Sequential(
            nn.LayerNorm(visual_dim),
            nn.Linear(visual_dim, transformer_width),
            nn.LayerNorm(transformer_width),
        )

        self.text_proj = nn.Sequential(
            nn.LayerNorm(visual_dim),
            nn.Linear(visual_dim, transformer_width),
        )

        self.decoder = nn.ModuleList([
            TransformerDecoderLayer(transformer_width, transformer_heads, dropout) for _ in range(transformer_layers)
        ])

        self.out_proj = nn.Sequential(
            nn.LayerNorm(transformer_width),
            nn.Linear(transformer_width, visual_dim)
        )

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, text, visual):
        B, N, C = visual.shape
        visual = self.memory_proj(visual)
        x = self.text_proj(text)

        for layer in self.decoder:
            x = layer(x, visual)

        return self.out_proj(x)


class QuickGELU(nn.Module):

    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = nn.LayerNorm(d_model)
        self.mlp = nn.Sequential(
            OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), ('gelu', QuickGELU()),
                         ('c_proj', nn.Linear(d_model * 4, d_model))]))
        self.ln_2 = nn.LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor, key_padding_mask: torch.Tensor):
        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask, key_padding_mask=key_padding_mask)[0]

    def forward(self, x: torch.Tensor, key_padding_mask=None):
        x = x + self.attention(self.ln_1(x), key_padding_mask=key_padding_mask)
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
        proj_std = (self.width**-0.5) * ((2 * self.layers)**-0.5)
        attn_std = self.width**-0.5
        fc_std = (2 * self.width)**-0.5
        for block in self.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        self.use_checkpoint = use_checkpoint

    def forward(self, x: torch.Tensor):
        for resblock in self.resblocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(resblock, x)
            else:
                x = resblock(x)
        return x


class TextTransformer(nn.Module):

    def __init__(
        self,
        context_length: int,
        width: int,
        layers: int,
        vocab_size,
        use_checkpoint=False,
    ):

        super().__init__()
        heads = width // 64
        self.context_length = context_length
        self.width = width
        self.transformer = Transformer(
            width=width,
            layers=layers,
            heads=heads,
            attn_mask=self.build_attention_mask(),
            use_checkpoint=use_checkpoint)

        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
        self.ln_final = nn.LayerNorm(width)
        self.token_embedding = nn.Embedding(vocab_size, width)
        nn.init.normal_(self.token_embedding.weight, std=0.02)

        # initialization
        nn.init.normal_(self.positional_embedding, std=0.01)

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float('-inf'))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def forward(self, text):
        x = self.token_embedding(text)
        x = x + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)]

        return x
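For orientation, a minimal sketch of how TextTransformer might be driven with already-tokenized text. It is not part of the commit; the context length, width, and vocabulary size below follow the CLIP tokenizer convention and are assumptions, not values read from the OneFormer config.

# Usage sketch (illustrative, not from the commit). The forward pass pools the
# feature at the end-of-text token, which is assumed to have the highest token id.
import torch

text_encoder = TextTransformer(context_length=77, width=256, layers=6, vocab_size=49408)
tokens = torch.randint(1, 49406, (4, 77))  # (batch, context_length) of token ids
tokens[:, -1] = 49407                      # assumed EOT id: strictly the largest in each row
feats = text_encoder(tokens)               # (4, 256): one embedding per sequence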
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/modeling/transformer_decoder/transformer.py
ADDED
@@ -0,0 +1,376 @@
# ------------------------------------------------------------------------------
# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/transformer.py
# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
# ------------------------------------------------------------------------------

"""
Transformer class.

Copy-paste from torch.nn.Transformer with modifications:
    * positional encodings are passed in MHattention
    * extra LN at the end of encoder is removed
    * decoder returns a stack of activations from all decoding layers
"""
import copy
from typing import List, Optional

import torch
import torch.nn.functional as F
from torch import Tensor, nn


class Transformer(nn.Module):
    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
        return_intermediate_dec=False,
    ):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        decoder_layer = TransformerDecoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec,
        )

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, query_embed, pos_embed, task_token=None):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
        if mask is not None:
            mask = mask.flatten(1)

        if task_token is None:
            tgt = torch.zeros_like(query_embed)
        else:
            tgt = task_token.repeat(query_embed.shape[0], 1, 1)

        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        hs = self.decoder(
            tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed
        )
        return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w)


class TransformerEncoder(nn.Module):
    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(
        self,
        src,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        output = src

        for layer in self.layers:
            output = layer(
                output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos
            )

        if self.norm is not None:
            output = self.norm(output)

        return output


class TransformerDecoder(nn.Module):
    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        output = tgt

        intermediate = []

        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
                pos=pos,
                query_pos=query_pos,
            )
            if self.return_intermediate:
                intermediate.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                intermediate.pop()
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output.unsqueeze(0)


class TransformerEncoderLayer(nn.Module):
    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        q = k = self.with_pos_embed(src, pos)
        src2 = self.self_attn(
            q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
        )[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

    def forward_pre(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        src2 = self.norm1(src)
        q = k = self.with_pos_embed(src2, pos)
        src2 = self.self_attn(
            q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
        )[0]
        src = src + self.dropout1(src2)
        src2 = self.norm2(src)
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
        src = src + self.dropout2(src2)
        return src

    def forward(
        self,
        src,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
    ):
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)


class TransformerDecoderLayer(nn.Module):
    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        self.activation = _get_activation_fn(activation)
        self.normalize_before = normalize_before

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        return tensor if pos is None else tensor + pos

    def forward_post(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        q = k = self.with_pos_embed(tgt, query_pos)
        tgt2 = self.self_attn(
            q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2 = self.multihead_attn(
            query=self.with_pos_embed(tgt, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt

    def forward_pre(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        tgt2 = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt2, query_pos)
        tgt2 = self.self_attn(
            q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask
        )[0]
        tgt = tgt + self.dropout1(tgt2)
        tgt2 = self.norm2(tgt)
        tgt2 = self.multihead_attn(
            query=self.with_pos_embed(tgt2, query_pos),
            key=self.with_pos_embed(memory, pos),
            value=memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt2 = self.norm3(tgt)
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
        tgt = tgt + self.dropout3(tgt2)
        return tgt

    def forward(
        self,
        tgt,
        memory,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
        pos: Optional[Tensor] = None,
        query_pos: Optional[Tensor] = None,
    ):
        if self.normalize_before:
            return self.forward_pre(
                tgt,
                memory,
                tgt_mask,
                memory_mask,
                tgt_key_padding_mask,
                memory_key_padding_mask,
                pos,
                query_pos,
            )
        return self.forward_post(
            tgt,
            memory,
            tgt_mask,
            memory_mask,
            tgt_key_padding_mask,
            memory_key_padding_mask,
            pos,
            query_pos,
        )


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def _get_activation_fn(activation):
    """Return an activation function given a string"""
    if activation == "relu":
        return F.relu
    if activation == "gelu":
        return F.gelu
    if activation == "glu":
        return F.glu
    raise RuntimeError(f"activation should be relu/gelu, not {activation}.")
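To make the docstring's "flatten NxCxHxW to HWxNxC" flow concrete, here is a minimal sketch of driving this DETR-style Transformer on a dummy feature map. It is not part of the commit; every dimension below (feature size, number of queries, layer counts) is an illustrative assumption.

# Usage sketch (illustrative, not from the commit): with return_intermediate_dec=True the
# decoder returns one set of query features per decoder layer.
import torch

model = Transformer(d_model=256, nhead=8, num_encoder_layers=2, num_decoder_layers=2,
                    return_intermediate_dec=True)
src = torch.randn(2, 256, 16, 16)                 # (batch, d_model, H, W) backbone features
mask = torch.zeros(2, 16, 16, dtype=torch.bool)   # no padded pixels
query_embed = torch.randn(100, 256)               # 100 learned object queries
pos_embed = torch.randn(2, 256, 16, 16)           # positional encoding matching src
hs, memory = model(src, mask, query_embed, pos_embed)
# hs: (num_decoder_layers, batch, num_queries, d_model); memory: (batch, d_model, H, W)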
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/oneformer_model.py
ADDED
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ------------------------------------------------------------------------------
|
2 |
+
# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/maskformer_model.py
|
3 |
+
# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
|
4 |
+
# ------------------------------------------------------------------------------
|
5 |
+
|
6 |
+
from typing import Tuple
|
7 |
+
|
8 |
+
import torch
|
9 |
+
from torch import nn
|
10 |
+
from torch.nn import functional as F
|
11 |
+
|
12 |
+
from annotator.oneformer.detectron2.config import configurable
|
13 |
+
from annotator.oneformer.detectron2.data import MetadataCatalog
|
14 |
+
from annotator.oneformer.detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
|
15 |
+
from annotator.oneformer.detectron2.modeling.backbone import Backbone
|
16 |
+
from annotator.oneformer.detectron2.modeling.postprocessing import sem_seg_postprocess
|
17 |
+
from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, BitMasks
|
18 |
+
from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom
|
19 |
+
|
20 |
+
from .modeling.matcher import HungarianMatcher
|
21 |
+
from einops import rearrange
|
22 |
+
from .modeling.transformer_decoder.text_transformer import TextTransformer
|
23 |
+
from .modeling.transformer_decoder.oneformer_transformer_decoder import MLP
|
24 |
+
from annotator.oneformer.oneformer.data.tokenizer import SimpleTokenizer, Tokenize
|
25 |
+
|
26 |
+
@META_ARCH_REGISTRY.register()
|
27 |
+
class OneFormer(nn.Module):
|
28 |
+
"""
|
29 |
+
Main class for mask classification semantic segmentation architectures.
|
30 |
+
"""
|
31 |
+
|
32 |
+
@configurable
|
33 |
+
def __init__(
|
34 |
+
self,
|
35 |
+
*,
|
36 |
+
backbone: Backbone,
|
37 |
+
sem_seg_head: nn.Module,
|
38 |
+
task_mlp: nn.Module,
|
39 |
+
text_encoder: nn.Module,
|
40 |
+
text_projector: nn.Module,
|
41 |
+
prompt_ctx: nn.Embedding,
|
42 |
+
num_queries: int,
|
43 |
+
object_mask_threshold: float,
|
44 |
+
overlap_threshold: float,
|
45 |
+
metadata,
|
46 |
+
size_divisibility: int,
|
47 |
+
sem_seg_postprocess_before_inference: bool,
|
48 |
+
pixel_mean: Tuple[float],
|
49 |
+
pixel_std: Tuple[float],
|
50 |
+
# inference
|
51 |
+
semantic_on: bool,
|
52 |
+
panoptic_on: bool,
|
53 |
+
instance_on: bool,
|
54 |
+
detection_on: bool,
|
55 |
+
test_topk_per_image: int,
|
56 |
+
task_seq_len: int,
|
57 |
+
max_seq_len: int,
|
58 |
+
is_demo: bool,
|
59 |
+
):
|
60 |
+
"""
|
61 |
+
Args:
|
62 |
+
backbone: a backbone module, must follow detectron2's backbone interface
|
63 |
+
sem_seg_head: a module that predicts semantic segmentation from backbone features
|
64 |
+
criterion: a module that defines the loss
|
65 |
+
num_queries: int, number of queries
|
66 |
+
object_mask_threshold: float, threshold to filter query based on classification score
|
67 |
+
for panoptic segmentation inference
|
68 |
+
overlap_threshold: overlap threshold used in general inference for panoptic segmentation
|
69 |
+
metadata: dataset meta, get `thing` and `stuff` category names for panoptic
|
70 |
+
segmentation inference
|
71 |
+
size_divisibility: Some backbones require the input height and width to be divisible by a
|
72 |
+
specific integer. We can use this to override such requirement.
|
73 |
+
sem_seg_postprocess_before_inference: whether to resize the prediction back
|
74 |
+
to original input size before semantic segmentation inference or after.
|
75 |
+
For high-resolution dataset like Mapillary, resizing predictions before
|
76 |
+
inference will cause OOM error.
|
77 |
+
pixel_mean, pixel_std: list or tuple with #channels element, representing
|
78 |
+
the per-channel mean and std to be used to normalize the input image
|
79 |
+
semantic_on: bool, whether to output semantic segmentation prediction
|
80 |
+
instance_on: bool, whether to output instance segmentation prediction
|
81 |
+
panoptic_on: bool, whether to output panoptic segmentation prediction
|
82 |
+
test_topk_per_image: int, instance segmentation parameter, keep topk instances per image
|
83 |
+
"""
|
84 |
+
super().__init__()
|
85 |
+
self.backbone = backbone
|
86 |
+
self.sem_seg_head = sem_seg_head
|
87 |
+
self.task_mlp = task_mlp
|
88 |
+
self.text_encoder = text_encoder
|
89 |
+
self.text_projector = text_projector
|
90 |
+
self.prompt_ctx = prompt_ctx
|
91 |
+
self.num_queries = num_queries
|
92 |
+
self.overlap_threshold = overlap_threshold
|
93 |
+
self.object_mask_threshold = object_mask_threshold
|
94 |
+
self.metadata = metadata
|
95 |
+
if size_divisibility < 0:
|
96 |
+
# use backbone size_divisibility if not set
|
97 |
+
size_divisibility = self.backbone.size_divisibility
|
98 |
+
self.size_divisibility = size_divisibility
|
99 |
+
self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
|
100 |
+
self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
|
101 |
+
self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)
|
102 |
+
|
103 |
+
# additional args
|
104 |
+
self.semantic_on = semantic_on
|
105 |
+
self.instance_on = instance_on
|
106 |
+
self.panoptic_on = panoptic_on
|
107 |
+
self.detection_on = detection_on
|
108 |
+
self.test_topk_per_image = test_topk_per_image
|
109 |
+
|
110 |
+
self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
|
111 |
+
self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
|
112 |
+
self.is_demo = is_demo
|
113 |
+
|
114 |
+
self.thing_indices = [k for k in self.metadata.thing_dataset_id_to_contiguous_id.keys()]
|
115 |
+
|
116 |
+
if not self.semantic_on:
|
117 |
+
assert self.sem_seg_postprocess_before_inference
|
118 |
+
|
119 |
+
@classmethod
|
120 |
+
def from_config(cls, cfg):
|
121 |
+
backbone = build_backbone(cfg)
|
122 |
+
sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
|
123 |
+
|
124 |
+
if cfg.MODEL.IS_TRAIN:
|
125 |
+
text_encoder = TextTransformer(context_length=cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH,
|
126 |
+
width=cfg.MODEL.TEXT_ENCODER.WIDTH,
|
127 |
+
layers=cfg.MODEL.TEXT_ENCODER.NUM_LAYERS,
|
128 |
+
vocab_size=cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE)
|
129 |
+
text_projector = MLP(text_encoder.width, cfg.MODEL.ONE_FORMER.HIDDEN_DIM,
|
130 |
+
cfg.MODEL.ONE_FORMER.HIDDEN_DIM, cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS)
|
131 |
+
if cfg.MODEL.TEXT_ENCODER.N_CTX > 0:
|
132 |
+
prompt_ctx = nn.Embedding(cfg.MODEL.TEXT_ENCODER.N_CTX, cfg.MODEL.TEXT_ENCODER.WIDTH)
|
133 |
+
else:
|
134 |
+
prompt_ctx = None
|
135 |
+
else:
|
136 |
+
text_encoder = None
|
137 |
+
text_projector = None
|
138 |
+
prompt_ctx = None
|
139 |
+
|
140 |
+
task_mlp = MLP(cfg.INPUT.TASK_SEQ_LEN, cfg.MODEL.ONE_FORMER.HIDDEN_DIM,
|
141 |
+
cfg.MODEL.ONE_FORMER.HIDDEN_DIM, 2)
|
142 |
+
|
143 |
+
# Loss parameters:
|
144 |
+
deep_supervision = cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION
|
145 |
+
no_object_weight = cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT
|
146 |
+
|
147 |
+
# loss weights
|
148 |
+
class_weight = cfg.MODEL.ONE_FORMER.CLASS_WEIGHT
|
149 |
+
dice_weight = cfg.MODEL.ONE_FORMER.DICE_WEIGHT
|
150 |
+
mask_weight = cfg.MODEL.ONE_FORMER.MASK_WEIGHT
|
151 |
+
contrastive_weight = cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT
|
152 |
+
|
153 |
+
# building criterion
|
154 |
+
matcher = HungarianMatcher(
|
155 |
+
cost_class=class_weight,
|
156 |
+
cost_mask=mask_weight,
|
157 |
+
cost_dice=dice_weight,
|
158 |
+
num_points=cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS,
|
159 |
+
)
|
160 |
+
|
161 |
+
weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight,
|
162 |
+
"loss_dice": dice_weight, "loss_contrastive": contrastive_weight}
|
163 |
+
|
164 |
+
|
165 |
+
if deep_supervision:
|
166 |
+
dec_layers = cfg.MODEL.ONE_FORMER.DEC_LAYERS
|
167 |
+
aux_weight_dict = {}
|
168 |
+
for i in range(dec_layers - 1):
|
169 |
+
aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
|
170 |
+
weight_dict.update(aux_weight_dict)
|
171 |
+
|
172 |
+
losses = ["labels", "masks", "contrastive"]
|
173 |
+
|
174 |
+
return {
|
175 |
+
"backbone": backbone,
|
176 |
+
"sem_seg_head": sem_seg_head,
|
177 |
+
"task_mlp": task_mlp,
|
178 |
+
"prompt_ctx": prompt_ctx,
|
179 |
+
"text_encoder": text_encoder,
|
180 |
+
"text_projector": text_projector,
|
181 |
+
"num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES,
|
182 |
+
"object_mask_threshold": cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD,
|
183 |
+
"overlap_threshold": cfg.MODEL.TEST.OVERLAP_THRESHOLD,
|
184 |
+
"metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
|
185 |
+
"size_divisibility": cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY,
|
186 |
+
"sem_seg_postprocess_before_inference": (
|
187 |
+
cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
|
188 |
+
or cfg.MODEL.TEST.PANOPTIC_ON
|
189 |
+
or cfg.MODEL.TEST.INSTANCE_ON
|
190 |
+
),
|
191 |
+
"pixel_mean": cfg.MODEL.PIXEL_MEAN,
|
192 |
+
"pixel_std": cfg.MODEL.PIXEL_STD,
|
193 |
+
# inference
|
194 |
+
"semantic_on": cfg.MODEL.TEST.SEMANTIC_ON,
|
195 |
+
"instance_on": cfg.MODEL.TEST.INSTANCE_ON,
|
196 |
+
"panoptic_on": cfg.MODEL.TEST.PANOPTIC_ON,
|
197 |
+
"detection_on": cfg.MODEL.TEST.DETECTION_ON,
|
198 |
+
"test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
|
199 |
+
"task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
|
200 |
+
"max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
|
201 |
+
"is_demo": cfg.MODEL.IS_DEMO,
|
202 |
+
}
|
203 |
+
|
204 |
+
@property
|
205 |
+
def device(self):
|
206 |
+
return self.pixel_mean.device
|
207 |
+
|
208 |
+
def encode_text(self, text):
|
209 |
+
assert text.ndim in [2, 3], text.ndim
|
210 |
+
b = text.shape[0]
|
211 |
+
squeeze_dim = False
|
212 |
+
num_text = 1
|
213 |
+
if text.ndim == 3:
|
214 |
+
num_text = text.shape[1]
|
215 |
+
text = rearrange(text, 'b n l -> (b n) l', n=num_text)
|
216 |
+
squeeze_dim = True
|
217 |
+
|
218 |
+
# [B, C]
|
219 |
+
x = self.text_encoder(text)
|
220 |
+
|
221 |
+
text_x = self.text_projector(x)
|
222 |
+
|
223 |
+
if squeeze_dim:
|
224 |
+
text_x = rearrange(text_x, '(b n) c -> b n c', n=num_text)
|
225 |
+
if self.prompt_ctx is not None:
|
226 |
+
text_ctx = self.prompt_ctx.weight.unsqueeze(0).repeat(text_x.shape[0], 1, 1)
|
227 |
+
text_x = torch.cat([text_x, text_ctx], dim=1)
|
228 |
+
|
229 |
+
return {"texts": text_x}
|
230 |
+
|
231 |
+
def forward(self, batched_inputs):
|
232 |
+
"""
|
233 |
+
Args:
|
234 |
+
batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
|
235 |
+
Each item in the list contains the inputs for one image.
|
236 |
+
For now, each item in the list is a dict that contains:
|
237 |
+
* "image": Tensor, image in (C, H, W) format.
|
238 |
+
* "instances": per-region ground truth
|
239 |
+
* Other information that's included in the original dicts, such as:
|
240 |
+
"height", "width" (int): the output resolution of the model (may be different
|
241 |
+
from input resolution), used in inference.
|
242 |
+
Returns:
|
243 |
+
list[dict]:
|
244 |
+
each dict has the results for one image. The dict contains the following keys:
|
245 |
+
* "sem_seg":
|
246 |
+
A Tensor that represents the
|
247 |
+
per-pixel segmentation prediced by the head.
|
248 |
+
The prediction has shape KxHxW that represents the logits of
|
249 |
+
each class for each pixel.
|
250 |
+
* "panoptic_seg":
|
251 |
+
A tuple that represent panoptic output
|
252 |
+
panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
|
253 |
+
segments_info (list[dict]): Describe each segment in `panoptic_seg`.
|
254 |
+
Each dict contains keys "id", "category_id", "isthing".
|
255 |
+
"""
|
256 |
+
images = [x["image"].to(self.device) for x in batched_inputs]
|
257 |
+
images = [(x - self.pixel_mean) / self.pixel_std for x in images]
|
258 |
+
images = ImageList.from_tensors(images, self.size_divisibility)
|
259 |
+
|
260 |
+
tasks = torch.cat([self.task_tokenizer(x["task"]).to(self.device).unsqueeze(0) for x in batched_inputs], dim=0)
|
261 |
+
tasks = self.task_mlp(tasks.float())
|
262 |
+
|
263 |
+
features = self.backbone(images.tensor)
|
264 |
+
outputs = self.sem_seg_head(features, tasks)
|
265 |
+
|
266 |
+
if self.training:
|
267 |
+
texts = torch.cat([self.text_tokenizer(x["text"]).to(self.device).unsqueeze(0) for x in batched_inputs], dim=0)
|
268 |
+
texts_x = self.encode_text(texts)
|
269 |
+
|
270 |
+
outputs = {**outputs, **texts_x}
|
271 |
+
|
272 |
+
# mask classification target
|
273 |
+
if "instances" in batched_inputs[0]:
|
274 |
+
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
|
275 |
+
targets = self.prepare_targets(gt_instances, images)
|
276 |
+
else:
|
277 |
+
targets = None
|
278 |
+
|
279 |
+
# bipartite matching-based loss
|
280 |
+
losses = self.criterion(outputs, targets)
|
281 |
+
|
282 |
+
for k in list(losses.keys()):
|
283 |
+
if k in self.criterion.weight_dict:
|
284 |
+
losses[k] *= self.criterion.weight_dict[k]
|
285 |
+
else:
|
286 |
+
# remove this loss if not specified in `weight_dict`
|
287 |
+
losses.pop(k)
|
288 |
+
return losses
|
289 |
+
else:
|
290 |
+
mask_cls_results = outputs["pred_logits"]
|
291 |
+
mask_pred_results = outputs["pred_masks"]
|
292 |
+
# upsample masks
|
293 |
+
mask_pred_results = F.interpolate(
|
294 |
+
mask_pred_results,
|
295 |
+
size=(images.tensor.shape[-2], images.tensor.shape[-1]),
|
296 |
+
mode="bilinear",
|
297 |
+
align_corners=False,
|
298 |
+
)
|
299 |
+
|
300 |
+
del outputs
|
301 |
+
|
302 |
+
processed_results = []
|
303 |
+
for i, data in enumerate(zip(
|
304 |
+
mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
|
305 |
+
)):
|
306 |
+
mask_cls_result, mask_pred_result, input_per_image, image_size = data
|
307 |
+
height = input_per_image.get("height", image_size[0])
|
308 |
+
width = input_per_image.get("width", image_size[1])
|
309 |
+
processed_results.append({})
|
310 |
+
|
311 |
+
if self.sem_seg_postprocess_before_inference:
|
312 |
+
mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)(
|
313 |
+
mask_pred_result, image_size, height, width
|
314 |
+
)
|
315 |
+
mask_cls_result = mask_cls_result.to(mask_pred_result)
|
316 |
+
|
317 |
+
# semantic segmentation inference
|
318 |
+
if self.semantic_on:
|
319 |
+
r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result)
|
320 |
+
if not self.sem_seg_postprocess_before_inference:
|
321 |
+
r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width)
|
322 |
+
processed_results[-1]["sem_seg"] = r
|
323 |
+
|
324 |
+
# panoptic segmentation inference
|
325 |
+
if self.panoptic_on:
|
326 |
+
panoptic_r = retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result)
|
327 |
+
processed_results[-1]["panoptic_seg"] = panoptic_r
|
328 |
+
|
329 |
+
# instance segmentation inference
|
330 |
+
if self.instance_on:
|
331 |
+
instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result)
|
332 |
+
processed_results[-1]["instances"] = instance_r
|
333 |
+
|
334 |
+
if self.detection_on:
|
335 |
+
bbox_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result)
|
336 |
+
processed_results[-1]["box_instances"] = bbox_r
|
337 |
+
|
338 |
+
return processed_results
|
339 |
+
|
340 |
+
def prepare_targets(self, targets, images):
|
341 |
+
h_pad, w_pad = images.tensor.shape[-2:]
|
342 |
+
new_targets = []
|
343 |
+
for targets_per_image in targets:
|
344 |
+
# pad gt
|
345 |
+
gt_masks = targets_per_image.gt_masks
|
346 |
+
padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device)
|
347 |
+
padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
|
348 |
+
new_targets.append(
|
349 |
+
{
|
350 |
+
"labels": targets_per_image.gt_classes,
|
351 |
+
"masks": padded_masks,
|
352 |
+
}
|
353 |
+
)
|
354 |
+
return new_targets
|
355 |
+
|
356 |
+
def semantic_inference(self, mask_cls, mask_pred):
|
357 |
+
mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
|
358 |
+
mask_pred = mask_pred.sigmoid()
|
359 |
+
semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
|
360 |
+
return semseg
|
361 |
+
|
362 |
+
def panoptic_inference(self, mask_cls, mask_pred):
|
363 |
+
scores, labels = F.softmax(mask_cls, dim=-1).max(-1)
|
364 |
+
mask_pred = mask_pred.sigmoid()
|
365 |
+
|
366 |
+
keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold)
|
367 |
+
cur_scores = scores[keep]
|
368 |
+
cur_classes = labels[keep]
|
369 |
+
cur_masks = mask_pred[keep]
|
370 |
+
cur_mask_cls = mask_cls[keep]
|
371 |
+
cur_mask_cls = cur_mask_cls[:, :-1]
|
372 |
+
|
373 |
+
cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks
|
374 |
+
|
375 |
+
h, w = cur_masks.shape[-2:]
|
376 |
+
panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device)
|
377 |
+
segments_info = []
|
378 |
+
|
379 |
+
current_segment_id = 0
|
380 |
+
|
381 |
+
if cur_masks.shape[0] == 0:
|
382 |
+
# We didn't detect any mask :(
|
383 |
+
return panoptic_seg, segments_info
|
384 |
+
else:
|
385 |
+
# take argmax
|
386 |
+
cur_mask_ids = cur_prob_masks.argmax(0)
|
387 |
+
stuff_memory_list = {}
|
388 |
+
for k in range(cur_classes.shape[0]):
|
389 |
+
pred_class = cur_classes[k].item()
|
390 |
+
                isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values()
                mask_area = (cur_mask_ids == k).sum().item()
                original_area = (cur_masks[k] >= 0.5).sum().item()
                mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5)

                if mask_area > 0 and original_area > 0 and mask.sum().item() > 0:
                    if mask_area / original_area < self.overlap_threshold:
                        continue

                    # merge stuff regions
                    if not isthing:
                        if int(pred_class) in stuff_memory_list.keys():
                            panoptic_seg[mask] = stuff_memory_list[int(pred_class)]
                            continue
                        else:
                            stuff_memory_list[int(pred_class)] = current_segment_id + 1

                    current_segment_id += 1
                    panoptic_seg[mask] = current_segment_id

                    segments_info.append(
                        {
                            "id": current_segment_id,
                            "isthing": bool(isthing),
                            "category_id": int(pred_class),
                        }
                    )

            return panoptic_seg, segments_info

    def instance_inference(self, mask_cls, mask_pred):
        # mask_pred is already processed to have the same shape as original input
        image_size = mask_pred.shape[-2:]

        # [Q, K]
        scores = F.softmax(mask_cls, dim=-1)[:, :-1]
        labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1)

        # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False)
        scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False)
        labels_per_image = labels[topk_indices]

        topk_indices = topk_indices // self.sem_seg_head.num_classes
        # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1)
        mask_pred = mask_pred[topk_indices]

        # Only consider scores with confidence over [self.object_mask_threshold] for demo
        if self.is_demo:
            keep = scores_per_image > self.object_mask_threshold
            scores_per_image = scores_per_image[keep]
            labels_per_image = labels_per_image[keep]
            mask_pred = mask_pred[keep]

        # if this is panoptic segmentation, we only keep the "thing" classes
        if self.panoptic_on:
            keep = torch.zeros_like(scores_per_image).bool()
            for i, lab in enumerate(labels_per_image):
                keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values()

            scores_per_image = scores_per_image[keep]
            labels_per_image = labels_per_image[keep]
            mask_pred = mask_pred[keep]

            if 'ade20k' in self.metadata.name:
                for i in range(labels_per_image.shape[0]):
                    labels_per_image[i] = self.thing_indices.index(labels_per_image[i].item())

        result = Instances(image_size)
        # mask (before sigmoid)
        result.pred_masks = (mask_pred > 0).float()
        if self.detection_on:
            # Uncomment the following to get boxes from masks (this is slow)
            result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes()
        else:
            result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4))

        # calculate average mask prob
        mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6)
        result.scores = scores_per_image * mask_scores_per_image
        result.pred_classes = labels_per_image
        return result
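A minimal, standalone sketch of the top-k selection step used in instance_inference above: the per-query class scores form a [Q, K] matrix, the matrix is flattened, the top-k entries are taken jointly over queries and classes, and the originating query index is recovered with integer division. The sizes below are made up for illustration and do not come from the files in this commit.

import torch

num_queries, num_classes, topk = 150, 80, 10
scores = torch.rand(num_queries, num_classes)          # stand-in for softmaxed class scores [Q, K]
labels = torch.arange(num_classes).unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)

scores_per_image, topk_indices = scores.flatten(0, 1).topk(topk, sorted=False)
labels_per_image = labels[topk_indices]                # class id of each kept prediction
query_indices = topk_indices // num_classes            # which query each prediction came from

assert query_indices.max() < num_queries
print(scores_per_image.shape, labels_per_image.shape, query_indices.shape)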
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from .events import setup_wandb, WandbWriter
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/box_ops.py
ADDED
@@ -0,0 +1,133 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Utilities for bounding box manipulation and GIoU.
"""
import torch, os
from torchvision.ops.boxes import box_area


def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)


def box_xyxy_to_cxcywh(x):
    x0, y0, x1, y1 = x.unbind(-1)
    b = [(x0 + x1) / 2, (y0 + y1) / 2,
         (x1 - x0), (y1 - y0)]
    return torch.stack(b, dim=-1)


# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # import ipdb; ipdb.set_trace()
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / (union + 1e-6)
    return iou, union


def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/
    The boxes should be in [x0, y0, x1, y1] format
    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2)
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    # except:
    #     import ipdb; ipdb.set_trace()
    iou, union = box_iou(boxes1, boxes2)

    lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    area = wh[:, :, 0] * wh[:, :, 1]

    return iou - (area - union) / (area + 1e-6)



# modified from torchvision to also return the union
def box_iou_pairwise(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N,2]
    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N,2]

    wh = (rb - lt).clamp(min=0)  # [N,2]
    inter = wh[:, 0] * wh[:, 1]  # [N]

    union = area1 + area2 - inter

    iou = inter / union
    return iou, union


def generalized_box_iou_pairwise(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/
    Input:
        - boxes1, boxes2: N,4
    Output:
        - giou: N, 4
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    assert boxes1.shape == boxes2.shape
    iou, union = box_iou_pairwise(boxes1, boxes2)  # N, 4

    lt = torch.min(boxes1[:, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])

    wh = (rb - lt).clamp(min=0)  # [N,2]
    area = wh[:, 0] * wh[:, 1]

    return iou - (area - union) / area

def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks
    The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
    Returns a [N, 4] tensors, with the boxes in xyxy format
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    y = torch.arange(0, h, dtype=torch.float)
    x = torch.arange(0, w, dtype=torch.float)
    y, x = torch.meshgrid(y, x)

    x_mask = (masks * x.unsqueeze(0))
    x_max = x_mask.flatten(1).max(-1)[0]
    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    y_mask = (masks * y.unsqueeze(0))
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)

if __name__ == '__main__':
    x = torch.rand(5, 4)
    y = torch.rand(3, 4)
    iou, union = box_iou(x, y)
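A hedged usage sketch for the box helpers above. The import path is an assumption based on the extension's package layout in this commit; adjust it if the module lives elsewhere in your install.

import torch
from annotator.oneformer.oneformer.utils.box_ops import (
    box_cxcywh_to_xyxy, box_iou, generalized_box_iou,
)

# Two predicted boxes and two reference boxes in (cx, cy, w, h) format.
pred = box_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 0.4, 0.4],
                                        [0.2, 0.2, 0.2, 0.2]]))
gt = box_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 0.5, 0.5],
                                      [0.8, 0.8, 0.2, 0.2]]))

iou, union = box_iou(pred, gt)         # [2, 2] pairwise IoU and union areas
giou = generalized_box_iou(pred, gt)   # [2, 2] pairwise GIoU

print(iou)
print(giou)
# GIoU subtracts the slack of the smallest enclosing box, so it never exceeds IoU.
assert (giou <= iou + 1e-6).all()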
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/events.py
ADDED
@@ -0,0 +1,120 @@
import os
import wandb
from annotator.oneformer.detectron2.utils import comm
from annotator.oneformer.detectron2.utils.events import EventWriter, get_event_storage


def setup_wandb(cfg, args):
    if comm.is_main_process():
        init_args = {
            k.lower(): v
            for k, v in cfg.WANDB.items()
            if isinstance(k, str) and k not in ["config"]
        }
        # only include most related part to avoid too big table
        # TODO: add configurable params to select which part of `cfg` should be saved in config
        if "config_exclude_keys" in init_args:
            init_args["config"] = cfg
            init_args["config"]["cfg_file"] = args.config_file
        else:
            init_args["config"] = {
                "model": cfg.MODEL,
                "solver": cfg.SOLVER,
                "cfg_file": args.config_file,
            }
        if ("name" not in init_args) or (init_args["name"] is None):
            init_args["name"] = os.path.basename(args.config_file)
        else:
            init_args["name"] = init_args["name"] + '_' + os.path.basename(args.config_file)
        wandb.init(**init_args)


class BaseRule(object):
    def __call__(self, target):
        return target


class IsIn(BaseRule):
    def __init__(self, keyword: str):
        self.keyword = keyword

    def __call__(self, target):
        return self.keyword in target


class Prefix(BaseRule):
    def __init__(self, keyword: str):
        self.keyword = keyword

    def __call__(self, target):
        return "/".join([self.keyword, target])


class WandbWriter(EventWriter):
    """
    Write all scalars to wandb.
    """

    def __init__(self):
        """
        Args:
            log_dir (str): the directory to save the output events
            kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)`
        """
        self._last_write = -1
        self._group_rules = [
            (IsIn("/"), BaseRule()),
            (IsIn("loss"), Prefix("train")),
        ]

    def write(self):

        storage = get_event_storage()

        def _group_name(scalar_name):
            for (rule, op) in self._group_rules:
                if rule(scalar_name):
                    return op(scalar_name)
            return scalar_name

        stats = {
            _group_name(name): scalars[0]
            for name, scalars in storage.latest().items()
            if scalars[1] > self._last_write
        }
        if len(stats) > 0:
            self._last_write = max([v[1] for k, v in storage.latest().items()])

        # storage.put_{image,histogram} is only meant to be used by
        # tensorboard writer. So we access its internal fields directly from here.
        if len(storage._vis_data) >= 1:
            stats["image"] = [
                wandb.Image(img, caption=img_name)
                for img_name, img, step_num in storage._vis_data
            ]
            # Storage stores all image data and rely on this writer to clear them.
            # As a result it assumes only one writer will use its image data.
            # An alternative design is to let storage store limited recent
            # data (e.g. only the most recent image) that all writers can access.
            # In that case a writer may not see all image data if its period is long.
            storage.clear_images()

        if len(storage._histograms) >= 1:

            def create_bar(tag, bucket_limits, bucket_counts, **kwargs):
                data = [
                    [label, val] for (label, val) in zip(bucket_limits, bucket_counts)
                ]
                table = wandb.Table(data=data, columns=["label", "value"])
                return wandb.plot.bar(table, "label", "value", title=tag)

            stats["hist"] = [create_bar(**params) for params in storage._histograms]

            storage.clear_histograms()

        if len(stats) == 0:
            return
        wandb.log(stats, step=storage.iter)

    def close(self):
        wandb.finish()
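A standalone sketch of the scalar-name grouping performed in WandbWriter.write() above: names that already contain "/" are kept as-is, and bare loss names are pushed under a "train/" prefix. The rule chain is reimplemented inline here so the snippet runs without wandb or detectron2 installed; it is an illustration, not the writer itself.

rules = [
    (lambda name: "/" in name, lambda name: name),               # IsIn("/") -> BaseRule()
    (lambda name: "loss" in name, lambda name: "train/" + name), # IsIn("loss") -> Prefix("train")
]

def group_name(scalar_name):
    for match, rename in rules:
        if match(scalar_name):
            return rename(scalar_name)
    return scalar_name

print(group_name("loss_ce"))    # -> "train/loss_ce"
print(group_name("val/mIoU"))   # -> "val/mIoU" (already grouped, left alone)
print(group_name("lr"))         # -> "lr" (no rule matches)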
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/misc.py
ADDED
@@ -0,0 +1,197 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py
"""
Misc functions, including distributed helpers.

Mostly copy-paste from torchvision references.
"""
from typing import List, Optional

import torch
import torch.distributed as dist
import torchvision
from torch import Tensor
import warnings
import torch.nn.functional as F
import math

def inverse_sigmoid(x, eps=1e-3):
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1/x2)

def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
                      "The distribution of values may be incorrect.",
                      stacklevel=2)

    with torch.no_grad():
        # Values are generated by using a truncated uniform distribution and
        # then using the inverse CDF for the normal distribution.
        # Get upper and lower cdf values
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)

        # Uniformly fill tensor with values from [l, u], then translate to
        # [2l-1, 2u-1].
        tensor.uniform_(2 * l - 1, 2 * u - 1)

        # Use inverse cdf transform for normal distribution to get truncated
        # standard normal
        tensor.erfinv_()

        # Transform to proper mean, std
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)

        # Clamp to ensure it's in the proper range
        tensor.clamp_(min=a, max=b)
        return tensor

def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    # type: (Tensor, float, float, float, float) -> Tensor
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.
    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)

def resize(input,
           size=None,
           scale_factor=None,
           mode='nearest',
           align_corners=None,
           warning=True):
    if warning:
        if size is not None and align_corners:
            input_h, input_w = tuple(int(x) for x in input.shape[2:])
            output_h, output_w = tuple(int(x) for x in size)
            if output_h > input_h or output_w > output_h:
                if ((output_h > 1 and output_w > 1 and input_h > 1
                     and input_w > 1) and (output_h - 1) % (input_h - 1)
                        and (output_w - 1) % (input_w - 1)):
                    warnings.warn(
                        f'When align_corners={align_corners}, '
                        'the output would more aligned if '
                        f'input size {(input_h, input_w)} is `x+1` and '
                        f'out size {(output_h, output_w)} is `nx+1`')
    if isinstance(size, torch.Size):
        size = tuple(int(x) for x in size)
    return F.interpolate(input, size, scale_factor, mode, align_corners)

def _max_by_axis(the_list):
    # type: (List[List[int]]) -> List[int]
    maxes = the_list[0]
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes


class NestedTensor(object):
    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device):
        # type: (Device) -> NestedTensor # noqa
        cast_tensor = self.tensors.to(device)
        mask = self.mask
        if mask is not None:
            assert mask is not None
            cast_mask = mask.to(device)
        else:
            cast_mask = None
        return NestedTensor(cast_tensor, cast_mask)

    def decompose(self):
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)


def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    # TODO make this more general
    if tensor_list[0].ndim == 3:
        if torchvision._is_tracing():
            # nested_tensor_from_tensor_list() does not export well to ONNX
            # call _onnx_nested_tensor_from_tensor_list() instead
            return _onnx_nested_tensor_from_tensor_list(tensor_list)

        # TODO make it support different-sized images
        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))
        batch_shape = [len(tensor_list)] + max_size
        b, c, h, w = batch_shape
        dtype = tensor_list[0].dtype
        device = tensor_list[0].device
        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
        for img, pad_img, m in zip(tensor_list, tensor, mask):
            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
            m[: img.shape[1], : img.shape[2]] = False
    else:
        raise ValueError("not supported")
    return NestedTensor(tensor, mask)


# _onnx_nested_tensor_from_tensor_list() is an implementation of
# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    max_size = []
    for i in range(tensor_list[0].dim()):
        max_size_i = torch.max(
            torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)
        ).to(torch.int64)
        max_size.append(max_size_i)
    max_size = tuple(max_size)

    # work around for
    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    # m[: img.shape[1], :img.shape[2]] = False
    # which is not yet supported in onnx
    padded_imgs = []
    padded_masks = []
    for img in tensor_list:
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0]))
        padded_imgs.append(padded_img)

        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1)
        padded_masks.append(padded_mask.to(torch.bool))

    tensor = torch.stack(padded_imgs)
    mask = torch.stack(padded_masks)

    return NestedTensor(tensor, mask=mask)


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True
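A hedged usage sketch for nested_tensor_from_tensor_list above: two images of different spatial sizes are zero-padded to a common shape, and the returned mask is True exactly on the padded pixels. The import path is an assumption based on the extension's package layout in this commit.

import torch
from annotator.oneformer.oneformer.utils.misc import nested_tensor_from_tensor_list

imgs = [torch.rand(3, 200, 300), torch.rand(3, 180, 320)]
nt = nested_tensor_from_tensor_list(imgs)
tensors, mask = nt.decompose()

print(tensors.shape)   # torch.Size([2, 3, 200, 320]) -- max height, max width of the batch
print(mask.shape)      # torch.Size([2, 200, 320])
# For the first image, columns 300..319 are padding, so the mask is True there
# and False over the valid 200x300 region.
assert mask[0, :, 300:].all() and not mask[0, :200, :300].any()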
extensions/microsoftexcel-controlnet/annotator/oneformer/oneformer/utils/pos_embed.py
ADDED
@@ -0,0 +1,122 @@
# --------------------------------------------------------
# Position embedding utils
# --------------------------------------------------------

from typing import Tuple

import numpy as np
import torch


# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size, dtype=np.float32)
    grid_w = np.arange(grid_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


# --------------------------------------------------------
# Interpolate position embeddings for high-resolution
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
def interpolate_pos_embed(model, checkpoint_model, pos_embed_key):
    if pos_embed_key in checkpoint_model:
        pos_embed_checkpoint = checkpoint_model[pos_embed_key]
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.num_patches
        if pos_embed_key.startswith("decoder"):
            num_extra_tokens = model.decoder_pos_embed.shape[-2] - num_patches
        else:
            num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches ** 0.5)
        # class_token and dist_token are kept unchanged
        if orig_size != new_size:
            print(
                "Position interpolate from %dx%d to %dx%d"
                % (orig_size, orig_size, new_size, new_size)
            )
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            pos_tokens = pos_tokens.reshape(
                -1, orig_size, orig_size, embedding_size
            ).permute(0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens,
                size=(new_size, new_size),
                mode="bicubic",
                align_corners=False,
            )
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model[pos_embed_key] = new_pos_embed


def interpolate_pos_embed_online(
    pos_embed, orig_size: Tuple[int], new_size: Tuple[int], num_extra_tokens: int
):
    extra_tokens = pos_embed[:, :num_extra_tokens]
    pos_tokens = pos_embed[:, num_extra_tokens:]
    embedding_size = pos_tokens.shape[-1]
    pos_tokens = pos_tokens.reshape(
        -1, orig_size[0], orig_size[1], embedding_size
    ).permute(0, 3, 1, 2)
    pos_tokens = torch.nn.functional.interpolate(
        pos_tokens, size=new_size, mode="bicubic", align_corners=False,
    )
    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
    new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
    return new_pos_embed
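A minimal sketch of how the 2D sin-cos embedding above is assembled: a 1D sine-cosine table is computed for each axis of the patch grid with a geometric frequency schedule, and the two halves are concatenated along the channel dimension. This is a self-contained illustration, not a call into the file above; note that the np.float alias used in get_1d_sincos_pos_embed_from_grid is removed in NumPy >= 1.24, so np.float64 is used here instead.

import numpy as np

def sincos_1d(embed_dim, pos):
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega = 1.0 / 10000 ** (omega / (embed_dim / 2.0))          # geometric frequency schedule
    out = np.einsum("m,d->md", pos.reshape(-1), omega)          # (M, D/2) outer product
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)   # (M, D)

embed_dim, grid_size = 256, 14
grid_w, grid_h = np.meshgrid(np.arange(grid_size), np.arange(grid_size))
# half of the channels encode one grid axis, half encode the other
emb = np.concatenate([sincos_1d(embed_dim // 2, grid_h),
                      sincos_1d(embed_dim // 2, grid_w)], axis=1)
print(emb.shape)  # (196, 256) -- one embedding per patch of a 14x14 grid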
extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/__init__.py
ADDED
@@ -0,0 +1 @@
__author__ = 'tylin'
extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/coco.py
ADDED
@@ -0,0 +1,444 @@
1 |
+
__author__ = 'tylin'
|
2 |
+
__version__ = '2.0'
|
3 |
+
# Interface for accessing the Microsoft COCO dataset.
|
4 |
+
|
5 |
+
# Microsoft COCO is a large image dataset designed for object detection,
|
6 |
+
# segmentation, and caption generation. annotator.oneformer.pycocotools is a Python API that
|
7 |
+
# assists in loading, parsing and visualizing the annotations in COCO.
|
8 |
+
# Please visit http://mscoco.org/ for more information on COCO, including
|
9 |
+
# for the data, paper, and tutorials. The exact format of the annotations
|
10 |
+
# is also described on the COCO website. For example usage of the annotator.oneformer.pycocotools
|
11 |
+
# please see annotator.oneformer.pycocotools_demo.ipynb. In addition to this API, please download both
|
12 |
+
# the COCO images and annotations in order to run the demo.
|
13 |
+
|
14 |
+
# An alternative to using the API is to load the annotations directly
|
15 |
+
# into Python dictionary
|
16 |
+
# Using the API provides additional utility functions. Note that this API
|
17 |
+
# supports both *instance* and *caption* annotations. In the case of
|
18 |
+
# captions not all functions are defined (e.g. categories are undefined).
|
19 |
+
|
20 |
+
# The following API functions are defined:
|
21 |
+
# COCO - COCO api class that loads COCO annotation file and prepare data structures.
|
22 |
+
# decodeMask - Decode binary mask M encoded via run-length encoding.
|
23 |
+
# encodeMask - Encode binary mask M using run-length encoding.
|
24 |
+
# getAnnIds - Get ann ids that satisfy given filter conditions.
|
25 |
+
# getCatIds - Get cat ids that satisfy given filter conditions.
|
26 |
+
# getImgIds - Get img ids that satisfy given filter conditions.
|
27 |
+
# loadAnns - Load anns with the specified ids.
|
28 |
+
# loadCats - Load cats with the specified ids.
|
29 |
+
# loadImgs - Load imgs with the specified ids.
|
30 |
+
# annToMask - Convert segmentation in an annotation to binary mask.
|
31 |
+
# showAnns - Display the specified annotations.
|
32 |
+
# loadRes - Load algorithm results and create API for accessing them.
|
33 |
+
# download - Download COCO images from mscoco.org server.
|
34 |
+
# Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
|
35 |
+
# Help on each functions can be accessed by: "help COCO>function".
|
36 |
+
|
37 |
+
# See also COCO>decodeMask,
|
38 |
+
# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
|
39 |
+
# COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
|
40 |
+
# COCO>loadImgs, COCO>annToMask, COCO>showAnns
|
41 |
+
|
42 |
+
# Microsoft COCO Toolbox. version 2.0
|
43 |
+
# Data, paper, and tutorials available at: http://mscoco.org/
|
44 |
+
# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
|
45 |
+
# Licensed under the Simplified BSD License [see bsd.txt]
|
46 |
+
|
47 |
+
import json
|
48 |
+
import time
|
49 |
+
import numpy as np
|
50 |
+
import copy
|
51 |
+
import itertools
|
52 |
+
from . import mask as maskUtils
|
53 |
+
import os
|
54 |
+
from collections import defaultdict
|
55 |
+
import sys
|
56 |
+
PYTHON_VERSION = sys.version_info[0]
|
57 |
+
if PYTHON_VERSION == 2:
|
58 |
+
from urllib import urlretrieve
|
59 |
+
elif PYTHON_VERSION == 3:
|
60 |
+
from urllib.request import urlretrieve
|
61 |
+
|
62 |
+
|
63 |
+
def _isArrayLike(obj):
|
64 |
+
return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
|
65 |
+
|
66 |
+
|
67 |
+
class COCO:
|
68 |
+
def __init__(self, annotation_file=None):
|
69 |
+
"""
|
70 |
+
Constructor of Microsoft COCO helper class for reading and visualizing annotations.
|
71 |
+
:param annotation_file (str): location of annotation file
|
72 |
+
:param image_folder (str): location to the folder that hosts images.
|
73 |
+
:return:
|
74 |
+
"""
|
75 |
+
# load dataset
|
76 |
+
self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
|
77 |
+
self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
|
78 |
+
if not annotation_file == None:
|
79 |
+
print('loading annotations into memory...')
|
80 |
+
tic = time.time()
|
81 |
+
with open(annotation_file, 'r') as f:
|
82 |
+
dataset = json.load(f)
|
83 |
+
assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
|
84 |
+
print('Done (t={:0.2f}s)'.format(time.time()- tic))
|
85 |
+
self.dataset = dataset
|
86 |
+
self.createIndex()
|
87 |
+
|
88 |
+
def createIndex(self):
|
89 |
+
# create index
|
90 |
+
print('creating index...')
|
91 |
+
anns, cats, imgs = {}, {}, {}
|
92 |
+
imgToAnns,catToImgs = defaultdict(list),defaultdict(list)
|
93 |
+
if 'annotations' in self.dataset:
|
94 |
+
for ann in self.dataset['annotations']:
|
95 |
+
imgToAnns[ann['image_id']].append(ann)
|
96 |
+
anns[ann['id']] = ann
|
97 |
+
|
98 |
+
if 'images' in self.dataset:
|
99 |
+
for img in self.dataset['images']:
|
100 |
+
imgs[img['id']] = img
|
101 |
+
|
102 |
+
if 'categories' in self.dataset:
|
103 |
+
for cat in self.dataset['categories']:
|
104 |
+
cats[cat['id']] = cat
|
105 |
+
|
106 |
+
if 'annotations' in self.dataset and 'categories' in self.dataset:
|
107 |
+
for ann in self.dataset['annotations']:
|
108 |
+
catToImgs[ann['category_id']].append(ann['image_id'])
|
109 |
+
|
110 |
+
print('index created!')
|
111 |
+
|
112 |
+
# create class members
|
113 |
+
self.anns = anns
|
114 |
+
self.imgToAnns = imgToAnns
|
115 |
+
self.catToImgs = catToImgs
|
116 |
+
self.imgs = imgs
|
117 |
+
self.cats = cats
|
118 |
+
|
119 |
+
def info(self):
|
120 |
+
"""
|
121 |
+
Print information about the annotation file.
|
122 |
+
:return:
|
123 |
+
"""
|
124 |
+
for key, value in self.dataset['info'].items():
|
125 |
+
print('{}: {}'.format(key, value))
|
126 |
+
|
127 |
+
def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
|
128 |
+
"""
|
129 |
+
Get ann ids that satisfy given filter conditions. default skips that filter
|
130 |
+
:param imgIds (int array) : get anns for given imgs
|
131 |
+
catIds (int array) : get anns for given cats
|
132 |
+
areaRng (float array) : get anns for given area range (e.g. [0 inf])
|
133 |
+
iscrowd (boolean) : get anns for given crowd label (False or True)
|
134 |
+
:return: ids (int array) : integer array of ann ids
|
135 |
+
"""
|
136 |
+
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
|
137 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
138 |
+
|
139 |
+
if len(imgIds) == len(catIds) == len(areaRng) == 0:
|
140 |
+
anns = self.dataset['annotations']
|
141 |
+
else:
|
142 |
+
if not len(imgIds) == 0:
|
143 |
+
lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
|
144 |
+
anns = list(itertools.chain.from_iterable(lists))
|
145 |
+
else:
|
146 |
+
anns = self.dataset['annotations']
|
147 |
+
anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
|
148 |
+
anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
|
149 |
+
if not iscrowd == None:
|
150 |
+
ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
|
151 |
+
else:
|
152 |
+
ids = [ann['id'] for ann in anns]
|
153 |
+
return ids
|
154 |
+
|
155 |
+
def getCatIds(self, catNms=[], supNms=[], catIds=[]):
|
156 |
+
"""
|
157 |
+
filtering parameters. default skips that filter.
|
158 |
+
:param catNms (str array) : get cats for given cat names
|
159 |
+
:param supNms (str array) : get cats for given supercategory names
|
160 |
+
:param catIds (int array) : get cats for given cat ids
|
161 |
+
:return: ids (int array) : integer array of cat ids
|
162 |
+
"""
|
163 |
+
catNms = catNms if _isArrayLike(catNms) else [catNms]
|
164 |
+
supNms = supNms if _isArrayLike(supNms) else [supNms]
|
165 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
166 |
+
|
167 |
+
if len(catNms) == len(supNms) == len(catIds) == 0:
|
168 |
+
cats = self.dataset['categories']
|
169 |
+
else:
|
170 |
+
cats = self.dataset['categories']
|
171 |
+
cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
|
172 |
+
cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
|
173 |
+
cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
|
174 |
+
ids = [cat['id'] for cat in cats]
|
175 |
+
return ids
|
176 |
+
|
177 |
+
def getImgIds(self, imgIds=[], catIds=[]):
|
178 |
+
'''
|
179 |
+
Get img ids that satisfy given filter conditions.
|
180 |
+
:param imgIds (int array) : get imgs for given ids
|
181 |
+
:param catIds (int array) : get imgs with all given cats
|
182 |
+
:return: ids (int array) : integer array of img ids
|
183 |
+
'''
|
184 |
+
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
|
185 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
186 |
+
|
187 |
+
if len(imgIds) == len(catIds) == 0:
|
188 |
+
ids = self.imgs.keys()
|
189 |
+
else:
|
190 |
+
ids = set(imgIds)
|
191 |
+
for i, catId in enumerate(catIds):
|
192 |
+
if i == 0 and len(ids) == 0:
|
193 |
+
ids = set(self.catToImgs[catId])
|
194 |
+
else:
|
195 |
+
ids &= set(self.catToImgs[catId])
|
196 |
+
return list(ids)
|
197 |
+
|
198 |
+
def loadAnns(self, ids=[]):
|
199 |
+
"""
|
200 |
+
Load anns with the specified ids.
|
201 |
+
:param ids (int array) : integer ids specifying anns
|
202 |
+
:return: anns (object array) : loaded ann objects
|
203 |
+
"""
|
204 |
+
if _isArrayLike(ids):
|
205 |
+
return [self.anns[id] for id in ids]
|
206 |
+
elif type(ids) == int:
|
207 |
+
return [self.anns[ids]]
|
208 |
+
|
209 |
+
def loadCats(self, ids=[]):
|
210 |
+
"""
|
211 |
+
Load cats with the specified ids.
|
212 |
+
:param ids (int array) : integer ids specifying cats
|
213 |
+
:return: cats (object array) : loaded cat objects
|
214 |
+
"""
|
215 |
+
if _isArrayLike(ids):
|
216 |
+
return [self.cats[id] for id in ids]
|
217 |
+
elif type(ids) == int:
|
218 |
+
return [self.cats[ids]]
|
219 |
+
|
220 |
+
def loadImgs(self, ids=[]):
|
221 |
+
"""
|
222 |
+
Load anns with the specified ids.
|
223 |
+
:param ids (int array) : integer ids specifying img
|
224 |
+
:return: imgs (object array) : loaded img objects
|
225 |
+
"""
|
226 |
+
if _isArrayLike(ids):
|
227 |
+
return [self.imgs[id] for id in ids]
|
228 |
+
elif type(ids) == int:
|
229 |
+
return [self.imgs[ids]]
|
230 |
+
|
231 |
+
def showAnns(self, anns, draw_bbox=False):
|
232 |
+
"""
|
233 |
+
Display the specified annotations.
|
234 |
+
:param anns (array of object): annotations to display
|
235 |
+
:return: None
|
236 |
+
"""
|
237 |
+
if len(anns) == 0:
|
238 |
+
return 0
|
239 |
+
if 'segmentation' in anns[0] or 'keypoints' in anns[0]:
|
240 |
+
datasetType = 'instances'
|
241 |
+
elif 'caption' in anns[0]:
|
242 |
+
datasetType = 'captions'
|
243 |
+
else:
|
244 |
+
raise Exception('datasetType not supported')
|
245 |
+
if datasetType == 'instances':
|
246 |
+
import matplotlib.pyplot as plt
|
247 |
+
from matplotlib.collections import PatchCollection
|
248 |
+
from matplotlib.patches import Polygon
|
249 |
+
|
250 |
+
ax = plt.gca()
|
251 |
+
ax.set_autoscale_on(False)
|
252 |
+
polygons = []
|
253 |
+
color = []
|
254 |
+
for ann in anns:
|
255 |
+
c = (np.random.random((1, 3))*0.6+0.4).tolist()[0]
|
256 |
+
if 'segmentation' in ann:
|
257 |
+
if type(ann['segmentation']) == list:
|
258 |
+
# polygon
|
259 |
+
for seg in ann['segmentation']:
|
260 |
+
poly = np.array(seg).reshape((int(len(seg)/2), 2))
|
261 |
+
polygons.append(Polygon(poly))
|
262 |
+
color.append(c)
|
263 |
+
else:
|
264 |
+
# mask
|
265 |
+
t = self.imgs[ann['image_id']]
|
266 |
+
if type(ann['segmentation']['counts']) == list:
|
267 |
+
rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width'])
|
268 |
+
else:
|
269 |
+
rle = [ann['segmentation']]
|
270 |
+
m = maskUtils.decode(rle)
|
271 |
+
img = np.ones( (m.shape[0], m.shape[1], 3) )
|
272 |
+
if ann['iscrowd'] == 1:
|
273 |
+
color_mask = np.array([2.0,166.0,101.0])/255
|
274 |
+
if ann['iscrowd'] == 0:
|
275 |
+
color_mask = np.random.random((1, 3)).tolist()[0]
|
276 |
+
for i in range(3):
|
277 |
+
img[:,:,i] = color_mask[i]
|
278 |
+
ax.imshow(np.dstack( (img, m*0.5) ))
|
279 |
+
if 'keypoints' in ann and type(ann['keypoints']) == list:
|
280 |
+
# turn skeleton into zero-based index
|
281 |
+
sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1
|
282 |
+
kp = np.array(ann['keypoints'])
|
283 |
+
x = kp[0::3]
|
284 |
+
y = kp[1::3]
|
285 |
+
v = kp[2::3]
|
286 |
+
for sk in sks:
|
287 |
+
if np.all(v[sk]>0):
|
288 |
+
plt.plot(x[sk],y[sk], linewidth=3, color=c)
|
289 |
+
plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2)
|
290 |
+
plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2)
|
291 |
+
|
292 |
+
if draw_bbox:
|
293 |
+
[bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox']
|
294 |
+
poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
|
295 |
+
np_poly = np.array(poly).reshape((4,2))
|
296 |
+
polygons.append(Polygon(np_poly))
|
297 |
+
color.append(c)
|
298 |
+
|
299 |
+
p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4)
|
300 |
+
ax.add_collection(p)
|
301 |
+
p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2)
|
302 |
+
ax.add_collection(p)
|
303 |
+
elif datasetType == 'captions':
|
304 |
+
for ann in anns:
|
305 |
+
print(ann['caption'])
|
306 |
+
|
307 |
+
def loadRes(self, resFile):
|
308 |
+
"""
|
309 |
+
Load result file and return a result api object.
|
310 |
+
:param resFile (str) : file name of result file
|
311 |
+
:return: res (obj) : result api object
|
312 |
+
"""
|
313 |
+
res = COCO()
|
314 |
+
res.dataset['images'] = [img for img in self.dataset['images']]
|
315 |
+
|
316 |
+
print('Loading and preparing results...')
|
317 |
+
tic = time.time()
|
318 |
+
if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):
|
319 |
+
with open(resFile) as f:
|
320 |
+
anns = json.load(f)
|
321 |
+
elif type(resFile) == np.ndarray:
|
322 |
+
anns = self.loadNumpyAnnotations(resFile)
|
323 |
+
else:
|
324 |
+
anns = resFile
|
325 |
+
assert type(anns) == list, 'results in not an array of objects'
|
326 |
+
annsImgIds = [ann['image_id'] for ann in anns]
|
327 |
+
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
|
328 |
+
'Results do not correspond to current coco set'
|
329 |
+
if 'caption' in anns[0]:
|
330 |
+
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
|
331 |
+
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
|
332 |
+
for id, ann in enumerate(anns):
|
333 |
+
ann['id'] = id+1
|
334 |
+
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
|
335 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
336 |
+
for id, ann in enumerate(anns):
|
337 |
+
bb = ann['bbox']
|
338 |
+
x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]]
|
339 |
+
if not 'segmentation' in ann:
|
340 |
+
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
|
341 |
+
ann['area'] = bb[2]*bb[3]
|
342 |
+
ann['id'] = id+1
|
343 |
+
ann['iscrowd'] = 0
|
344 |
+
elif 'segmentation' in anns[0]:
|
345 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
346 |
+
for id, ann in enumerate(anns):
|
347 |
+
# now only support compressed RLE format as segmentation results
|
348 |
+
ann['area'] = maskUtils.area(ann['segmentation'])
|
349 |
+
if not 'bbox' in ann:
|
350 |
+
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
|
351 |
+
ann['id'] = id+1
|
352 |
+
ann['iscrowd'] = 0
|
353 |
+
elif 'keypoints' in anns[0]:
|
354 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
355 |
+
for id, ann in enumerate(anns):
|
356 |
+
s = ann['keypoints']
|
357 |
+
x = s[0::3]
|
358 |
+
y = s[1::3]
|
359 |
+
x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y)
|
360 |
+
ann['area'] = (x1-x0)*(y1-y0)
|
361 |
+
ann['id'] = id + 1
|
362 |
+
ann['bbox'] = [x0,y0,x1-x0,y1-y0]
|
363 |
+
print('DONE (t={:0.2f}s)'.format(time.time()- tic))
|
364 |
+
|
365 |
+
res.dataset['annotations'] = anns
|
366 |
+
res.createIndex()
|
367 |
+
return res
|
368 |
+
|
369 |
+
def download(self, tarDir = None, imgIds = [] ):
|
370 |
+
'''
|
371 |
+
Download COCO images from mscoco.org server.
|
372 |
+
:param tarDir (str): COCO results directory name
|
373 |
+
imgIds (list): images to be downloaded
|
374 |
+
:return:
|
375 |
+
'''
|
376 |
+
if tarDir is None:
|
377 |
+
print('Please specify target directory')
|
378 |
+
return -1
|
379 |
+
if len(imgIds) == 0:
|
380 |
+
imgs = self.imgs.values()
|
381 |
+
else:
|
382 |
+
imgs = self.loadImgs(imgIds)
|
383 |
+
N = len(imgs)
|
384 |
+
if not os.path.exists(tarDir):
|
385 |
+
os.makedirs(tarDir)
|
386 |
+
for i, img in enumerate(imgs):
|
387 |
+
tic = time.time()
|
388 |
+
fname = os.path.join(tarDir, img['file_name'])
|
389 |
+
if not os.path.exists(fname):
|
390 |
+
urlretrieve(img['coco_url'], fname)
|
391 |
+
print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
|
392 |
+
|
393 |
+
def loadNumpyAnnotations(self, data):
|
394 |
+
"""
|
395 |
+
Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class}
|
396 |
+
:param data (numpy.ndarray)
|
397 |
+
:return: annotations (python nested list)
|
398 |
+
"""
|
399 |
+
print('Converting ndarray to lists...')
|
400 |
+
assert(type(data) == np.ndarray)
|
401 |
+
print(data.shape)
|
402 |
+
assert(data.shape[1] == 7)
|
403 |
+
N = data.shape[0]
|
404 |
+
ann = []
|
405 |
+
for i in range(N):
|
406 |
+
if i % 1000000 == 0:
|
407 |
+
print('{}/{}'.format(i,N))
|
408 |
+
ann += [{
|
409 |
+
'image_id' : int(data[i, 0]),
|
410 |
+
'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
|
411 |
+
'score' : data[i, 5],
|
412 |
+
'category_id': int(data[i, 6]),
|
413 |
+
}]
|
414 |
+
return ann
|
415 |
+
|
416 |
+
def annToRLE(self, ann):
|
417 |
+
"""
|
418 |
+
Convert annotation which can be polygons, uncompressed RLE to RLE.
|
419 |
+
:return: binary mask (numpy 2D array)
|
420 |
+
"""
|
421 |
+
t = self.imgs[ann['image_id']]
|
422 |
+
h, w = t['height'], t['width']
|
423 |
+
segm = ann['segmentation']
|
424 |
+
if type(segm) == list:
|
425 |
+
# polygon -- a single object might consist of multiple parts
|
426 |
+
# we merge all parts into one mask rle code
|
427 |
+
rles = maskUtils.frPyObjects(segm, h, w)
|
428 |
+
rle = maskUtils.merge(rles)
|
429 |
+
elif type(segm['counts']) == list:
|
430 |
+
# uncompressed RLE
|
431 |
+
rle = maskUtils.frPyObjects(segm, h, w)
|
432 |
+
else:
|
433 |
+
# rle
|
434 |
+
rle = ann['segmentation']
|
435 |
+
return rle
|
436 |
+
|
437 |
+
def annToMask(self, ann):
|
438 |
+
"""
|
439 |
+
Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
|
440 |
+
:return: binary mask (numpy 2D array)
|
441 |
+
"""
|
442 |
+
rle = self.annToRLE(ann)
|
443 |
+
m = maskUtils.decode(rle)
|
444 |
+
return m
|
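A hedged usage sketch for the vendored COCO API defined in coco.py above. The annotation path is hypothetical; point it at any COCO-format instances JSON you actually have. The import path is an assumption based on the extension's package layout in this commit.

from annotator.oneformer.pycocotools.coco import COCO

coco = COCO("annotations/instances_val2017.json")   # hypothetical file location

# Look up a category, find the images that contain it, and load one image's annotations.
cat_ids = coco.getCatIds(catNms=["person"])
img_ids = coco.getImgIds(catIds=cat_ids)
ann_ids = coco.getAnnIds(imgIds=img_ids[:1], catIds=cat_ids, iscrowd=None)
anns = coco.loadAnns(ann_ids)

# annToMask decodes polygon or RLE segmentations into a binary numpy mask of the image size.
mask = coco.annToMask(anns[0])
print(mask.shape, mask.dtype, mask.sum())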
extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/cocoeval.py
ADDED
@@ -0,0 +1,534 @@
1 |
+
__author__ = 'tsungyi'
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import datetime
|
5 |
+
import time
|
6 |
+
from collections import defaultdict
|
7 |
+
from . import mask as maskUtils
|
8 |
+
import copy
|
9 |
+
|
10 |
+
class COCOeval:
|
11 |
+
# Interface for evaluating detection on the Microsoft COCO dataset.
|
12 |
+
#
|
13 |
+
# The usage for CocoEval is as follows:
|
14 |
+
# cocoGt=..., cocoDt=... # load dataset and results
|
15 |
+
# E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object
|
16 |
+
# E.params.recThrs = ...; # set parameters as desired
|
17 |
+
# E.evaluate(); # run per image evaluation
|
18 |
+
# E.accumulate(); # accumulate per image results
|
19 |
+
# E.summarize(); # display summary metrics of results
|
20 |
+
# For example usage see evalDemo.m and http://mscoco.org/.
|
21 |
+
#
|
22 |
+
# The evaluation parameters are as follows (defaults in brackets):
|
23 |
+
# imgIds - [all] N img ids to use for evaluation
|
24 |
+
# catIds - [all] K cat ids to use for evaluation
|
25 |
+
# iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
|
26 |
+
# recThrs - [0:.01:1] R=101 recall thresholds for evaluation
|
27 |
+
# areaRng - [...] A=4 object area ranges for evaluation
|
28 |
+
# maxDets - [1 10 100] M=3 thresholds on max detections per image
|
29 |
+
# iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
|
30 |
+
# iouType replaced the now DEPRECATED useSegm parameter.
|
31 |
+
# useCats - [1] if true use category labels for evaluation
|
32 |
+
# Note: if useCats=0 category labels are ignored as in proposal scoring.
|
33 |
+
# Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
|
34 |
+
#
|
35 |
+
# evaluate(): evaluates detections on every image and every category and
|
36 |
+
# concats the results into the "evalImgs" with fields:
|
37 |
+
# dtIds - [1xD] id for each of the D detections (dt)
|
38 |
+
# gtIds - [1xG] id for each of the G ground truths (gt)
|
39 |
+
# dtMatches - [TxD] matching gt id at each IoU or 0
|
40 |
+
# gtMatches - [TxG] matching dt id at each IoU or 0
|
41 |
+
# dtScores - [1xD] confidence of each dt
|
42 |
+
# gtIgnore - [1xG] ignore flag for each gt
|
43 |
+
# dtIgnore - [TxD] ignore flag for each dt at each IoU
|
44 |
+
#
|
45 |
+
# accumulate(): accumulates the per-image, per-category evaluation
|
46 |
+
# results in "evalImgs" into the dictionary "eval" with fields:
|
47 |
+
# params - parameters used for evaluation
|
48 |
+
# date - date evaluation was performed
|
49 |
+
# counts - [T,R,K,A,M] parameter dimensions (see above)
|
50 |
+
# precision - [TxRxKxAxM] precision for every evaluation setting
|
51 |
+
# recall - [TxKxAxM] max recall for every evaluation setting
|
52 |
+
# Note: precision and recall==-1 for settings with no gt objects.
|
53 |
+
#
|
54 |
+
# See also coco, mask, pycocoDemo, pycocoEvalDemo
|
55 |
+
#
|
56 |
+
# Microsoft COCO Toolbox. version 2.0
|
57 |
+
# Data, paper, and tutorials available at: http://mscoco.org/
|
58 |
+
# Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
|
59 |
+
# Licensed under the Simplified BSD License [see coco/license.txt]
|
60 |
+
def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
|
61 |
+
'''
|
62 |
+
Initialize CocoEval using coco APIs for gt and dt
|
63 |
+
:param cocoGt: coco object with ground truth annotations
|
64 |
+
:param cocoDt: coco object with detection results
|
65 |
+
:return: None
|
66 |
+
'''
|
67 |
+
if not iouType:
|
68 |
+
print('iouType not specified. use default iouType segm')
|
69 |
+
self.cocoGt = cocoGt # ground truth COCO API
|
70 |
+
self.cocoDt = cocoDt # detections COCO API
|
71 |
+
self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements
|
72 |
+
self.eval = {} # accumulated evaluation results
|
73 |
+
self._gts = defaultdict(list) # gt for evaluation
|
74 |
+
self._dts = defaultdict(list) # dt for evaluation
|
75 |
+
self.params = Params(iouType=iouType) # parameters
|
76 |
+
self._paramsEval = {} # parameters for evaluation
|
77 |
+
self.stats = [] # result summarization
|
78 |
+
self.ious = {} # ious between all gts and dts
|
79 |
+
if not cocoGt is None:
|
80 |
+
self.params.imgIds = sorted(cocoGt.getImgIds())
|
81 |
+
self.params.catIds = sorted(cocoGt.getCatIds())
|
82 |
+
|
83 |
+
|
84 |
+
def _prepare(self):
|
85 |
+
'''
|
86 |
+
Prepare ._gts and ._dts for evaluation based on params
|
87 |
+
:return: None
|
88 |
+
'''
|
89 |
+
def _toMask(anns, coco):
|
90 |
+
            # modify ann['segmentation'] by reference
            for ann in anns:
                rle = coco.annToRLE(ann)
                ann['segmentation'] = rle
        p = self.params
        if p.useCats:
            gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
            dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
        else:
            gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # convert ground truth to mask if iouType == 'segm'
        if p.iouType == 'segm':
            _toMask(gts, self.cocoGt)
            _toMask(dts, self.cocoDt)
        # set ignore flag
        for gt in gts:
            gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
            gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
            if p.iouType == 'keypoints':
                gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
        self._gts = defaultdict(list)       # gt for evaluation
        self._dts = defaultdict(list)       # dt for evaluation
        for gt in gts:
            self._gts[gt['image_id'], gt['category_id']].append(gt)
        for dt in dts:
            self._dts[dt['image_id'], dt['category_id']].append(dt)
        self.evalImgs = defaultdict(list)   # per-image per-category evaluation results
        self.eval = {}                      # accumulated evaluation results

    def evaluate(self):
        '''
        Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
        :return: None
        '''
        tic = time.time()
        print('Running per image evaluation...')
        p = self.params
        # add backward compatibility if useSegm is specified in params
        if not p.useSegm is None:
            p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
            print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
        print('Evaluate annotation type *{}*'.format(p.iouType))
        p.imgIds = list(np.unique(p.imgIds))
        if p.useCats:
            p.catIds = list(np.unique(p.catIds))
        p.maxDets = sorted(p.maxDets)
        self.params=p

        self._prepare()
        # loop through images, area range, max detection number
        catIds = p.catIds if p.useCats else [-1]

        if p.iouType == 'segm' or p.iouType == 'bbox':
            computeIoU = self.computeIoU
        elif p.iouType == 'keypoints':
            computeIoU = self.computeOks
        self.ious = {(imgId, catId): computeIoU(imgId, catId) \
                        for imgId in p.imgIds
                        for catId in catIds}

        evaluateImg = self.evaluateImg
        maxDet = p.maxDets[-1]
        self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet)
                         for catId in catIds
                         for areaRng in p.areaRng
                         for imgId in p.imgIds
                         ]
        self._paramsEval = copy.deepcopy(self.params)
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc-tic))

    def computeIoU(self, imgId, catId):
        p = self.params
        if p.useCats:
            gt = self._gts[imgId,catId]
            dt = self._dts[imgId,catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]]
        if len(gt) == 0 and len(dt) ==0:
            return []
        inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
        dt = [dt[i] for i in inds]
        if len(dt) > p.maxDets[-1]:
            dt=dt[0:p.maxDets[-1]]

        if p.iouType == 'segm':
            g = [g['segmentation'] for g in gt]
            d = [d['segmentation'] for d in dt]
        elif p.iouType == 'bbox':
            g = [g['bbox'] for g in gt]
            d = [d['bbox'] for d in dt]
        else:
            raise Exception('unknown iouType for iou computation')

        # compute iou between each dt and gt region
        iscrowd = [int(o['iscrowd']) for o in gt]
        ious = maskUtils.iou(d,g,iscrowd)
        return ious

    def computeOks(self, imgId, catId):
        p = self.params
        # dimension here should be Nxm
        gts = self._gts[imgId, catId]
        dts = self._dts[imgId, catId]
        inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
        dts = [dts[i] for i in inds]
        if len(dts) > p.maxDets[-1]:
            dts = dts[0:p.maxDets[-1]]
        # if len(gts) == 0 and len(dts) == 0:
        if len(gts) == 0 or len(dts) == 0:
            return []
        ious = np.zeros((len(dts), len(gts)))
        sigmas = p.kpt_oks_sigmas
        vars = (sigmas * 2)**2
        k = len(sigmas)
        # compute oks between each detection and ground truth object
        for j, gt in enumerate(gts):
            # create bounds for ignore regions (double the gt bbox)
            g = np.array(gt['keypoints'])
            xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
            k1 = np.count_nonzero(vg > 0)
            bb = gt['bbox']
            x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
            y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
            for i, dt in enumerate(dts):
                d = np.array(dt['keypoints'])
                xd = d[0::3]; yd = d[1::3]
                if k1>0:
                    # measure the per-keypoint distance if keypoints visible
                    dx = xd - xg
                    dy = yd - yg
                else:
                    # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
                    z = np.zeros((k))
                    dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0)
                    dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0)
                e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2
                if k1 > 0:
                    e=e[vg > 0]
                ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
        return ious

    def evaluateImg(self, imgId, catId, aRng, maxDet):
        '''
        perform evaluation for single category and image
        :return: dict (single image results)
        '''
        p = self.params
        if p.useCats:
            gt = self._gts[imgId,catId]
            dt = self._dts[imgId,catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]]
        if len(gt) == 0 and len(dt) ==0:
            return None

        for g in gt:
            if g['ignore'] or (g['area']<aRng[0] or g['area']>aRng[1]):
                g['_ignore'] = 1
            else:
                g['_ignore'] = 0

        # sort dt highest score first, sort gt ignore last
        gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
        gt = [gt[i] for i in gtind]
        dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
        dt = [dt[i] for i in dtind[0:maxDet]]
        iscrowd = [int(o['iscrowd']) for o in gt]
        # load computed ious
        ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId]

        T = len(p.iouThrs)
        G = len(gt)
        D = len(dt)
        gtm = np.zeros((T,G))
        dtm = np.zeros((T,D))
        gtIg = np.array([g['_ignore'] for g in gt])
        dtIg = np.zeros((T,D))
        if not len(ious)==0:
            for tind, t in enumerate(p.iouThrs):
                for dind, d in enumerate(dt):
                    # information about best match so far (m=-1 -> unmatched)
                    iou = min([t,1-1e-10])
                    m = -1
                    for gind, g in enumerate(gt):
                        # if this gt already matched, and not a crowd, continue
                        if gtm[tind,gind]>0 and not iscrowd[gind]:
                            continue
                        # if dt matched to reg gt, and on ignore gt, stop
                        if m>-1 and gtIg[m]==0 and gtIg[gind]==1:
                            break
                        # continue to next gt unless better match made
                        if ious[dind,gind] < iou:
                            continue
                        # if match successful and best so far, store appropriately
                        iou=ious[dind,gind]
                        m=gind
                    # if match made store id of match for both dt and gt
                    if m ==-1:
                        continue
                    dtIg[tind,dind] = gtIg[m]
                    dtm[tind,dind] = gt[m]['id']
                    gtm[tind,m] = d['id']
        # set unmatched detections outside of area range to ignore
        a = np.array([d['area']<aRng[0] or d['area']>aRng[1] for d in dt]).reshape((1, len(dt)))
        dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0)))
        # store results for given image and category
        return {
                'image_id': imgId,
                'category_id': catId,
                'aRng': aRng,
                'maxDet': maxDet,
                'dtIds': [d['id'] for d in dt],
                'gtIds': [g['id'] for g in gt],
                'dtMatches': dtm,
                'gtMatches': gtm,
                'dtScores': [d['score'] for d in dt],
                'gtIgnore': gtIg,
                'dtIgnore': dtIg,
            }

    def accumulate(self, p = None):
        '''
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        '''
        print('Accumulating evaluation results...')
        tic = time.time()
        if not self.evalImgs:
            print('Please run evaluate() first')
        # allows input customized parameters
        if p is None:
            p = self.params
        p.catIds = p.catIds if p.useCats == 1 else [-1]
        T = len(p.iouThrs)
        R = len(p.recThrs)
        K = len(p.catIds) if p.useCats else 1
        A = len(p.areaRng)
        M = len(p.maxDets)
        precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
        recall = -np.ones((T,K,A,M))
        scores = -np.ones((T,R,K,A,M))

        # create dictionary for future indexing
        _pe = self._paramsEval
        catIds = _pe.catIds if _pe.useCats else [-1]
        setK = set(catIds)
        setA = set(map(tuple, _pe.areaRng))
        setM = set(_pe.maxDets)
        setI = set(_pe.imgIds)
        # get inds to evaluate
        k_list = [n for n, k in enumerate(p.catIds) if k in setK]
        m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
        a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
        i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
        I0 = len(_pe.imgIds)
        A0 = len(_pe.areaRng)
        # retrieve E at each category, area range, and max number of detections
        for k, k0 in enumerate(k_list):
            Nk = k0*A0*I0
            for a, a0 in enumerate(a_list):
                Na = a0*I0
                for m, maxDet in enumerate(m_list):
                    E = [self.evalImgs[Nk + Na + i] for i in i_list]
                    E = [e for e in E if not e is None]
                    if len(E) == 0:
                        continue
                    dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])

                    # different sorting methods generate slightly different results.
                    # mergesort is used to be consistent with the Matlab implementation.
                    inds = np.argsort(-dtScores, kind='mergesort')
                    dtScoresSorted = dtScores[inds]

                    dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
                    dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
                    gtIg = np.concatenate([e['gtIgnore'] for e in E])
                    npig = np.count_nonzero(gtIg==0 )
                    if npig == 0:
                        continue
                    tps = np.logical_and( dtm, np.logical_not(dtIg) )
                    fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )

                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
                        tp = np.array(tp)
                        fp = np.array(fp)
                        nd = len(tp)
                        rc = tp / npig
                        pr = tp / (fp+tp+np.spacing(1))
                        q = np.zeros((R,))
                        ss = np.zeros((R,))

                        if nd:
                            recall[t,k,a,m] = rc[-1]
                        else:
                            recall[t,k,a,m] = 0

                        # numpy is slow without cython optimization for accessing elements
                        # use python array gets significant speed improvement
                        pr = pr.tolist(); q = q.tolist()

                        for i in range(nd-1, 0, -1):
                            if pr[i] > pr[i-1]:
                                pr[i-1] = pr[i]

                        inds = np.searchsorted(rc, p.recThrs, side='left')
                        try:
                            for ri, pi in enumerate(inds):
                                q[ri] = pr[pi]
                                ss[ri] = dtScoresSorted[pi]
                        except:
                            pass
                        precision[t,:,k,a,m] = np.array(q)
                        scores[t,:,k,a,m] = np.array(ss)
        self.eval = {
            'params': p,
            'counts': [T, R, K, A, M],
            'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'precision': precision,
            'recall': recall,
            'scores': scores,
        }
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format( toc-tic))

    def summarize(self):
        '''
        Compute and display summary metrics for evaluation results.
        Note this function can *only* be applied on the default parameter setting
        '''
        def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
            p = self.params
            iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
            titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
            typeStr = '(AP)' if ap==1 else '(AR)'
            iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
                if iouThr is None else '{:0.2f}'.format(iouThr)

            aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
            if ap == 1:
                # dimension of precision: [TxRxKxAxM]
                s = self.eval['precision']
                # IoU
                if iouThr is not None:
                    t = np.where(iouThr == p.iouThrs)[0]
                    s = s[t]
                s = s[:,:,:,aind,mind]
            else:
                # dimension of recall: [TxKxAxM]
                s = self.eval['recall']
                if iouThr is not None:
                    t = np.where(iouThr == p.iouThrs)[0]
                    s = s[t]
                s = s[:,:,aind,mind]
            if len(s[s>-1])==0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s>-1])
            print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
            return mean_s
        def _summarizeDets():
            stats = np.zeros((12,))
            stats[0] = _summarize(1)
            stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
            stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2])
            stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2])
            stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2])
            stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2])
            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
            stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2])
            stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2])
            stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2])
            return stats
        def _summarizeKps():
            stats = np.zeros((10,))
            stats[0] = _summarize(1, maxDets=20)
            stats[1] = _summarize(1, maxDets=20, iouThr=.5)
            stats[2] = _summarize(1, maxDets=20, iouThr=.75)
            stats[3] = _summarize(1, maxDets=20, areaRng='medium')
            stats[4] = _summarize(1, maxDets=20, areaRng='large')
            stats[5] = _summarize(0, maxDets=20)
            stats[6] = _summarize(0, maxDets=20, iouThr=.5)
            stats[7] = _summarize(0, maxDets=20, iouThr=.75)
            stats[8] = _summarize(0, maxDets=20, areaRng='medium')
            stats[9] = _summarize(0, maxDets=20, areaRng='large')
            return stats
        if not self.eval:
            raise Exception('Please run accumulate() first')
        iouType = self.params.iouType
        if iouType == 'segm' or iouType == 'bbox':
            summarize = _summarizeDets
        elif iouType == 'keypoints':
            summarize = _summarizeKps
        self.stats = summarize()

    def __str__(self):
        self.summarize()

class Params:
    '''
    Params for coco evaluation api
    '''
    def setDetParams(self):
        self.imgIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [1, 10, 100]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'small', 'medium', 'large']
        self.useCats = 1

    def setKpParams(self):
        self.imgIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [20]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'medium', 'large']
        self.useCats = 1
        self.kpt_oks_sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0

    def __init__(self, iouType='segm'):
        if iouType == 'segm' or iouType == 'bbox':
            self.setDetParams()
        elif iouType == 'keypoints':
            self.setKpParams()
        else:
            raise Exception('iouType not supported')
        self.iouType = iouType
        # useSegm is deprecated
        self.useSegm = None
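The evaluator above follows the stock pycocotools control flow: evaluate() does per-image matching, accumulate() builds the precision/recall surfaces, summarize() prints the standard metrics. For orientation, here is a minimal driver sketch; it targets the upstream pycocotools package rather than this vendored copy (whose mask module, in the next file, stubs out the compiled _mask extension), and the annotation/result file names are hypothetical.

# Sketch only: assumes the real, compiled pycocotools is installed and that
# 'instances_val.json' / 'detections.json' (hypothetical files) exist.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO('instances_val.json')            # ground-truth annotations
coco_dt = coco_gt.loadRes('detections.json')    # detections in COCO result format

E = COCOeval(coco_gt, coco_dt, iouType='bbox')
E.params.imgIds = coco_gt.getImgIds()           # optionally restrict the image set
E.evaluate()      # per-image, per-category matching (fills E.evalImgs)
E.accumulate()    # precision/recall over IoU thresholds, areas, maxDets
E.summarize()     # prints the 12 AP/AR numbers and stores them in E.stats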
extensions/microsoftexcel-controlnet/annotator/oneformer/pycocotools/mask.py
ADDED
@@ -0,0 +1,107 @@
__author__ = 'tsungyi'

# import annotator.oneformer.pycocotools._mask as _mask

# Interface for manipulating masks stored in RLE format.
#
# RLE is a simple yet efficient format for storing binary masks. RLE
# first divides a vector (or vectorized image) into a series of piecewise
# constant regions and then for each piece simply stores the length of
# that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would
# be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1]
# (note that the odd counts are always the numbers of zeros). Instead of
# storing the counts directly, additional compression is achieved with a
# variable bitrate representation based on a common scheme called LEB128.
#
# Compression is greatest given large piecewise constant regions.
# Specifically, the size of the RLE is proportional to the number of
# *boundaries* in M (or for an image the number of boundaries in the y
# direction). Assuming fairly simple shapes, the RLE representation is
# O(sqrt(n)) where n is number of pixels in the object. Hence space usage
# is substantially lower, especially for large simple objects (large n).
#
# Many common operations on masks can be computed directly using the RLE
# (without need for decoding). This includes computations such as area,
# union, intersection, etc. All of these operations are linear in the
# size of the RLE, in other words they are O(sqrt(n)) where n is the area
# of the object. Computing these operations on the original mask is O(n).
# Thus, using the RLE can result in substantial computational savings.
#
# The following API functions are defined:
#  encode      - Encode binary masks using RLE.
#  decode      - Decode binary masks encoded via RLE.
#  merge       - Compute union or intersection of encoded masks.
#  iou         - Compute intersection over union between masks.
#  area        - Compute area of encoded masks.
#  toBbox      - Get bounding boxes surrounding encoded masks.
#  frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask.
#
# Usage:
#  Rs    = encode( masks )
#  masks = decode( Rs )
#  R     = merge( Rs, intersect=false )
#  o     = iou( dt, gt, iscrowd )
#  a     = area( Rs )
#  bbs   = toBbox( Rs )
#  Rs    = frPyObjects( [pyObjects], h, w )
#
# In the API the following formats are used:
#  Rs      - [dict] Run-length encoding of binary masks
#  R       - dict Run-length encoding of binary mask
#  masks   - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order)
#  iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore
#  bbs     - [nx4] Bounding box(es) stored as [x y w h]
#  poly    - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list)
#  dt,gt   - May be either bounding boxes or encoded masks
# Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel).
#
# Finally, a note about the intersection over union (iou) computation.
# The standard iou of a ground truth (gt) and detected (dt) object is
#  iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt))
# For "crowd" regions, we use a modified criteria. If a gt object is
# marked as "iscrowd", we allow a dt to match any subregion of the gt.
# Choosing gt' in the crowd gt that best matches the dt can be done using
# gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing
#  iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt)
# For crowd gt regions we use this modified criteria above for the iou.
#
# To compile run "python setup.py build_ext --inplace"
# Please do not contact us for help with compiling.
#
# Microsoft COCO Toolbox.      version 2.0
# Data, paper, and tutorials available at: http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
# Licensed under the Simplified BSD License [see coco/license.txt]

# iou = _mask.iou
# merge = _mask.merge
# frPyObjects = _mask.frPyObjects

def encode(bimask):
    pass
    # if len(bimask.shape) == 3:
    #     return _mask.encode(bimask)
    # elif len(bimask.shape) == 2:
    #     h, w = bimask.shape
    #     return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0]

def decode(rleObjs):
    pass
    # if type(rleObjs) == list:
    #     return _mask.decode(rleObjs)
    # else:
    #     return _mask.decode([rleObjs])[:,:,0]

def area(rleObjs):
    pass
    # if type(rleObjs) == list:
    #     return _mask.area(rleObjs)
    # else:
    #     return _mask.area([rleObjs])[0]

def toBbox(rleObjs):
    pass
    # if type(rleObjs) == list:
    #     return _mask.toBbox(rleObjs)
    # else:
    #     return _mask.toBbox([rleObjs])[0]
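Because the compiled _mask extension is not vendored, the four functions above are stubs that return None. For reference, a small sketch of the behaviour they mirror in upstream pycocotools is shown below; it requires the real, compiled pycocotools package, and the mask size is purely illustrative.

# Sketch only: upstream pycocotools API, not runnable against this vendored stub.
import numpy as np
from pycocotools import mask as mask_utils

m = np.zeros((64, 64), dtype=np.uint8)
m[16:48, 16:48] = 1                             # a 32x32 square object
rle = mask_utils.encode(np.asfortranarray(m))   # RLE dict with 'size' and 'counts'
print(mask_utils.area(rle))                     # 1024
print(mask_utils.toBbox(rle))                   # [16. 16. 32. 32.]  (x, y, w, h)
restored = mask_utils.decode(rle)               # back to the binary mask
assert (restored == m).all()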
extensions/microsoftexcel-controlnet/annotator/openpose/LICENSE
ADDED
@@ -0,0 +1,108 @@
OPENPOSE: MULTIPERSON KEYPOINT DETECTION
SOFTWARE LICENSE AGREEMENT
ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY

BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.

This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.

RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).

CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.

COPYRIGHT: The Software is owned by Licensor and is protected by United
States copyright laws and applicable international treaties and/or conventions.

PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.

DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement.

BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies.

USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “OpenPose", "Carnegie Mellon" or any renditions thereof without the prior written permission of Licensor.

You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software.

ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void.

TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below.

The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement.

FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement.

DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.

SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.

EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.

EXPORT REGULATION: Licensee agrees to comply with any and all applicable
U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.

SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.

NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor.

GOVERNING LAW: This Agreement shall be construed and enforced in accordance with the laws of the Commonwealth of Pennsylvania without reference to conflict of laws principles. You consent to the personal jurisdiction of the courts of this County and waive their rights to venue outside of Allegheny County, Pennsylvania.

ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto.


************************************************************************

THIRD-PARTY SOFTWARE NOTICES AND INFORMATION

This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.

1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)

COPYRIGHT

All contributions by the University of California:
Copyright (c) 2014-2017 The Regents of the University of California (Regents)
All rights reserved.

All other contributions:
Copyright (c) 2014-2017, the respective contributors
All rights reserved.

Caffe uses a shared copyright model: each contributor holds copyright over
their contributions to Caffe. The project versioning records all such
contribution and copyright details. If a contributor wants to further mark
their specific copyright on a particular contribution, they should indicate
their copyright solely in the commit message of the change when it is
committed.

LICENSE

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CONTRIBUTION AGREEMENT

By contributing to the BVLC/caffe repository through pull-request, comment,
or otherwise, the contributor releases their content to the
license and copyright terms herein.

************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
extensions/microsoftexcel-controlnet/annotator/openpose/__init__.py
ADDED
@@ -0,0 +1,262 @@
# Openpose
# Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
# 2nd Edited by https://github.com/Hzzone/pytorch-openpose
# 3rd Edited by ControlNet
# 4th Edited by ControlNet (added face and correct hands)
# 5th Edited by ControlNet (Improved JSON serialization/deserialization, and lots of bug fixes)
# This preprocessor is licensed by CMU for non-commercial use only.


import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import json
import torch
import numpy as np
from . import util
from .body import Body, BodyResult, Keypoint
from .hand import Hand
from .face import Face
from modules import devices
from annotator.annotator_path import models_path

from typing import NamedTuple, Tuple, List, Callable, Union

body_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/body_pose_model.pth"
hand_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/hand_pose_model.pth"
face_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/facenet.pth"

HandResult = List[Keypoint]
FaceResult = List[Keypoint]

class PoseResult(NamedTuple):
    body: BodyResult
    left_hand: Union[HandResult, None]
    right_hand: Union[HandResult, None]
    face: Union[FaceResult, None]

def draw_poses(poses: List[PoseResult], H, W, draw_body=True, draw_hand=True, draw_face=True):
    """
    Draw the detected poses on an empty canvas.

    Args:
        poses (List[PoseResult]): A list of PoseResult objects containing the detected poses.
        H (int): The height of the canvas.
        W (int): The width of the canvas.
        draw_body (bool, optional): Whether to draw body keypoints. Defaults to True.
        draw_hand (bool, optional): Whether to draw hand keypoints. Defaults to True.
        draw_face (bool, optional): Whether to draw face keypoints. Defaults to True.

    Returns:
        numpy.ndarray: A 3D numpy array representing the canvas with the drawn poses.
    """
    canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)

    for pose in poses:
        if draw_body:
            canvas = util.draw_bodypose(canvas, pose.body.keypoints)

        if draw_hand:
            canvas = util.draw_handpose(canvas, pose.left_hand)
            canvas = util.draw_handpose(canvas, pose.right_hand)

        if draw_face:
            canvas = util.draw_facepose(canvas, pose.face)

    return canvas

def encode_poses_as_json(poses: List[PoseResult], canvas_height: int, canvas_width: int) -> str:
    """ Encode the pose as a JSON string following openpose JSON output format:
    https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/02_output.md
    """
    def compress_keypoints(keypoints: Union[List[Keypoint], None]) -> Union[List[float], None]:
        if not keypoints:
            return None

        return [
            value
            for keypoint in keypoints
            for value in (
                [float(keypoint.x), float(keypoint.y), 1.0]
                if keypoint is not None
                else [0.0, 0.0, 0.0]
            )
        ]

    return json.dumps({
        'people': [
            {
                'pose_keypoints_2d': compress_keypoints(pose.body.keypoints),
                "face_keypoints_2d": compress_keypoints(pose.face),
                "hand_left_keypoints_2d": compress_keypoints(pose.left_hand),
                "hand_right_keypoints_2d": compress_keypoints(pose.right_hand),
            }
            for pose in poses
        ],
        'canvas_height': canvas_height,
        'canvas_width': canvas_width,
    }, indent=4)


class OpenposeDetector:
    """
    A class for detecting human poses in images using the Openpose model.

    Attributes:
        model_dir (str): Path to the directory where the pose models are stored.
    """
    model_dir = os.path.join(models_path, "openpose")

    def __init__(self):
        self.device = devices.get_device_for("controlnet")
        self.body_estimation = None
        self.hand_estimation = None
        self.face_estimation = None

    def load_model(self):
        """
        Load the Openpose body, hand, and face models.
        """
        body_modelpath = os.path.join(self.model_dir, "body_pose_model.pth")
        hand_modelpath = os.path.join(self.model_dir, "hand_pose_model.pth")
        face_modelpath = os.path.join(self.model_dir, "facenet.pth")

        if not os.path.exists(body_modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(body_model_path, model_dir=self.model_dir)

        if not os.path.exists(hand_modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(hand_model_path, model_dir=self.model_dir)

        if not os.path.exists(face_modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(face_model_path, model_dir=self.model_dir)

        self.body_estimation = Body(body_modelpath)
        self.hand_estimation = Hand(hand_modelpath)
        self.face_estimation = Face(face_modelpath)

    def unload_model(self):
        """
        Unload the Openpose models by moving them to the CPU.
        """
        if self.body_estimation is not None:
            self.body_estimation.model.to("cpu")
            self.hand_estimation.model.to("cpu")
            self.face_estimation.model.to("cpu")

    def detect_hands(self, body: BodyResult, oriImg) -> Tuple[Union[HandResult, None], Union[HandResult, None]]:
        left_hand = None
        right_hand = None
        H, W, _ = oriImg.shape
        for x, y, w, is_left in util.handDetect(body, oriImg):
            peaks = self.hand_estimation(oriImg[y:y+w, x:x+w, :]).astype(np.float32)
            if peaks.ndim == 2 and peaks.shape[1] == 2:
                peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
                peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)

                hand_result = [
                    Keypoint(x=peak[0], y=peak[1])
                    for peak in peaks
                ]

                if is_left:
                    left_hand = hand_result
                else:
                    right_hand = hand_result

        return left_hand, right_hand

    def detect_face(self, body: BodyResult, oriImg) -> Union[FaceResult, None]:
        face = util.faceDetect(body, oriImg)
        if face is None:
            return None

        x, y, w = face
        H, W, _ = oriImg.shape
        heatmaps = self.face_estimation(oriImg[y:y+w, x:x+w, :])
        peaks = self.face_estimation.compute_peaks_from_heatmaps(heatmaps).astype(np.float32)
        if peaks.ndim == 2 and peaks.shape[1] == 2:
            peaks[:, 0] = np.where(peaks[:, 0] < 1e-6, -1, peaks[:, 0] + x) / float(W)
            peaks[:, 1] = np.where(peaks[:, 1] < 1e-6, -1, peaks[:, 1] + y) / float(H)
            return [
                Keypoint(x=peak[0], y=peak[1])
                for peak in peaks
            ]

        return None

    def detect_poses(self, oriImg, include_hand=False, include_face=False) -> List[PoseResult]:
        """
        Detect poses in the given image.
        Args:
            oriImg (numpy.ndarray): The input image for pose detection.
            include_hand (bool, optional): Whether to include hand detection. Defaults to False.
            include_face (bool, optional): Whether to include face detection. Defaults to False.

        Returns:
            List[PoseResult]: A list of PoseResult objects containing the detected poses.
        """
        if self.body_estimation is None:
            self.load_model()

        self.body_estimation.model.to(self.device)
        self.hand_estimation.model.to(self.device)
        self.face_estimation.model.to(self.device)

        self.body_estimation.cn_device = self.device
        self.hand_estimation.cn_device = self.device
        self.face_estimation.cn_device = self.device

        oriImg = oriImg[:, :, ::-1].copy()
        H, W, C = oriImg.shape
        with torch.no_grad():
            candidate, subset = self.body_estimation(oriImg)
            bodies = self.body_estimation.format_body_result(candidate, subset)

            results = []
            for body in bodies:
                left_hand, right_hand, face = (None,) * 3
                if include_hand:
                    left_hand, right_hand = self.detect_hands(body, oriImg)
                if include_face:
                    face = self.detect_face(body, oriImg)

                results.append(PoseResult(BodyResult(
                    keypoints=[
                        Keypoint(
                            x=keypoint.x / float(W),
                            y=keypoint.y / float(H)
                        ) if keypoint is not None else None
                        for keypoint in body.keypoints
                    ],
                    total_score=body.total_score,
                    total_parts=body.total_parts
                ), left_hand, right_hand, face))

            return results

    def __call__(
            self, oriImg, include_body=True, include_hand=False, include_face=False,
            json_pose_callback: Callable[[str], None] = None,
    ):
        """
        Detect and draw poses in the given image.

        Args:
            oriImg (numpy.ndarray): The input image for pose detection and drawing.
            include_body (bool, optional): Whether to include body keypoints. Defaults to True.
            include_hand (bool, optional): Whether to include hand keypoints. Defaults to False.
            include_face (bool, optional): Whether to include face keypoints. Defaults to False.
            json_pose_callback (Callable, optional): A callback that accepts the pose JSON string.

        Returns:
            numpy.ndarray: The image with detected and drawn poses.
        """
        H, W, _ = oriImg.shape
        poses = self.detect_poses(oriImg, include_hand, include_face)
        if json_pose_callback:
            json_pose_callback(encode_poses_as_json(poses, H, W))
        return draw_poses(poses, H, W, draw_body=include_body, draw_hand=include_hand, draw_face=include_face)
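A brief usage sketch for the detector defined above. It assumes the code runs inside the webui process (the module imports `modules.devices`) and that the extension's `annotator` package is importable; the image path, output file, and callback are hypothetical.

# Sketch only: webui-hosted usage of OpenposeDetector (paths are illustrative).
import cv2
from annotator.openpose import OpenposeDetector

detector = OpenposeDetector()

def on_json(pose_json: str):
    # receives the OpenPose-format JSON produced by encode_poses_as_json()
    print(pose_json[:200])

img = cv2.imread("person.jpg")                  # HxWx3, BGR as returned by cv2
pose_map = detector(img, include_body=True, include_hand=True, include_face=False,
                    json_pose_callback=on_json)
detector.unload_model()                         # move models back to CPU when done
cv2.imwrite("pose_map.png", pose_map)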
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (9.36 kB)
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/body.cpython-310.pyc
ADDED
Binary file (9.4 kB)
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/face.cpython-310.pyc
ADDED
Binary file (8.11 kB)
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/hand.cpython-310.pyc
ADDED
Binary file (3.18 kB)
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/model.cpython-310.pyc
ADDED
Binary file (6.22 kB)
extensions/microsoftexcel-controlnet/annotator/openpose/__pycache__/util.cpython-310.pyc
ADDED
Binary file (11.6 kB)
extensions/microsoftexcel-controlnet/annotator/openpose/body.py
ADDED
@@ -0,0 +1,278 @@
import cv2
import numpy as np
import math
import time
from scipy.ndimage.filters import gaussian_filter
import matplotlib.pyplot as plt
import matplotlib
import torch
from torchvision import transforms
from typing import NamedTuple, List, Union

from . import util
from .model import bodypose_model

class Keypoint(NamedTuple):
    x: float
    y: float
    score: float = 1.0
    id: int = -1


class BodyResult(NamedTuple):
    # Note: Using `Union` instead of `|` operator as the latter is a Python
    # 3.10 feature.
    # Annotator code should be Python 3.8 Compatible, as controlnet repo uses
    # Python 3.8 environment.
    # https://github.com/lllyasviel/ControlNet/blob/d3284fcd0972c510635a4f5abe2eeb71dc0de524/environment.yaml#L6
    keypoints: List[Union[Keypoint, None]]
    total_score: float
    total_parts: int


class Body(object):
    def __init__(self, model_path):
        self.model = bodypose_model()
        # if torch.cuda.is_available():
        #     self.model = self.model.cuda()
        #     print('cuda')
        model_dict = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def __call__(self, oriImg):
        # scale_search = [0.5, 1.0, 1.5, 2.0]
        scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre1 = 0.1
        thre2 = 0.05
        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
        paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))

        for m in range(len(multiplier)):
            scale = multiplier[m]
            imageToTest = util.smart_resize_k(oriImg, fx=scale, fy=scale)
            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            if torch.cuda.is_available():
                data = data.cuda()
            # data = data.permute([2, 0, 1]).unsqueeze(0).float()
            with torch.no_grad():
                data = data.to(self.cn_device)
                Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
                Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
                Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()

            # extract outputs, resize, and remove padding
            # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0))  # output 1 is heatmaps
            heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0))  # output 1 is heatmaps
            heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = util.smart_resize(heatmap, (oriImg.shape[0], oriImg.shape[1]))

            # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0))  # output 0 is PAFs
            paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0))  # output 0 is PAFs
            paf = util.smart_resize_k(paf, fx=stride, fy=stride)
            paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            paf = util.smart_resize(paf, (oriImg.shape[0], oriImg.shape[1]))

            heatmap_avg += heatmap_avg + heatmap / len(multiplier)
            paf_avg += + paf / len(multiplier)

        all_peaks = []
        peak_counter = 0

        for part in range(18):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)

            map_left = np.zeros(one_heatmap.shape)
            map_left[1:, :] = one_heatmap[:-1, :]
            map_right = np.zeros(one_heatmap.shape)
            map_right[:-1, :] = one_heatmap[1:, :]
            map_up = np.zeros(one_heatmap.shape)
            map_up[:, 1:] = one_heatmap[:, :-1]
            map_down = np.zeros(one_heatmap.shape)
            map_down[:, :-1] = one_heatmap[:, 1:]

            peaks_binary = np.logical_and.reduce(
                (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
            peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0]))  # note reverse
            peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
            peak_id = range(peak_counter, peak_counter + len(peaks))
            peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]

            all_peaks.append(peaks_with_score_and_id)
            peak_counter += len(peaks)

        # find connection in the specified sequence, center 29 is in the position 15
        limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
                   [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
                   [1, 16], [16, 18], [3, 17], [6, 18]]
        # the middle joints heatmap correspondence
        mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
                  [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
                  [55, 56], [37, 38], [45, 46]]

        connection_all = []
        special_k = []
        mid_num = 10

        for k in range(len(mapIdx)):
            score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
            candA = all_peaks[limbSeq[k][0] - 1]
            candB = all_peaks[limbSeq[k][1] - 1]
            nA = len(candA)
            nB = len(candB)
            indexA, indexB = limbSeq[k]
            if (nA != 0 and nB != 0):
                connection_candidate = []
                for i in range(nA):
                    for j in range(nB):
                        vec = np.subtract(candB[j][:2], candA[i][:2])
                        norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                        norm = max(0.001, norm)
                        vec = np.divide(vec, norm)

                        startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
                                            np.linspace(candA[i][1], candB[j][1], num=mid_num)))

                        vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
                                          for I in range(len(startend))])
                        vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
                                          for I in range(len(startend))])

                        score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
                        score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
                            0.5 * oriImg.shape[0] / norm - 1, 0)
                        criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
                        criterion2 = score_with_dist_prior > 0
                        if criterion1 and criterion2:
                            connection_candidate.append(
                                [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])

                connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
                connection = np.zeros((0, 5))
                for c in range(len(connection_candidate)):
                    i, j, s = connection_candidate[c][0:3]
                    if (i not in connection[:, 3] and j not in connection[:, 4]):
                        connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
                        if (len(connection) >= min(nA, nB)):
                            break

                connection_all.append(connection)
            else:
                special_k.append(k)
                connection_all.append([])

        # last number in each row is the total parts number of that person
        # the second last number in each row is the score of the overall configuration
        subset = -1 * np.ones((0, 20))
        candidate = np.array([item for sublist in all_peaks for item in sublist])

        for k in range(len(mapIdx)):
            if k not in special_k:
                partAs = connection_all[k][:, 0]
                partBs = connection_all[k][:, 1]
                indexA, indexB = np.array(limbSeq[k]) - 1

                for i in range(len(connection_all[k])):  # = 1:size(temp,1)
                    found = 0
                    subset_idx = [-1, -1]
                    for j in range(len(subset)):  # 1:size(subset,1):
                        if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
                            subset_idx[found] = j
                            found += 1

                    if found == 1:
                        j = subset_idx[0]
                        if subset[j][indexB] != partBs[i]:
                            subset[j][indexB] = partBs[i]
                            subset[j][-1] += 1
                            subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
                    elif found == 2:  # if found 2 and disjoint, merge them
                        j1, j2 = subset_idx
                        membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
                        if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                            subset[j1][:-2] += (subset[j2][:-2] + 1)
                            subset[j1][-2:] += subset[j2][-2:]
                            subset[j1][-2] += connection_all[k][i][2]
                            subset = np.delete(subset, j2, 0)
                        else:  # as like found == 1
                            subset[j1][indexB] = partBs[i]
                            subset[j1][-1] += 1
                            subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]

                    # if find no partA in the subset, create a new subset
                    elif not found and k < 17:
                        row = -1 * np.ones(20)
                        row[indexA] = partAs[i]
                        row[indexB] = partBs[i]
                        row[-1] = 2
                        row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
                        subset = np.vstack([subset, row])
        # delete some rows of subset which has few parts occur
        deleteIdx = []
        for i in range(len(subset)):
            if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
                deleteIdx.append(i)
        subset = np.delete(subset, deleteIdx, axis=0)

        # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
        # candidate: x, y, score, id
        return candidate, subset

    @staticmethod
    def format_body_result(candidate: np.ndarray, subset: np.ndarray) -> List[BodyResult]:
        """
        Format the body results from the candidate and subset arrays into a list of BodyResult objects.

        Args:
            candidate (np.ndarray): An array of candidates containing the x, y coordinates, score, and id
                for each body part.
            subset (np.ndarray): An array of subsets containing indices to the candidate array for each
                person detected. The last two columns of each row hold the total score and total parts
                of the person.

        Returns:
            List[BodyResult]: A list of BodyResult objects, where each object represents a person with
                detected keypoints, total score, and total parts.
        """
        return [
            BodyResult(
                keypoints=[
                    Keypoint(
                        x=candidate[candidate_index][0],
                        y=candidate[candidate_index][1],
                        score=candidate[candidate_index][2],
                        id=candidate[candidate_index][3]
                    ) if candidate_index != -1 else None
                    for candidate_index in person[:18].astype(int)
                ],
                total_score=person[18],
                total_parts=person[19]
            )
            for person in subset
        ]


if __name__ == "__main__":
    body_estimation = Body('../model/body_pose_model.pth')

    test_image = '../images/ski.jpg'
    oriImg = cv2.imread(test_image)  # B,G,R order
    candidate, subset = body_estimation(oriImg)
    bodies = body_estimation.format_body_result(candidate, subset)

    canvas = oriImg
    for body in bodies:
        canvas = util.draw_bodypose(canvas, body)

    plt.imshow(canvas[:, :, [2, 1, 0]])
    plt.show()
extensions/microsoftexcel-controlnet/annotator/openpose/face.py
ADDED
@@ -0,0 +1,362 @@
1 |
+
import logging
|
2 |
+
import numpy as np
|
3 |
+
from torchvision.transforms import ToTensor, ToPILImage
|
4 |
+
import torch
|
5 |
+
import torch.nn.functional as F
|
6 |
+
import cv2
|
7 |
+
|
8 |
+
from . import util
|
9 |
+
from torch.nn import Conv2d, Module, ReLU, MaxPool2d, init
|
10 |
+
|
11 |
+
|
12 |
+
class FaceNet(Module):
|
13 |
+
"""Model the cascading heatmaps. """
|
14 |
+
def __init__(self):
|
15 |
+
super(FaceNet, self).__init__()
|
16 |
+
# cnn to make feature map
|
17 |
+
self.relu = ReLU()
|
18 |
+
self.max_pooling_2d = MaxPool2d(kernel_size=2, stride=2)
|
19 |
+
self.conv1_1 = Conv2d(in_channels=3, out_channels=64,
|
20 |
+
kernel_size=3, stride=1, padding=1)
|
21 |
+
self.conv1_2 = Conv2d(
|
22 |
+
in_channels=64, out_channels=64, kernel_size=3, stride=1,
|
23 |
+
padding=1)
|
24 |
+
self.conv2_1 = Conv2d(
|
25 |
+
in_channels=64, out_channels=128, kernel_size=3, stride=1,
|
26 |
+
padding=1)
|
27 |
+
self.conv2_2 = Conv2d(
|
28 |
+
in_channels=128, out_channels=128, kernel_size=3, stride=1,
|
29 |
+
padding=1)
|
30 |
+
self.conv3_1 = Conv2d(
|
31 |
+
in_channels=128, out_channels=256, kernel_size=3, stride=1,
|
32 |
+
padding=1)
|
33 |
+
self.conv3_2 = Conv2d(
|
34 |
+
in_channels=256, out_channels=256, kernel_size=3, stride=1,
|
35 |
+
padding=1)
|
36 |
+
self.conv3_3 = Conv2d(
|
37 |
+
in_channels=256, out_channels=256, kernel_size=3, stride=1,
|
38 |
+
padding=1)
|
39 |
+
self.conv3_4 = Conv2d(
|
40 |
+
in_channels=256, out_channels=256, kernel_size=3, stride=1,
|
41 |
+
padding=1)
|
42 |
+
self.conv4_1 = Conv2d(
|
43 |
+
in_channels=256, out_channels=512, kernel_size=3, stride=1,
|
44 |
+
padding=1)
|
45 |
+
self.conv4_2 = Conv2d(
|
46 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
47 |
+
padding=1)
|
48 |
+
self.conv4_3 = Conv2d(
|
49 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
50 |
+
padding=1)
|
51 |
+
self.conv4_4 = Conv2d(
|
52 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
53 |
+
padding=1)
|
54 |
+
self.conv5_1 = Conv2d(
|
55 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
56 |
+
padding=1)
|
57 |
+
self.conv5_2 = Conv2d(
|
58 |
+
in_channels=512, out_channels=512, kernel_size=3, stride=1,
|
59 |
+
padding=1)
|
60 |
+
self.conv5_3_CPM = Conv2d(
|
61 |
+
in_channels=512, out_channels=128, kernel_size=3, stride=1,
|
62 |
+
padding=1)
|
63 |
+
|
64 |
+
# stage1
|
65 |
+
self.conv6_1_CPM = Conv2d(
|
66 |
+
in_channels=128, out_channels=512, kernel_size=1, stride=1,
|
67 |
+
padding=0)
|
68 |
+
self.conv6_2_CPM = Conv2d(
|
69 |
+
in_channels=512, out_channels=71, kernel_size=1, stride=1,
|
70 |
+
padding=0)
|
71 |
+
|
72 |
+
# stage2
|
73 |
+
self.Mconv1_stage2 = Conv2d(
|
74 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
75 |
+
padding=3)
|
76 |
+
self.Mconv2_stage2 = Conv2d(
|
77 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
78 |
+
padding=3)
|
79 |
+
self.Mconv3_stage2 = Conv2d(
|
80 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
81 |
+
padding=3)
|
82 |
+
self.Mconv4_stage2 = Conv2d(
|
83 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
84 |
+
padding=3)
|
85 |
+
self.Mconv5_stage2 = Conv2d(
|
86 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
87 |
+
padding=3)
|
88 |
+
self.Mconv6_stage2 = Conv2d(
|
89 |
+
in_channels=128, out_channels=128, kernel_size=1, stride=1,
|
90 |
+
padding=0)
|
91 |
+
self.Mconv7_stage2 = Conv2d(
|
92 |
+
in_channels=128, out_channels=71, kernel_size=1, stride=1,
|
93 |
+
padding=0)
|
94 |
+
|
95 |
+
# stage3
|
96 |
+
self.Mconv1_stage3 = Conv2d(
|
97 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
98 |
+
padding=3)
|
99 |
+
self.Mconv2_stage3 = Conv2d(
|
100 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
101 |
+
padding=3)
|
102 |
+
self.Mconv3_stage3 = Conv2d(
|
103 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
104 |
+
padding=3)
|
105 |
+
self.Mconv4_stage3 = Conv2d(
|
106 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
107 |
+
padding=3)
|
108 |
+
self.Mconv5_stage3 = Conv2d(
|
109 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
110 |
+
padding=3)
|
111 |
+
self.Mconv6_stage3 = Conv2d(
|
112 |
+
in_channels=128, out_channels=128, kernel_size=1, stride=1,
|
113 |
+
padding=0)
|
114 |
+
self.Mconv7_stage3 = Conv2d(
|
115 |
+
in_channels=128, out_channels=71, kernel_size=1, stride=1,
|
116 |
+
padding=0)
|
117 |
+
|
118 |
+
# stage4
|
119 |
+
self.Mconv1_stage4 = Conv2d(
|
120 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
121 |
+
padding=3)
|
122 |
+
self.Mconv2_stage4 = Conv2d(
|
123 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
124 |
+
padding=3)
|
125 |
+
self.Mconv3_stage4 = Conv2d(
|
126 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
127 |
+
padding=3)
|
128 |
+
self.Mconv4_stage4 = Conv2d(
|
129 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
130 |
+
padding=3)
|
131 |
+
self.Mconv5_stage4 = Conv2d(
|
132 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
133 |
+
padding=3)
|
134 |
+
self.Mconv6_stage4 = Conv2d(
|
135 |
+
in_channels=128, out_channels=128, kernel_size=1, stride=1,
|
136 |
+
padding=0)
|
137 |
+
self.Mconv7_stage4 = Conv2d(
|
138 |
+
in_channels=128, out_channels=71, kernel_size=1, stride=1,
|
139 |
+
padding=0)
|
140 |
+
|
141 |
+
# stage5
|
142 |
+
self.Mconv1_stage5 = Conv2d(
|
143 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
144 |
+
padding=3)
|
145 |
+
self.Mconv2_stage5 = Conv2d(
|
146 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
147 |
+
padding=3)
|
148 |
+
self.Mconv3_stage5 = Conv2d(
|
149 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
150 |
+
padding=3)
|
151 |
+
self.Mconv4_stage5 = Conv2d(
|
152 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
153 |
+
padding=3)
|
154 |
+
self.Mconv5_stage5 = Conv2d(
|
155 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
156 |
+
padding=3)
|
157 |
+
self.Mconv6_stage5 = Conv2d(
|
158 |
+
in_channels=128, out_channels=128, kernel_size=1, stride=1,
|
159 |
+
padding=0)
|
160 |
+
self.Mconv7_stage5 = Conv2d(
|
161 |
+
in_channels=128, out_channels=71, kernel_size=1, stride=1,
|
162 |
+
padding=0)
|
163 |
+
|
164 |
+
# stage6
|
165 |
+
self.Mconv1_stage6 = Conv2d(
|
166 |
+
in_channels=199, out_channels=128, kernel_size=7, stride=1,
|
167 |
+
padding=3)
|
168 |
+
self.Mconv2_stage6 = Conv2d(
|
169 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
170 |
+
padding=3)
|
171 |
+
self.Mconv3_stage6 = Conv2d(
|
172 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
173 |
+
padding=3)
|
174 |
+
self.Mconv4_stage6 = Conv2d(
|
175 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
176 |
+
padding=3)
|
177 |
+
self.Mconv5_stage6 = Conv2d(
|
178 |
+
in_channels=128, out_channels=128, kernel_size=7, stride=1,
|
179 |
+
padding=3)
|
180 |
+
self.Mconv6_stage6 = Conv2d(
|
181 |
+
in_channels=128, out_channels=128, kernel_size=1, stride=1,
|
182 |
+
padding=0)
|
183 |
+
self.Mconv7_stage6 = Conv2d(
|
184 |
+
in_channels=128, out_channels=71, kernel_size=1, stride=1,
|
185 |
+
padding=0)
|
186 |
+
|
187 |
+
for m in self.modules():
|
188 |
+
if isinstance(m, Conv2d):
|
189 |
+
init.constant_(m.bias, 0)
|
190 |
+
|
191 |
+
def forward(self, x):
|
192 |
+
"""Return a list of heatmaps."""
|
193 |
+
heatmaps = []
|
194 |
+
|
195 |
+
h = self.relu(self.conv1_1(x))
|
196 |
+
h = self.relu(self.conv1_2(h))
|
197 |
+
h = self.max_pooling_2d(h)
|
198 |
+
h = self.relu(self.conv2_1(h))
|
199 |
+
h = self.relu(self.conv2_2(h))
|
200 |
+
h = self.max_pooling_2d(h)
|
201 |
+
h = self.relu(self.conv3_1(h))
|
202 |
+
h = self.relu(self.conv3_2(h))
|
203 |
+
h = self.relu(self.conv3_3(h))
|
204 |
+
h = self.relu(self.conv3_4(h))
|
205 |
+
h = self.max_pooling_2d(h)
|
206 |
+
h = self.relu(self.conv4_1(h))
|
207 |
+
h = self.relu(self.conv4_2(h))
|
208 |
+
h = self.relu(self.conv4_3(h))
|
209 |
+
h = self.relu(self.conv4_4(h))
|
210 |
+
h = self.relu(self.conv5_1(h))
|
211 |
+
h = self.relu(self.conv5_2(h))
|
212 |
+
h = self.relu(self.conv5_3_CPM(h))
|
213 |
+
feature_map = h
|
214 |
+
|
215 |
+
# stage1
|
216 |
+
h = self.relu(self.conv6_1_CPM(h))
|
217 |
+
h = self.conv6_2_CPM(h)
|
218 |
+
heatmaps.append(h)
|
219 |
+
|
220 |
+
# stage2
|
221 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
222 |
+
h = self.relu(self.Mconv1_stage2(h))
|
223 |
+
h = self.relu(self.Mconv2_stage2(h))
|
224 |
+
h = self.relu(self.Mconv3_stage2(h))
|
225 |
+
h = self.relu(self.Mconv4_stage2(h))
|
226 |
+
h = self.relu(self.Mconv5_stage2(h))
|
227 |
+
h = self.relu(self.Mconv6_stage2(h))
|
228 |
+
h = self.Mconv7_stage2(h)
|
229 |
+
heatmaps.append(h)
|
230 |
+
|
231 |
+
# stage3
|
232 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
233 |
+
h = self.relu(self.Mconv1_stage3(h))
|
234 |
+
h = self.relu(self.Mconv2_stage3(h))
|
235 |
+
h = self.relu(self.Mconv3_stage3(h))
|
236 |
+
h = self.relu(self.Mconv4_stage3(h))
|
237 |
+
h = self.relu(self.Mconv5_stage3(h))
|
238 |
+
h = self.relu(self.Mconv6_stage3(h))
|
239 |
+
h = self.Mconv7_stage3(h)
|
240 |
+
heatmaps.append(h)
|
241 |
+
|
242 |
+
# stage4
|
243 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
244 |
+
h = self.relu(self.Mconv1_stage4(h))
|
245 |
+
h = self.relu(self.Mconv2_stage4(h))
|
246 |
+
h = self.relu(self.Mconv3_stage4(h))
|
247 |
+
h = self.relu(self.Mconv4_stage4(h))
|
248 |
+
h = self.relu(self.Mconv5_stage4(h))
|
249 |
+
h = self.relu(self.Mconv6_stage4(h))
|
250 |
+
h = self.Mconv7_stage4(h)
|
251 |
+
heatmaps.append(h)
|
252 |
+
|
253 |
+
# stage5
|
254 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
255 |
+
h = self.relu(self.Mconv1_stage5(h))
|
256 |
+
h = self.relu(self.Mconv2_stage5(h))
|
257 |
+
h = self.relu(self.Mconv3_stage5(h))
|
258 |
+
h = self.relu(self.Mconv4_stage5(h))
|
259 |
+
h = self.relu(self.Mconv5_stage5(h))
|
260 |
+
h = self.relu(self.Mconv6_stage5(h))
|
261 |
+
h = self.Mconv7_stage5(h)
|
262 |
+
heatmaps.append(h)
|
263 |
+
|
264 |
+
# stage6
|
265 |
+
h = torch.cat([h, feature_map], dim=1) # channel concat
|
266 |
+
h = self.relu(self.Mconv1_stage6(h))
|
267 |
+
h = self.relu(self.Mconv2_stage6(h))
|
268 |
+
h = self.relu(self.Mconv3_stage6(h))
|
269 |
+
h = self.relu(self.Mconv4_stage6(h))
|
270 |
+
h = self.relu(self.Mconv5_stage6(h))
|
271 |
+
h = self.relu(self.Mconv6_stage6(h))
|
272 |
+
h = self.Mconv7_stage6(h)
|
273 |
+
heatmaps.append(h)
|
274 |
+
|
275 |
+
return heatmaps
|
276 |
+
|
277 |
+
|
278 |
+
LOG = logging.getLogger(__name__)
|
279 |
+
TOTEN = ToTensor()
|
280 |
+
TOPIL = ToPILImage()
|
281 |
+
|
282 |
+
|
283 |
+
params = {
|
284 |
+
'gaussian_sigma': 2.5,
|
285 |
+
'inference_img_size': 736, # 368, 736, 1312
|
286 |
+
'heatmap_peak_thresh': 0.1,
|
287 |
+
'crop_scale': 1.5,
|
288 |
+
'line_indices': [
|
289 |
+
[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 6],
|
290 |
+
[6, 7], [7, 8], [8, 9], [9, 10], [10, 11], [11, 12], [12, 13],
|
291 |
+
[13, 14], [14, 15], [15, 16],
|
292 |
+
[17, 18], [18, 19], [19, 20], [20, 21],
|
293 |
+
[22, 23], [23, 24], [24, 25], [25, 26],
|
294 |
+
[27, 28], [28, 29], [29, 30],
|
295 |
+
[31, 32], [32, 33], [33, 34], [34, 35],
|
296 |
+
[36, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 36],
|
297 |
+
[42, 43], [43, 44], [44, 45], [45, 46], [46, 47], [47, 42],
|
298 |
+
[48, 49], [49, 50], [50, 51], [51, 52], [52, 53], [53, 54],
|
299 |
+
[54, 55], [55, 56], [56, 57], [57, 58], [58, 59], [59, 48],
|
300 |
+
[60, 61], [61, 62], [62, 63], [63, 64], [64, 65], [65, 66],
|
301 |
+
[66, 67], [67, 60]
|
302 |
+
],
|
303 |
+
}
|
304 |
+
|
305 |
+
|
306 |
+
class Face(object):
|
307 |
+
"""
|
308 |
+
The OpenPose face landmark detector model.
|
309 |
+
|
310 |
+
Args:
|
311 |
+
inference_size: set the size of the inference image size, suggested:
|
312 |
+
368, 736, 1312, default 736
|
313 |
+
gaussian_sigma: blur the heatmaps, default 2.5
|
314 |
+
heatmap_peak_thresh: return landmark if over threshold, default 0.1
|
315 |
+
|
316 |
+
"""
|
317 |
+
def __init__(self, face_model_path,
|
318 |
+
inference_size=None,
|
319 |
+
gaussian_sigma=None,
|
320 |
+
heatmap_peak_thresh=None):
|
321 |
+
self.inference_size = inference_size or params["inference_img_size"]
|
322 |
+
self.sigma = gaussian_sigma or params['gaussian_sigma']
|
323 |
+
self.threshold = heatmap_peak_thresh or params["heatmap_peak_thresh"]
|
324 |
+
self.model = FaceNet()
|
325 |
+
self.model.load_state_dict(torch.load(face_model_path))
|
326 |
+
# if torch.cuda.is_available():
|
327 |
+
# self.model = self.model.cuda()
|
328 |
+
# print('cuda')
|
329 |
+
self.model.eval()
|
330 |
+
|
331 |
+
def __call__(self, face_img):
|
332 |
+
H, W, C = face_img.shape
|
333 |
+
|
334 |
+
w_size = 384
|
335 |
+
x_data = torch.from_numpy(util.smart_resize(face_img, (w_size, w_size))).permute([2, 0, 1]) / 256.0 - 0.5
|
336 |
+
|
337 |
+
x_data = x_data.to(self.cn_device)
|
338 |
+
|
339 |
+
with torch.no_grad():
|
340 |
+
hs = self.model(x_data[None, ...])
|
341 |
+
heatmaps = F.interpolate(
|
342 |
+
hs[-1],
|
343 |
+
(H, W),
|
344 |
+
mode='bilinear', align_corners=True).cpu().numpy()[0]
|
345 |
+
return heatmaps
|
346 |
+
|
347 |
+
def compute_peaks_from_heatmaps(self, heatmaps):
|
348 |
+
all_peaks = []
|
349 |
+
for part in range(heatmaps.shape[0]):
|
350 |
+
map_ori = heatmaps[part].copy()
|
351 |
+
binary = np.ascontiguousarray(map_ori > 0.05, dtype=np.uint8)
|
352 |
+
|
353 |
+
if np.sum(binary) == 0:
|
354 |
+
continue
|
355 |
+
|
356 |
+
positions = np.where(binary > 0.5)
|
357 |
+
intensities = map_ori[positions]
|
358 |
+
mi = np.argmax(intensities)
|
359 |
+
y, x = positions[0][mi], positions[1][mi]
|
360 |
+
all_peaks.append([x, y])
|
361 |
+
|
362 |
+
return np.array(all_peaks)
|
extensions/microsoftexcel-controlnet/annotator/openpose/hand.py
ADDED
@@ -0,0 +1,94 @@
import cv2
import json
import numpy as np
import math
import time
from scipy.ndimage.filters import gaussian_filter
import matplotlib.pyplot as plt
import matplotlib
import torch
from skimage.measure import label

from .model import handpose_model
from . import util

class Hand(object):
    def __init__(self, model_path):
        self.model = handpose_model()
        # if torch.cuda.is_available():
        #     self.model = self.model.cuda()
        #     print('cuda')
        model_dict = util.transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def __call__(self, oriImgRaw):
        scale_search = [0.5, 1.0, 1.5, 2.0]
        # scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre = 0.05
        multiplier = [x * boxsize for x in scale_search]

        wsize = 128
        heatmap_avg = np.zeros((wsize, wsize, 22))

        Hr, Wr, Cr = oriImgRaw.shape

        oriImg = cv2.GaussianBlur(oriImgRaw, (0, 0), 0.8)

        for m in range(len(multiplier)):
            scale = multiplier[m]
            imageToTest = util.smart_resize(oriImg, (scale, scale))

            imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
            im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            if torch.cuda.is_available():
                data = data.cuda()

            with torch.no_grad():
                data = data.to(self.cn_device)
                output = self.model(data).cpu().numpy()

            # extract outputs, resize, and remove padding
            heatmap = np.transpose(np.squeeze(output), (1, 2, 0))  # output 1 is heatmaps
            heatmap = util.smart_resize_k(heatmap, fx=stride, fy=stride)
            heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            heatmap = util.smart_resize(heatmap, (wsize, wsize))

            heatmap_avg += heatmap / len(multiplier)

        all_peaks = []
        for part in range(21):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)
            binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)

            if np.sum(binary) == 0:
                all_peaks.append([0, 0])
                continue
            label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
            max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
            label_img[label_img != max_index] = 0
            map_ori[label_img == 0] = 0

            y, x = util.npmax(map_ori)
            y = int(float(y) * float(Hr) / float(wsize))
            x = int(float(x) * float(Wr) / float(wsize))
            all_peaks.append([x, y])
        return np.array(all_peaks)

if __name__ == "__main__":
    hand_estimation = Hand('../model/hand_pose_model.pth')

    # test_image = '../images/hand.jpg'
    test_image = '../images/hand.jpg'
    oriImg = cv2.imread(test_image)  # B,G,R order
    peaks = hand_estimation(oriImg)
    canvas = util.draw_handpose(oriImg, peaks, True)
    cv2.imshow('', canvas)
    cv2.waitKey(0)
extensions/microsoftexcel-controlnet/annotator/openpose/model.py
ADDED
@@ -0,0 +1,218 @@
import torch
from collections import OrderedDict

import torch
import torch.nn as nn

def make_layers(block, no_relu_layers):
    layers = []
    for layer_name, v in block.items():
        if 'pool' in layer_name:
            layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
                                 padding=v[2])
            layers.append((layer_name, layer))
        else:
            conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
                               kernel_size=v[2], stride=v[3],
                               padding=v[4])
            layers.append((layer_name, conv2d))
            if layer_name not in no_relu_layers:
                layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))

    return nn.Sequential(OrderedDict(layers))

class bodypose_model(nn.Module):
    def __init__(self):
        super(bodypose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',
                          'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',
                          'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',
                          'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L1']
        blocks = {}
        block0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3_CPM', [512, 256, 3, 1, 1]),
            ('conv4_4_CPM', [256, 128, 3, 1, 1])
        ])

        # Stage 1
        block1_1 = OrderedDict([
            ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
        ])

        block1_2 = OrderedDict([
            ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
            ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
            ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
        ])
        blocks['block1_1'] = block1_1
        blocks['block1_2'] = block1_2

        self.model0 = make_layers(block0, no_relu_layers)

        # Stages 2 - 6
        for i in range(2, 7):
            blocks['block%d_1' % i] = OrderedDict([
                ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
            ])

            blocks['block%d_2' % i] = OrderedDict([
                ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_1 = blocks['block1_1']
        self.model2_1 = blocks['block2_1']
        self.model3_1 = blocks['block3_1']
        self.model4_1 = blocks['block4_1']
        self.model5_1 = blocks['block5_1']
        self.model6_1 = blocks['block6_1']

        self.model1_2 = blocks['block1_2']
        self.model2_2 = blocks['block2_2']
        self.model3_2 = blocks['block3_2']
        self.model4_2 = blocks['block4_2']
        self.model5_2 = blocks['block5_2']
        self.model6_2 = blocks['block6_2']


    def forward(self, x):

        out1 = self.model0(x)

        out1_1 = self.model1_1(out1)
        out1_2 = self.model1_2(out1)
        out2 = torch.cat([out1_1, out1_2, out1], 1)

        out2_1 = self.model2_1(out2)
        out2_2 = self.model2_2(out2)
        out3 = torch.cat([out2_1, out2_2, out1], 1)

        out3_1 = self.model3_1(out3)
        out3_2 = self.model3_2(out3)
        out4 = torch.cat([out3_1, out3_2, out1], 1)

        out4_1 = self.model4_1(out4)
        out4_2 = self.model4_2(out4)
        out5 = torch.cat([out4_1, out4_2, out1], 1)

        out5_1 = self.model5_1(out5)
        out5_2 = self.model5_2(out5)
        out6 = torch.cat([out5_1, out5_2, out1], 1)

        out6_1 = self.model6_1(out6)
        out6_2 = self.model6_2(out6)

        return out6_1, out6_2

class handpose_model(nn.Module):
    def __init__(self):
        super(handpose_model, self).__init__()

        # these layers have no relu layer
        no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',
                          'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
        # stage 1
        block1_0 = OrderedDict([
            ('conv1_1', [3, 64, 3, 1, 1]),
            ('conv1_2', [64, 64, 3, 1, 1]),
            ('pool1_stage1', [2, 2, 0]),
            ('conv2_1', [64, 128, 3, 1, 1]),
            ('conv2_2', [128, 128, 3, 1, 1]),
            ('pool2_stage1', [2, 2, 0]),
            ('conv3_1', [128, 256, 3, 1, 1]),
            ('conv3_2', [256, 256, 3, 1, 1]),
            ('conv3_3', [256, 256, 3, 1, 1]),
            ('conv3_4', [256, 256, 3, 1, 1]),
            ('pool3_stage1', [2, 2, 0]),
            ('conv4_1', [256, 512, 3, 1, 1]),
            ('conv4_2', [512, 512, 3, 1, 1]),
            ('conv4_3', [512, 512, 3, 1, 1]),
            ('conv4_4', [512, 512, 3, 1, 1]),
            ('conv5_1', [512, 512, 3, 1, 1]),
            ('conv5_2', [512, 512, 3, 1, 1]),
            ('conv5_3_CPM', [512, 128, 3, 1, 1])
        ])

        block1_1 = OrderedDict([
            ('conv6_1_CPM', [128, 512, 1, 1, 0]),
            ('conv6_2_CPM', [512, 22, 1, 1, 0])
        ])

        blocks = {}
        blocks['block1_0'] = block1_0
        blocks['block1_1'] = block1_1

        # stage 2-6
        for i in range(2, 7):
            blocks['block%d' % i] = OrderedDict([
                ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
                ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_0 = blocks['block1_0']
        self.model1_1 = blocks['block1_1']
        self.model2 = blocks['block2']
        self.model3 = blocks['block3']
        self.model4 = blocks['block4']
        self.model5 = blocks['block5']
        self.model6 = blocks['block6']

    def forward(self, x):
        out1_0 = self.model1_0(x)
        out1_1 = self.model1_1(out1_0)
        concat_stage2 = torch.cat([out1_1, out1_0], 1)
        out_stage2 = self.model2(concat_stage2)
        concat_stage3 = torch.cat([out_stage2, out1_0], 1)
        out_stage3 = self.model3(concat_stage3)
        concat_stage4 = torch.cat([out_stage3, out1_0], 1)
        out_stage4 = self.model4(concat_stage4)
        concat_stage5 = torch.cat([out_stage4, out1_0], 1)
        out_stage5 = self.model5(concat_stage5)
        concat_stage6 = torch.cat([out_stage5, out1_0], 1)
        out_stage6 = self.model6(concat_stage6)
        return out_stage6
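As a quick sanity check on the two networks above: the three stride-2 poolings reduce the spatial size by 8x, bodypose_model returns a (PAF, heatmap) pair with 38 and 19 channels, and handpose_model returns a single 22-channel heatmap. A small shape probe, illustrative only and assuming the module is importable as annotator.openpose.model:

# Illustrative shape probe for the models defined above.
import torch
from annotator.openpose.model import bodypose_model, handpose_model

x = torch.zeros(1, 3, 368, 368)      # dummy B,C,H,W input at the usual boxsize

pafs, heatmaps = bodypose_model()(x)
print(pafs.shape, heatmaps.shape)    # (1, 38, 46, 46) and (1, 19, 46, 46)

hand_maps = handpose_model()(x)
print(hand_maps.shape)               # (1, 22, 46, 46)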
extensions/microsoftexcel-controlnet/annotator/openpose/util.py
ADDED
@@ -0,0 +1,383 @@
import math
import numpy as np
import matplotlib
import cv2
from typing import List, Tuple, Union

from .body import BodyResult, Keypoint

eps = 0.01


def smart_resize(x, s):
    Ht, Wt = s
    if x.ndim == 2:
        Ho, Wo = x.shape
        Co = 1
    else:
        Ho, Wo, Co = x.shape
    if Co == 3 or Co == 1:
        k = float(Ht + Wt) / float(Ho + Wo)
        return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
    else:
        return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)


def smart_resize_k(x, fx, fy):
    if x.ndim == 2:
        Ho, Wo = x.shape
        Co = 1
    else:
        Ho, Wo, Co = x.shape
    Ht, Wt = Ho * fy, Wo * fx
    if Co == 3 or Co == 1:
        k = float(Ht + Wt) / float(Ho + Wo)
        return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
    else:
        return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)


def padRightDownCorner(img, stride, padValue):
    h = img.shape[0]
    w = img.shape[1]

    pad = 4 * [None]
    pad[0] = 0  # up
    pad[1] = 0  # left
    pad[2] = 0 if (h % stride == 0) else stride - (h % stride)  # down
    pad[3] = 0 if (w % stride == 0) else stride - (w % stride)  # right

    img_padded = img
    pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
    img_padded = np.concatenate((pad_up, img_padded), axis=0)
    pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
    img_padded = np.concatenate((pad_left, img_padded), axis=1)
    pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
    img_padded = np.concatenate((img_padded, pad_down), axis=0)
    pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
    img_padded = np.concatenate((img_padded, pad_right), axis=1)

    return img_padded, pad


def transfer(model, model_weights):
    transfered_model_weights = {}
    for weights_name in model.state_dict().keys():
        transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
    return transfered_model_weights


def draw_bodypose(canvas: np.ndarray, keypoints: List[Keypoint]) -> np.ndarray:
    """
    Draw keypoints and limbs representing body pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the body pose.
        keypoints (List[Keypoint]): A list of Keypoint objects representing the body keypoints to be drawn.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn body pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    H, W, C = canvas.shape
    stickwidth = 4

    limbSeq = [
        [2, 3], [2, 6], [3, 4], [4, 5],
        [6, 7], [7, 8], [2, 9], [9, 10],
        [10, 11], [2, 12], [12, 13], [13, 14],
        [2, 1], [1, 15], [15, 17], [1, 16],
        [16, 18],
    ]

    colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0],
              [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255],
              [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]

    for (k1_index, k2_index), color in zip(limbSeq, colors):
        keypoint1 = keypoints[k1_index - 1]
        keypoint2 = keypoints[k2_index - 1]

        if keypoint1 is None or keypoint2 is None:
            continue

        Y = np.array([keypoint1.x, keypoint2.x]) * float(W)
        X = np.array([keypoint1.y, keypoint2.y]) * float(H)
        mX = np.mean(X)
        mY = np.mean(Y)
        length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
        angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
        polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
        cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color])

    for keypoint, color in zip(keypoints, colors):
        if keypoint is None:
            continue

        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        cv2.circle(canvas, (int(x), int(y)), 4, color, thickness=-1)

    return canvas


def draw_handpose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
    """
    Draw keypoints and connections representing hand pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the hand pose.
        keypoints (List[Keypoint]| None): A list of Keypoint objects representing the hand keypoints to be drawn
            or None if no keypoints are present.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn hand pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    if not keypoints:
        return canvas

    H, W, C = canvas.shape

    edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10],
             [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]

    for ie, (e1, e2) in enumerate(edges):
        k1 = keypoints[e1]
        k2 = keypoints[e2]
        if k1 is None or k2 is None:
            continue

        x1 = int(k1.x * W)
        y1 = int(k1.y * H)
        x2 = int(k2.x * W)
        y2 = int(k2.y * H)
        if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
            cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)

    for keypoint in keypoints:
        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        if x > eps and y > eps:
            cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
    return canvas


def draw_facepose(canvas: np.ndarray, keypoints: Union[List[Keypoint], None]) -> np.ndarray:
    """
    Draw keypoints representing face pose on a given canvas.

    Args:
        canvas (np.ndarray): A 3D numpy array representing the canvas (image) on which to draw the face pose.
        keypoints (List[Keypoint]| None): A list of Keypoint objects representing the face keypoints to be drawn
            or None if no keypoints are present.

    Returns:
        np.ndarray: A 3D numpy array representing the modified canvas with the drawn face pose.

    Note:
        The function expects the x and y coordinates of the keypoints to be normalized between 0 and 1.
    """
    if not keypoints:
        return canvas

    H, W, C = canvas.shape
    for keypoint in keypoints:
        x, y = keypoint.x, keypoint.y
        x = int(x * W)
        y = int(y * H)
        if x > eps and y > eps:
            cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
    return canvas


# detect hand according to body pose keypoints
# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
def handDetect(body: BodyResult, oriImg) -> List[Tuple[int, int, int, bool]]:
    """
    Detect hands in the input body pose keypoints and calculate the bounding box for each hand.

    Args:
        body (BodyResult): A BodyResult object containing the detected body pose keypoints.
        oriImg (numpy.ndarray): A 3D numpy array representing the original input image.

    Returns:
        List[Tuple[int, int, int, bool]]: A list of tuples, each containing the coordinates (x, y) of the top-left
            corner of the bounding box, the width (height) of the bounding box, and a boolean flag indicating
            whether the hand is a left hand (True) or a right hand (False).

    Notes:
        - The width and height of the bounding boxes are equal since the network requires squared input.
        - The minimum bounding box size is 20 pixels.
    """
    ratioWristElbow = 0.33
    detect_result = []
    image_height, image_width = oriImg.shape[0:2]

    keypoints = body.keypoints
    # right hand: wrist 4, elbow 3, shoulder 2
    # left hand: wrist 7, elbow 6, shoulder 5
    left_shoulder = keypoints[5]
    left_elbow = keypoints[6]
    left_wrist = keypoints[7]
    right_shoulder = keypoints[2]
    right_elbow = keypoints[3]
    right_wrist = keypoints[4]

    # if any of three not detected
    has_left = all(keypoint is not None for keypoint in (left_shoulder, left_elbow, left_wrist))
    has_right = all(keypoint is not None for keypoint in (right_shoulder, right_elbow, right_wrist))
    if not (has_left or has_right):
        return []

    hands = []
    # left hand
    if has_left:
        hands.append([
            left_shoulder.x, left_shoulder.y,
            left_elbow.x, left_elbow.y,
            left_wrist.x, left_wrist.y,
            True
        ])
    # right hand
    if has_right:
        hands.append([
            right_shoulder.x, right_shoulder.y,
            right_elbow.x, right_elbow.y,
            right_wrist.x, right_wrist.y,
            False
        ])

    for x1, y1, x2, y2, x3, y3, is_left in hands:
        # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
        # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
        # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
        # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
        # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
        # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
        x = x3 + ratioWristElbow * (x3 - x2)
        y = y3 + ratioWristElbow * (y3 - y2)
        distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
        distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
        width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
        # x-y refers to the center --> offset to topLeft point
        # handRectangle.x -= handRectangle.width / 2.f;
        # handRectangle.y -= handRectangle.height / 2.f;
        x -= width / 2
        y -= width / 2  # width = height
        # overflow the image
        if x < 0: x = 0
        if y < 0: y = 0
        width1 = width
        width2 = width
        if x + width > image_width: width1 = image_width - x
        if y + width > image_height: width2 = image_height - y
        width = min(width1, width2)
        # the max hand box value is 20 pixels
        if width >= 20:
            detect_result.append((int(x), int(y), int(width), is_left))

    '''
    return value: [[x, y, w, True if left hand else False]].
    width=height since the network require squared input.
    x, y is the coordinate of top left
    '''
    return detect_result


# Written by Lvmin
def faceDetect(body: BodyResult, oriImg) -> Union[Tuple[int, int, int], None]:
    """
    Detect the face in the input body pose keypoints and calculate the bounding box for the face.

    Args:
        body (BodyResult): A BodyResult object containing the detected body pose keypoints.
        oriImg (numpy.ndarray): A 3D numpy array representing the original input image.

    Returns:
        Tuple[int, int, int] | None: A tuple containing the coordinates (x, y) of the top-left corner of the
            bounding box and the width (height) of the bounding box, or None if the face is not detected or
            the bounding box width is less than 20 pixels.

    Notes:
        - The width and height of the bounding box are equal.
        - The minimum bounding box size is 20 pixels.
    """
    # left right eye ear 14 15 16 17
    image_height, image_width = oriImg.shape[0:2]

    keypoints = body.keypoints
    head = keypoints[0]
    left_eye = keypoints[14]
    right_eye = keypoints[15]
    left_ear = keypoints[16]
    right_ear = keypoints[17]

    if head is None or all(keypoint is None for keypoint in (left_eye, right_eye, left_ear, right_ear)):
        return None

    width = 0.0
    x0, y0 = head.x, head.y

    if left_eye is not None:
        x1, y1 = left_eye.x, left_eye.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 3.0)

    if right_eye is not None:
        x1, y1 = right_eye.x, right_eye.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 3.0)

    if left_ear is not None:
        x1, y1 = left_ear.x, left_ear.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 1.5)

    if right_ear is not None:
        x1, y1 = right_ear.x, right_ear.y
        d = max(abs(x0 - x1), abs(y0 - y1))
        width = max(width, d * 1.5)

    x, y = x0, y0

    x -= width
    y -= width

    if x < 0:
        x = 0

    if y < 0:
        y = 0

    width1 = width * 2
    width2 = width * 2

    if x + width > image_width:
        width1 = image_width - x

    if y + width > image_height:
        width2 = image_height - y

    width = min(width1, width2)

    if width >= 20:
        return int(x), int(y), int(width)
    else:
        return None


# get max index of 2d array
def npmax(array):
    arrayindex = array.argmax(1)
    arrayvalue = array.max(1)
    i = arrayvalue.argmax()
    j = arrayindex[i]
    return i, j
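A hedged sketch of how the box helpers above are typically combined (not part of the commit): handDetect and faceDetect work in pixel coordinates of the original image and return square crops that can be fed to the Hand and Face detectors, while the draw_* helpers expect keypoints normalized to [0, 1]. The variable `body` below is an assumed BodyResult whose keypoints are in pixel coordinates.

# Hypothetical sketch; `body` is an assumed BodyResult with pixel-space keypoints.
import cv2
from annotator.openpose import util

oriImg = cv2.imread('person.jpg')  # B,G,R

# Square hand crops as (x, y, width, is_left); width == height by construction.
for x, y, w, is_left in util.handDetect(body, oriImg):
    hand_crop = oriImg[y:y + w, x:x + w, :]
    print('left' if is_left else 'right', hand_crop.shape)

# Optional square face crop, or None when the head is not visible enough.
face_box = util.faceDetect(body, oriImg)
if face_box is not None:
    x, y, w = face_box
    face_crop = oriImg[y:y + w, x:x + w, :]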
extensions/microsoftexcel-controlnet/annotator/pidinet/LICENSE
ADDED
@@ -0,0 +1,21 @@
It is just for research purpose, and commercial use should be contacted with authors first.

Copyright (c) 2021 Zhuo Su

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
extensions/microsoftexcel-controlnet/annotator/pidinet/__init__.py
ADDED
@@ -0,0 +1,51 @@
import os
import torch
import numpy as np
from einops import rearrange
from annotator.pidinet.model import pidinet
from annotator.util import safe_step
from modules import devices
from annotator.annotator_path import models_path
from scripts.utils import load_state_dict

netNetwork = None
remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/table5_pidinet.pth"
modeldir = os.path.join(models_path, "pidinet")
old_modeldir = os.path.dirname(os.path.realpath(__file__))

def apply_pidinet(input_image, is_safe=False, apply_fliter=False):
    global netNetwork
    if netNetwork is None:
        modelpath = os.path.join(modeldir, "table5_pidinet.pth")
        old_modelpath = os.path.join(old_modeldir, "table5_pidinet.pth")
        if os.path.exists(old_modelpath):
            modelpath = old_modelpath
        elif not os.path.exists(modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(remote_model_path, model_dir=modeldir)
        netNetwork = pidinet()
        ckp = load_state_dict(modelpath)
        netNetwork.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()})

    netNetwork = netNetwork.to(devices.get_device_for("controlnet"))
    netNetwork.eval()
    assert input_image.ndim == 3
    input_image = input_image[:, :, ::-1].copy()
    with torch.no_grad():
        image_pidi = torch.from_numpy(input_image).float().to(devices.get_device_for("controlnet"))
        image_pidi = image_pidi / 255.0
        image_pidi = rearrange(image_pidi, 'h w c -> 1 c h w')
        edge = netNetwork(image_pidi)[-1]
        edge = edge.cpu().numpy()
        if apply_fliter:
            edge = edge > 0.5
        if is_safe:
            edge = safe_step(edge)
        edge = (edge * 255.0).clip(0, 255).astype(np.uint8)

    return edge[0][0]

def unload_pid_model():
    global netNetwork
    if netNetwork is not None:
        netNetwork.cpu()
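A minimal call-site sketch for apply_pidinet, illustrative only: it is only meaningful inside the WebUI process, since the module imports modules.devices and scripts.utils, and the file names here are placeholders.

# Hypothetical call site; assumes an RGB H x W x 3 uint8 array as the annotators pass it.
import cv2

img = cv2.cvtColor(cv2.imread('input.png'), cv2.COLOR_BGR2RGB)
edge = apply_pidinet(img, is_safe=False, apply_fliter=False)   # uint8 edge map, same H x W
cv2.imwrite('pidinet_edge.png', edge)
unload_pid_model()   # move the cached network back to the CPU when done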
extensions/microsoftexcel-controlnet/annotator/pidinet/model.py
ADDED
@@ -0,0 +1,653 @@
"""
Author: Zhuo Su, Wenzhe Liu
Date: Feb 18, 2021
"""

import math

import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from basicsr.utils import img2tensor

nets = {
    'baseline': {
        'layer0': 'cv',
        'layer1': 'cv',
        'layer2': 'cv',
        'layer3': 'cv',
        'layer4': 'cv',
        'layer5': 'cv',
        'layer6': 'cv',
        'layer7': 'cv',
        'layer8': 'cv',
        'layer9': 'cv',
        'layer10': 'cv',
        'layer11': 'cv',
        'layer12': 'cv',
        'layer13': 'cv',
        'layer14': 'cv',
        'layer15': 'cv',
    },
    'c-v15': {
        'layer0': 'cd',
        'layer1': 'cv',
        'layer2': 'cv',
        'layer3': 'cv',
        'layer4': 'cv',
        'layer5': 'cv',
        'layer6': 'cv',
        'layer7': 'cv',
        'layer8': 'cv',
        'layer9': 'cv',
        'layer10': 'cv',
        'layer11': 'cv',
        'layer12': 'cv',
        'layer13': 'cv',
        'layer14': 'cv',
        'layer15': 'cv',
    },
    'a-v15': {
        'layer0': 'ad',
        'layer1': 'cv',
        'layer2': 'cv',
        'layer3': 'cv',
        'layer4': 'cv',
        'layer5': 'cv',
        'layer6': 'cv',
        'layer7': 'cv',
        'layer8': 'cv',
        'layer9': 'cv',
        'layer10': 'cv',
        'layer11': 'cv',
        'layer12': 'cv',
        'layer13': 'cv',
        'layer14': 'cv',
        'layer15': 'cv',
    },
    'r-v15': {
        'layer0': 'rd',
        'layer1': 'cv',
        'layer2': 'cv',
        'layer3': 'cv',
        'layer4': 'cv',
        'layer5': 'cv',
        'layer6': 'cv',
        'layer7': 'cv',
        'layer8': 'cv',
        'layer9': 'cv',
        'layer10': 'cv',
        'layer11': 'cv',
        'layer12': 'cv',
        'layer13': 'cv',
        'layer14': 'cv',
        'layer15': 'cv',
    },
    'cvvv4': {
        'layer0': 'cd',
        'layer1': 'cv',
        'layer2': 'cv',
        'layer3': 'cv',
        'layer4': 'cd',
        'layer5': 'cv',
        'layer6': 'cv',
        'layer7': 'cv',
        'layer8': 'cd',
        'layer9': 'cv',
        'layer10': 'cv',
        'layer11': 'cv',
        'layer12': 'cd',
        'layer13': 'cv',
        'layer14': 'cv',
        'layer15': 'cv',
    },
    'avvv4': {
        'layer0': 'ad',
        'layer1': 'cv',
        'layer2': 'cv',
        'layer3': 'cv',
        'layer4': 'ad',
        'layer5': 'cv',
        'layer6': 'cv',
        'layer7': 'cv',
        'layer8': 'ad',
        'layer9': 'cv',
        'layer10': 'cv',
        'layer11': 'cv',
        'layer12': 'ad',
        'layer13': 'cv',
        'layer14': 'cv',
        'layer15': 'cv',
    },
    'rvvv4': {
        'layer0': 'rd',
        'layer1': 'cv',
        'layer2': 'cv',
        'layer3': 'cv',
        'layer4': 'rd',
        'layer5': 'cv',
        'layer6': 'cv',
        'layer7': 'cv',
        'layer8': 'rd',
        'layer9': 'cv',
        'layer10': 'cv',
        'layer11': 'cv',
        'layer12': 'rd',
        'layer13': 'cv',
        'layer14': 'cv',
        'layer15': 'cv',
    },
    'cccv4': {
        'layer0': 'cd',
        'layer1': 'cd',
        'layer2': 'cd',
        'layer3': 'cv',
        'layer4': 'cd',
        'layer5': 'cd',
        'layer6': 'cd',
        'layer7': 'cv',
        'layer8': 'cd',
        'layer9': 'cd',
        'layer10': 'cd',
        'layer11': 'cv',
        'layer12': 'cd',
        'layer13': 'cd',
        'layer14': 'cd',
        'layer15': 'cv',
    },
    'aaav4': {
        'layer0': 'ad',
        'layer1': 'ad',
        'layer2': 'ad',
        'layer3': 'cv',
        'layer4': 'ad',
        'layer5': 'ad',
        'layer6': 'ad',
        'layer7': 'cv',
        'layer8': 'ad',
        'layer9': 'ad',
        'layer10': 'ad',
        'layer11': 'cv',
        'layer12': 'ad',
        'layer13': 'ad',
        'layer14': 'ad',
        'layer15': 'cv',
    },
    'rrrv4': {
        'layer0': 'rd',
        'layer1': 'rd',
        'layer2': 'rd',
        'layer3': 'cv',
        'layer4': 'rd',
        'layer5': 'rd',
        'layer6': 'rd',
        'layer7': 'cv',
        'layer8': 'rd',
        'layer9': 'rd',
        'layer10': 'rd',
        'layer11': 'cv',
        'layer12': 'rd',
        'layer13': 'rd',
        'layer14': 'rd',
        'layer15': 'cv',
    },
    'c16': {
        'layer0': 'cd',
        'layer1': 'cd',
        'layer2': 'cd',
        'layer3': 'cd',
        'layer4': 'cd',
        'layer5': 'cd',
        'layer6': 'cd',
        'layer7': 'cd',
        'layer8': 'cd',
        'layer9': 'cd',
        'layer10': 'cd',
        'layer11': 'cd',
        'layer12': 'cd',
        'layer13': 'cd',
        'layer14': 'cd',
        'layer15': 'cd',
    },
    'a16': {
        'layer0': 'ad',
        'layer1': 'ad',
        'layer2': 'ad',
        'layer3': 'ad',
        'layer4': 'ad',
        'layer5': 'ad',
        'layer6': 'ad',
        'layer7': 'ad',
        'layer8': 'ad',
        'layer9': 'ad',
        'layer10': 'ad',
        'layer11': 'ad',
        'layer12': 'ad',
        'layer13': 'ad',
        'layer14': 'ad',
        'layer15': 'ad',
    },
    'r16': {
        'layer0': 'rd',
        'layer1': 'rd',
        'layer2': 'rd',
        'layer3': 'rd',
        'layer4': 'rd',
        'layer5': 'rd',
        'layer6': 'rd',
        'layer7': 'rd',
        'layer8': 'rd',
        'layer9': 'rd',
        'layer10': 'rd',
        'layer11': 'rd',
        'layer12': 'rd',
        'layer13': 'rd',
        'layer14': 'rd',
        'layer15': 'rd',
    },
    'carv4': {
        'layer0': 'cd',
        'layer1': 'ad',
        'layer2': 'rd',
        'layer3': 'cv',
        'layer4': 'cd',
        'layer5': 'ad',
        'layer6': 'rd',
        'layer7': 'cv',
        'layer8': 'cd',
        'layer9': 'ad',
        'layer10': 'rd',
        'layer11': 'cv',
        'layer12': 'cd',
        'layer13': 'ad',
        'layer14': 'rd',
        'layer15': 'cv',
    },
}

def createConvFunc(op_type):
    assert op_type in ['cv', 'cd', 'ad', 'rd'], 'unknown op type: %s' % str(op_type)
    if op_type == 'cv':
        return F.conv2d

    if op_type == 'cd':
        def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
            assert dilation in [1, 2], 'dilation for cd_conv should be in 1 or 2'
            assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for cd_conv should be 3x3'
            assert padding == dilation, 'padding for cd_conv set wrong'

            weights_c = weights.sum(dim=[2, 3], keepdim=True)
            yc = F.conv2d(x, weights_c, stride=stride, padding=0, groups=groups)
            y = F.conv2d(x, weights, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
            return y - yc
        return func
    elif op_type == 'ad':
        def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
            assert dilation in [1, 2], 'dilation for ad_conv should be in 1 or 2'
            assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for ad_conv should be 3x3'
            assert padding == dilation, 'padding for ad_conv set wrong'

            shape = weights.shape
            weights = weights.view(shape[0], shape[1], -1)
            weights_conv = (weights - weights[:, :, [3, 0, 1, 6, 4, 2, 7, 8, 5]]).view(shape)  # clock-wise
            y = F.conv2d(x, weights_conv, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
            return y
        return func
    elif op_type == 'rd':
        def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
            assert dilation in [1, 2], 'dilation for rd_conv should be in 1 or 2'
            assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for rd_conv should be 3x3'
            padding = 2 * dilation

            shape = weights.shape
            if weights.is_cuda:
                buffer = torch.cuda.FloatTensor(shape[0], shape[1], 5 * 5).fill_(0)
            else:
                buffer = torch.zeros(shape[0], shape[1], 5 * 5)
            weights = weights.view(shape[0], shape[1], -1)
            buffer[:, :, [0, 2, 4, 10, 14, 20, 22, 24]] = weights[:, :, 1:]
            buffer[:, :, [6, 7, 8, 11, 13, 16, 17, 18]] = -weights[:, :, 1:]
            buffer[:, :, 12] = 0
            buffer = buffer.view(shape[0], shape[1], 5, 5)
            y = F.conv2d(x, buffer, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
            return y
        return func
    else:
        print('impossible to be here unless you force that')
        return None


class Conv2d(nn.Module):
    def __init__(self, pdc, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False):
        super(Conv2d, self).__init__()
        if in_channels % groups != 0:
            raise ValueError('in_channels must be divisible by groups')
        if out_channels % groups != 0:
            raise ValueError('out_channels must be divisible by groups')
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
        self.pdc = pdc

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        return self.pdc(input, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)


class CSAM(nn.Module):
    """
    Compact Spatial Attention Module
    """
    def __init__(self, channels):
        super(CSAM, self).__init__()

        mid_channels = 4
        self.relu1 = nn.ReLU()
        self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, padding=0)
        self.conv2 = nn.Conv2d(mid_channels, 1, kernel_size=3, padding=1, bias=False)
        self.sigmoid = nn.Sigmoid()
        nn.init.constant_(self.conv1.bias, 0)

    def forward(self, x):
        y = self.relu1(x)
        y = self.conv1(y)
        y = self.conv2(y)
        y = self.sigmoid(y)

        return x * y


class CDCM(nn.Module):
    """
    Compact Dilation Convolution based Module
    """
    def __init__(self, in_channels, out_channels):
        super(CDCM, self).__init__()

        self.relu1 = nn.ReLU()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)
        self.conv2_1 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=5, padding=5, bias=False)
        self.conv2_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=7, padding=7, bias=False)
        self.conv2_3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=9, padding=9, bias=False)
        self.conv2_4 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=11, padding=11, bias=False)
        nn.init.constant_(self.conv1.bias, 0)

    def forward(self, x):
        x = self.relu1(x)
        x = self.conv1(x)
        x1 = self.conv2_1(x)
        x2 = self.conv2_2(x)
        x3 = self.conv2_3(x)
        x4 = self.conv2_4(x)
        return x1 + x2 + x3 + x4


class MapReduce(nn.Module):
    """
    Reduce feature maps into a single edge map
    """
    def __init__(self, channels):
        super(MapReduce, self).__init__()
        self.conv = nn.Conv2d(channels, 1, kernel_size=1, padding=0)
        nn.init.constant_(self.conv.bias, 0)

    def forward(self, x):
        return self.conv(x)


class PDCBlock(nn.Module):
    def __init__(self, pdc, inplane, ouplane, stride=1):
        super(PDCBlock, self).__init__()
        self.stride = stride
        if self.stride > 1:
            self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
            self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
        self.conv1 = Conv2d(pdc, inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
        self.relu2 = nn.ReLU()
        self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)

    def forward(self, x):
        if self.stride > 1:
            x = self.pool(x)
        y = self.conv1(x)
        y = self.relu2(y)
        y = self.conv2(y)
        if self.stride > 1:
            x = self.shortcut(x)
        y = y + x
        return y


class PDCBlock_converted(nn.Module):
    """
    CPDC, APDC can be converted to vanilla 3x3 convolution
    RPDC can be converted to vanilla 5x5 convolution
    """
    def __init__(self, pdc, inplane, ouplane, stride=1):
        super(PDCBlock_converted, self).__init__()
        self.stride = stride

        if self.stride > 1:
            self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
            self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
        if pdc == 'rd':
            self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=5, padding=2, groups=inplane, bias=False)
        else:
            self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
        self.relu2 = nn.ReLU()
        self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)

    def forward(self, x):
        if self.stride > 1:
            x = self.pool(x)
        y = self.conv1(x)
        y = self.relu2(y)
        y = self.conv2(y)
        if self.stride > 1:
            x = self.shortcut(x)
        y = y + x
        return y


class PiDiNet(nn.Module):
    def __init__(self, inplane, pdcs, dil=None, sa=False, convert=False):
        super(PiDiNet, self).__init__()
        self.sa = sa
        if dil is not None:
            assert isinstance(dil, int), 'dil should be an int'
        self.dil = dil

        self.fuseplanes = []

        self.inplane = inplane
        if convert:
            if pdcs[0] == 'rd':
                init_kernel_size = 5
                init_padding = 2
            else:
                init_kernel_size = 3
                init_padding = 1
            self.init_block = nn.Conv2d(3, self.inplane,
                                        kernel_size=init_kernel_size, padding=init_padding, bias=False)
            block_class = PDCBlock_converted
        else:
            self.init_block = Conv2d(pdcs[0], 3, self.inplane, kernel_size=3, padding=1)
            block_class = PDCBlock

        self.block1_1 = block_class(pdcs[1], self.inplane, self.inplane)
        self.block1_2 = block_class(pdcs[2], self.inplane, self.inplane)
        self.block1_3 = block_class(pdcs[3], self.inplane, self.inplane)
        self.fuseplanes.append(self.inplane)  # C

        inplane = self.inplane
        self.inplane = self.inplane * 2
        self.block2_1 = block_class(pdcs[4], inplane, self.inplane, stride=2)
        self.block2_2 = block_class(pdcs[5], self.inplane, self.inplane)
        self.block2_3 = block_class(pdcs[6], self.inplane, self.inplane)
        self.block2_4 = block_class(pdcs[7], self.inplane, self.inplane)
        self.fuseplanes.append(self.inplane)  # 2C

        inplane = self.inplane
        self.inplane = self.inplane * 2
        self.block3_1 = block_class(pdcs[8], inplane, self.inplane, stride=2)
        self.block3_2 = block_class(pdcs[9], self.inplane, self.inplane)
        self.block3_3 = block_class(pdcs[10], self.inplane, self.inplane)
        self.block3_4 = block_class(pdcs[11], self.inplane, self.inplane)
        self.fuseplanes.append(self.inplane)  # 4C

        self.block4_1 = block_class(pdcs[12], self.inplane, self.inplane, stride=2)
        self.block4_2 = block_class(pdcs[13], self.inplane, self.inplane)
        self.block4_3 = block_class(pdcs[14], self.inplane, self.inplane)
        self.block4_4 = block_class(pdcs[15], self.inplane, self.inplane)
        self.fuseplanes.append(self.inplane)  # 4C

        self.conv_reduces = nn.ModuleList()
        if self.sa and self.dil is not None:
            self.attentions = nn.ModuleList()
            self.dilations = nn.ModuleList()
            for i in range(4):
                self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
                self.attentions.append(CSAM(self.dil))
                self.conv_reduces.append(MapReduce(self.dil))
        elif self.sa:
            self.attentions = nn.ModuleList()
            for i in range(4):
                self.attentions.append(CSAM(self.fuseplanes[i]))
                self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
        elif self.dil is not None:
            self.dilations = nn.ModuleList()
            for i in range(4):
                self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
                self.conv_reduces.append(MapReduce(self.dil))
        else:
            for i in range(4):
                self.conv_reduces.append(MapReduce(self.fuseplanes[i]))

        self.classifier = nn.Conv2d(4, 1, kernel_size=1)  # has bias
        nn.init.constant_(self.classifier.weight, 0.25)
        nn.init.constant_(self.classifier.bias, 0)

        # print('initialization done')

    def get_weights(self):
        conv_weights = []
        bn_weights = []
        relu_weights = []
        for pname, p in self.named_parameters():
            if 'bn' in pname:
                bn_weights.append(p)
            elif 'relu' in pname:
                relu_weights.append(p)
            else:
                conv_weights.append(p)

        return conv_weights, bn_weights, relu_weights

    def forward(self, x):
        H, W = x.size()[2:]

        x = self.init_block(x)

        x1 = self.block1_1(x)
        x1 = self.block1_2(x1)
        x1 = self.block1_3(x1)

        x2 = self.block2_1(x1)
        x2 = self.block2_2(x2)
        x2 = self.block2_3(x2)
        x2 = self.block2_4(x2)

        x3 = self.block3_1(x2)
        x3 = self.block3_2(x3)
        x3 = self.block3_3(x3)
        x3 = self.block3_4(x3)

        x4 = self.block4_1(x3)
        x4 = self.block4_2(x4)
        x4 = self.block4_3(x4)
        x4 = self.block4_4(x4)

        x_fuses = []
        if self.sa and self.dil is not None:
            for i, xi in enumerate([x1, x2, x3, x4]):
                x_fuses.append(self.attentions[i](self.dilations[i](xi)))
        elif self.sa:
            for i, xi in enumerate([x1, x2, x3, x4]):
                x_fuses.append(self.attentions[i](xi))
        elif self.dil is not None:
            for i, xi in enumerate([x1, x2, x3, x4]):
                x_fuses.append(self.dilations[i](xi))
        else:
            x_fuses = [x1, x2, x3, x4]

        e1 = self.conv_reduces[0](x_fuses[0])
        e1 = F.interpolate(e1, (H, W), mode="bilinear", align_corners=False)

        e2 = self.conv_reduces[1](x_fuses[1])
        e2 = F.interpolate(e2, (H, W), mode="bilinear", align_corners=False)

        e3 = self.conv_reduces[2](x_fuses[2])
        e3 = F.interpolate(e3, (H, W), mode="bilinear", align_corners=False)

        e4 = self.conv_reduces[3](x_fuses[3])
        e4 = F.interpolate(e4, (H, W), mode="bilinear", align_corners=False)

        outputs = [e1, e2, e3, e4]

        output = self.classifier(torch.cat(outputs, dim=1))
        # if not self.training:
        #     return torch.sigmoid(output)

        outputs.append(output)
        outputs = [torch.sigmoid(r) for r in outputs]
        return outputs


def config_model(model):
    model_options = list(nets.keys())
    assert model in model_options, \
        'unrecognized model, please choose from %s' % str(model_options)

    # print(str(nets[model]))

    pdcs = []
    for i in range(16):
        layer_name = 'layer%d' % i
        op = nets[model][layer_name]
        pdcs.append(createConvFunc(op))

    return pdcs


def pidinet():
    pdcs = config_model('carv4')
    dil = 24  # if args.dil else None
    return PiDiNet(60, pdcs, dil=dil, sa=True)


if __name__ == '__main__':
    model = pidinet()
    ckp = torch.load('table5_pidinet.pth')['state_dict']
    model.load_state_dict({k.replace('module.', ''): v for k, v in ckp.items()})
    im = cv2.imread('examples/test_my/cat_v4.png')
    im = img2tensor(im).unsqueeze(0) / 255.
    res = model(im)[-1]
    res = res > 0.5
    res = res.float()
    res = (res[0, 0].cpu().data.numpy() * 255.).astype(np.uint8)
    print(res.shape)
    cv2.imwrite('edge.png', res)
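A quick way to see why PDCBlock_converted's docstring says CPDC can become a vanilla 3x3 convolution: the 'cd' operation above is conv(x, W) minus a 1x1 convolution with sum(W), which is the same as a single plain convolution whose center tap has sum(W) subtracted. A minimal numeric check, not part of the committed file, assuming only createConvFunc from the code above and random tensors:

import torch
import torch.nn.functional as F

cd = createConvFunc('cd')
x = torch.randn(1, 4, 16, 16)
w = torch.randn(8, 4, 3, 3)

# central pixel difference convolution as defined in the file
y_cd = cd(x, w, padding=1, dilation=1)

# fold the kernel sum into the center tap and run one ordinary convolution
w_folded = w.clone()
w_folded[:, :, 1, 1] -= w.sum(dim=[2, 3])
y_plain = F.conv2d(x, w_folded, padding=1)

print(torch.allclose(y_cd, y_plain, atol=1e-5))  # expected: True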
extensions/microsoftexcel-controlnet/annotator/shuffle/__init__.py
ADDED
@@ -0,0 +1,74 @@
import random

import cv2
import numpy as np
from annotator.util import make_noise_disk, img2mask


class ContentShuffleDetector:
    def __call__(self, img, h=None, w=None, f=None):
        H, W, C = img.shape
        if h is None:
            h = H
        if w is None:
            w = W
        if f is None:
            f = 256
        x = make_noise_disk(h, w, 1, f) * float(W - 1)
        y = make_noise_disk(h, w, 1, f) * float(H - 1)
        flow = np.concatenate([x, y], axis=2).astype(np.float32)
        return cv2.remap(img, flow, None, cv2.INTER_LINEAR)


class ColorShuffleDetector:
    def __call__(self, img):
        H, W, C = img.shape
        F = np.random.randint(64, 384)
        A = make_noise_disk(H, W, 3, F)
        B = make_noise_disk(H, W, 3, F)
        C = (A + B) / 2.0
        A = (C + (A - C) * 3.0).clip(0, 1)
        B = (C + (B - C) * 3.0).clip(0, 1)
        L = img.astype(np.float32) / 255.0
        Y = A * L + B * (1 - L)
        Y -= np.min(Y, axis=(0, 1), keepdims=True)
        Y /= np.maximum(np.max(Y, axis=(0, 1), keepdims=True), 1e-5)
        Y *= 255.0
        return Y.clip(0, 255).astype(np.uint8)


class GrayDetector:
    def __call__(self, img):
        eps = 1e-5
        X = img.astype(np.float32)
        r, g, b = X[:, :, 0], X[:, :, 1], X[:, :, 2]
        kr, kg, kb = [random.random() + eps for _ in range(3)]
        ks = kr + kg + kb
        kr /= ks
        kg /= ks
        kb /= ks
        Y = r * kr + g * kg + b * kb
        Y = np.stack([Y] * 3, axis=2)
        return Y.clip(0, 255).astype(np.uint8)


class DownSampleDetector:
    def __call__(self, img, level=3, k=16.0):
        h = img.astype(np.float32)
        for _ in range(level):
            h += np.random.normal(loc=0.0, scale=k, size=h.shape)
            h = cv2.pyrDown(h)
        for _ in range(level):
            h = cv2.pyrUp(h)
        h += np.random.normal(loc=0.0, scale=k, size=h.shape)
        return h.clip(0, 255).astype(np.uint8)


class Image2MaskShuffleDetector:
    def __init__(self, resolution=(640, 512)):
        self.H, self.W = resolution

    def __call__(self, img):
        m = img2mask(img, self.H, self.W)
        m *= 255.0
        return m.clip(0, 255).astype(np.uint8)
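These detectors are plain callables over HWC uint8 images. A minimal usage sketch for ContentShuffleDetector, not part of the committed file; the input path is a placeholder:

import cv2

shuffle = ContentShuffleDetector()
img = cv2.imread('input.png')     # placeholder path, HWC BGR uint8
shuffled = shuffle(img, f=256)    # f sets the spatial scale of the noise disk used for the flow field
cv2.imwrite('shuffled.png', shuffled)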
extensions/microsoftexcel-controlnet/annotator/uniformer/LICENSE
ADDED
@@ -0,0 +1,203 @@
Copyright 2022 SenseTime X-Lab. All rights reserved.

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2022 SenseTime X-Lab.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
extensions/microsoftexcel-controlnet/annotator/uniformer/__init__.py
ADDED
@@ -0,0 +1,56 @@
import os
from annotator.annotator_path import models_path
from modules import devices
from annotator.uniformer.inference import init_segmentor, inference_segmentor, show_result_pyplot

try:
    from mmseg.core.evaluation import get_palette
except ImportError:
    from annotator.mmpkg.mmseg.core.evaluation import get_palette

modeldir = os.path.join(models_path, "uniformer")
checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth"
config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "upernet_global_small.py")
old_modeldir = os.path.dirname(os.path.realpath(__file__))
model = None

def unload_uniformer_model():
    global model
    if model is not None:
        model = model.cpu()

def apply_uniformer(img):
    global model
    if model is None:
        modelpath = os.path.join(modeldir, "upernet_global_small.pth")
        old_modelpath = os.path.join(old_modeldir, "upernet_global_small.pth")
        if os.path.exists(old_modelpath):
            modelpath = old_modelpath
        elif not os.path.exists(modelpath):
            from basicsr.utils.download_util import load_file_from_url
            load_file_from_url(checkpoint_file, model_dir=modeldir)

        model = init_segmentor(config_file, modelpath, device=devices.get_device_for("controlnet"))
        model = model.to(devices.get_device_for("controlnet"))

    if devices.get_device_for("controlnet").type == 'mps':
        # adaptive_avg_pool2d can fail on MPS, workaround with CPU
        import torch.nn.functional

        orig_adaptive_avg_pool2d = torch.nn.functional.adaptive_avg_pool2d
        def cpu_if_exception(input, *args, **kwargs):
            try:
                return orig_adaptive_avg_pool2d(input, *args, **kwargs)
            except:
                return orig_adaptive_avg_pool2d(input.cpu(), *args, **kwargs).to(input.device)

        try:
            torch.nn.functional.adaptive_avg_pool2d = cpu_if_exception
            result = inference_segmentor(model, img)
        finally:
            torch.nn.functional.adaptive_avg_pool2d = orig_adaptive_avg_pool2d
    else:
        result = inference_segmentor(model, img)

    res_img = show_result_pyplot(model, img, result, get_palette('ade'), opacity=1)
    return res_img
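A hedged usage sketch, not part of the committed file: apply_uniformer is meant to run inside the webui process (it imports modules.devices), takes an HWC numpy image, and is expected to return the color-coded ADE20K segmentation at roughly the input resolution.

import numpy as np

seg = apply_uniformer(np.zeros((512, 512, 3), dtype=np.uint8))  # dummy input image
print(seg.shape)           # expected to match the input spatial size
unload_uniformer_model()   # moves the cached model back to CPU when done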
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/ade20k.py
ADDED
@@ -0,0 +1,54 @@
# dataset settings
dataset_type = 'ADE20KDataset'
data_root = 'data/ade/ADEChallengeData2016'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=True),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/training',
        ann_dir='annotations/training',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/chase_db1.py
ADDED
@@ -0,0 +1,59 @@
# dataset settings
dataset_type = 'ChaseDB1Dataset'
data_root = 'data/CHASE_DB1'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (960, 999)
crop_size = (128, 128)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes.py
ADDED
@@ -0,0 +1,54 @@
# dataset settings
dataset_type = 'CityscapesDataset'
data_root = 'data/cityscapes/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 1024)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 1024),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='leftImg8bit/train',
        ann_dir='gtFine/train',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='leftImg8bit/val',
        ann_dir='gtFine/val',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='leftImg8bit/val',
        ann_dir='gtFine/val',
        pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes_769x769.py
ADDED
@@ -0,0 +1,35 @@
_base_ = './cityscapes.py'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (769, 769)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2049, 1025),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    train=dict(pipeline=train_pipeline),
    val=dict(pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline))
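The _base_ line above is mmcv-style config inheritance: the parent file is merged first and the keys defined here (crop_size, the two pipelines, data.*) override it. A minimal loading sketch, not part of the committed files, assuming mmcv is installed and the path is adjusted to your checkout:

from mmcv import Config

cfg = Config.fromfile('configs/_base_/datasets/cityscapes_769x769.py')  # placeholder path
print(cfg.data.train.pipeline[3])   # RandomCrop with the overridden crop_size=(769, 769)
print(cfg.data.samples_per_gpu)     # inherited unchanged from ./cityscapes.py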
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/drive.py
ADDED
@@ -0,0 +1,59 @@
# dataset settings
dataset_type = 'DRIVEDataset'
data_root = 'data/DRIVE'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (584, 565)
crop_size = (64, 64)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/hrf.py
ADDED
@@ -0,0 +1,59 @@
# dataset settings
dataset_type = 'HRFDataset'
data_root = 'data/HRF'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (2336, 3504)
crop_size = (256, 256)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context.py
ADDED
@@ -0,0 +1,60 @@
|
# dataset settings
dataset_type = 'PascalContextDataset'
data_root = 'data/VOCdevkit/VOC2010/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)

img_scale = (520, 520)
crop_size = (480, 480)

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/train.txt',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/val.txt',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClassContext',
        split='ImageSets/SegmentationContext/val.txt',
        pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context_59.py
ADDED
@@ -0,0 +1,60 @@
1 |
+
# dataset settings
|
2 |
+
dataset_type = 'PascalContextDataset59'
|
3 |
+
data_root = 'data/VOCdevkit/VOC2010/'
|
4 |
+
img_norm_cfg = dict(
|
5 |
+
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
|
6 |
+
|
7 |
+
img_scale = (520, 520)
|
8 |
+
crop_size = (480, 480)
|
9 |
+
|
10 |
+
train_pipeline = [
|
11 |
+
dict(type='LoadImageFromFile'),
|
12 |
+
dict(type='LoadAnnotations', reduce_zero_label=True),
|
13 |
+
dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
|
14 |
+
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
|
15 |
+
dict(type='RandomFlip', prob=0.5),
|
16 |
+
dict(type='PhotoMetricDistortion'),
|
17 |
+
dict(type='Normalize', **img_norm_cfg),
|
18 |
+
dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
|
19 |
+
dict(type='DefaultFormatBundle'),
|
20 |
+
dict(type='Collect', keys=['img', 'gt_semantic_seg']),
|
21 |
+
]
|
22 |
+
test_pipeline = [
|
23 |
+
dict(type='LoadImageFromFile'),
|
24 |
+
dict(
|
25 |
+
type='MultiScaleFlipAug',
|
26 |
+
img_scale=img_scale,
|
27 |
+
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
|
28 |
+
flip=False,
|
29 |
+
transforms=[
|
30 |
+
dict(type='Resize', keep_ratio=True),
|
31 |
+
dict(type='RandomFlip'),
|
32 |
+
dict(type='Normalize', **img_norm_cfg),
|
33 |
+
dict(type='ImageToTensor', keys=['img']),
|
34 |
+
dict(type='Collect', keys=['img']),
|
35 |
+
])
|
36 |
+
]
|
37 |
+
data = dict(
|
38 |
+
samples_per_gpu=4,
|
39 |
+
workers_per_gpu=4,
|
40 |
+
train=dict(
|
41 |
+
type=dataset_type,
|
42 |
+
data_root=data_root,
|
43 |
+
img_dir='JPEGImages',
|
44 |
+
ann_dir='SegmentationClassContext',
|
45 |
+
split='ImageSets/SegmentationContext/train.txt',
|
46 |
+
pipeline=train_pipeline),
|
47 |
+
val=dict(
|
48 |
+
type=dataset_type,
|
49 |
+
data_root=data_root,
|
50 |
+
img_dir='JPEGImages',
|
51 |
+
ann_dir='SegmentationClassContext',
|
52 |
+
split='ImageSets/SegmentationContext/val.txt',
|
53 |
+
pipeline=test_pipeline),
|
54 |
+
test=dict(
|
55 |
+
type=dataset_type,
|
56 |
+
data_root=data_root,
|
57 |
+
img_dir='JPEGImages',
|
58 |
+
ann_dir='SegmentationClassContext',
|
59 |
+
split='ImageSets/SegmentationContext/val.txt',
|
60 |
+
pipeline=test_pipeline))
|
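The functional difference from the 60-class PASCAL Context fragment above is reduce_zero_label=True in LoadAnnotations: label 0 (background) is mapped to the ignore index and every remaining id is shifted down by one, which is what turns the 60-label annotation into the 59-class variant. A small numpy sketch of that remapping, as an illustration of the idea rather than the vendored loader itself:

import numpy as np

def reduce_zero_label(seg: np.ndarray, ignore_index: int = 255) -> np.ndarray:
    """Map label 0 to ignore_index and shift all other labels down by one."""
    out = seg.astype(np.int64)
    out[out == 0] = ignore_index + 1   # park background temporarily out of range
    out -= 1                           # shift every id down by one
    out[out == ignore_index] = ignore_index  # background ends up at the ignore index
    return out

seg = np.array([[0, 1, 2], [3, 0, 60]])
print(reduce_zero_label(seg))
# [[255   0   1]
#  [  2 255  59]]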
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12.py
ADDED
@@ -0,0 +1,57 @@
# dataset settings
dataset_type = 'PascalVOCDataset'
data_root = 'data/VOCdevkit/VOC2012'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/train.txt',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/val.txt',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/val.txt',
        pipeline=test_pipeline))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py
ADDED
@@ -0,0 +1,9 @@
_base_ = './pascal_voc12.py'
# dataset settings
data = dict(
    train=dict(
        ann_dir=['SegmentationClass', 'SegmentationClassAug'],
        split=[
            'ImageSets/Segmentation/train.txt',
            'ImageSets/Segmentation/aug.txt'
        ]))
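This nine-line file only overrides data.train; the pipelines, normalization settings, and val/test splits are all inherited from ./pascal_voc12.py when mmcv resolves the _base_ chain. A quick way to see the merged result, again assuming an mmcv 1.x-style Config API and the repository-relative path:

from mmcv import Config

cfg = Config.fromfile(
    'extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py')

# Overridden by this file: ann_dir and split become lists covering both annotation sets.
print(cfg.data.train.ann_dir)  # ['SegmentationClass', 'SegmentationClassAug']
print(cfg.data.train.split)    # ['ImageSets/Segmentation/train.txt', 'ImageSets/Segmentation/aug.txt']

# Inherited untouched from pascal_voc12.py:
print(cfg.data.train.img_dir)  # 'JPEGImages'
print(cfg.crop_size)           # (512, 512)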
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/datasets/stare.py
ADDED
@@ -0,0 +1,59 @@
# dataset settings
dataset_type = 'STAREDataset'
data_root = 'data/STARE'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (605, 700)
crop_size = (128, 128)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=img_scale,
        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]

data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
    train=dict(
        type='RepeatDataset',
        times=40000,
        dataset=dict(
            type=dataset_type,
            data_root=data_root,
            img_dir='images/training',
            ann_dir='annotations/training',
            pipeline=train_pipeline)),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir='images/validation',
        ann_dir='annotations/validation',
        pipeline=test_pipeline))
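STARE has a very small training split, so the fragment wraps the training set in a RepeatDataset with times=40000: the wrapper simply presents the same samples over and over so that an iteration-based runner never exhausts the loader. Conceptually it behaves like the illustrative reimplementation below (not the class used by the vendored mmseg code):

class RepeatDataset:
    """Present `dataset` as if it were `times` copies concatenated back to back."""

    def __init__(self, dataset, times: int):
        self.dataset = dataset
        self.times = times

    def __len__(self) -> int:
        return self.times * len(self.dataset)

    def __getitem__(self, idx: int):
        return self.dataset[idx % len(self.dataset)]

# With N real training images the wrapped dataset reports N * 40000 samples,
# far more than a typical 40k- or 80k-iteration schedule will ever consume.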
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/default_runtime.py
ADDED
@@ -0,0 +1,14 @@
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook', by_epoch=False),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
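The commented-out hook is the usual switch for TensorBoard logging; a downstream config (or a quick edit of this fragment) just adds it back to the hooks list, assuming the tensorboard package is installed:

# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook', by_epoch=False),
        dict(type='TensorboardLoggerHook')  # requires tensorboard to be installed
    ])
# yapf:enable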
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py
ADDED
@@ -0,0 +1,46 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='ANNHead',
        in_channels=[1024, 2048],
        in_index=[2, 3],
        channels=512,
        project_channels=256,
        query_scales=(1, ),
        key_pool_scales=(1, 3, 6, 8),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
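On its own this fragment only defines the model dict; something has to instantiate it. A rough sketch of how mmsegmentation-style code turns it into a network, assuming the upstream mmseg 0.x builders are importable (the vendored tree ships its own copy under annotator.uniformer.mmseg, used here purely for illustration):

from mmcv import Config
from mmseg.models import build_segmentor  # or the equivalent builder in annotator.uniformer.mmseg

cfg = Config.fromfile(
    'extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py')

# train_cfg/test_cfg already live inside cfg.model in this config style.
model = build_segmentor(cfg.model)
print(type(model).__name__)  # EncoderDecoder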
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py
ADDED
@@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='APCHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        pool_scales=(1, 2, 3, 6),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py
ADDED
@@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='CCHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        recurrence=2,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/cgnet.py
ADDED
@@ -0,0 +1,35 @@
# model settings
norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='CGNet',
        norm_cfg=norm_cfg,
        in_channels=3,
        num_channels=(32, 64, 128),
        num_blocks=(3, 21),
        dilations=(2, 4),
        reductions=(8, 16)),
    decode_head=dict(
        type='FCNHead',
        in_channels=256,
        in_index=2,
        channels=256,
        num_convs=0,
        concat_input=False,
        dropout_ratio=0,
        num_classes=19,
        norm_cfg=norm_cfg,
        loss_decode=dict(
            type='CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0,
            class_weight=[
                2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
                10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
                10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
                10.396974, 10.055647
            ])),
    # model training and testing settings
    train_cfg=dict(sampler=None),
    test_cfg=dict(mode='whole'))
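The 19 entries in class_weight are per-class weights matching the 19-class (Cityscapes-style) label set; the larger values counteract the pixel-frequency imbalance between dominant and rare classes. In plain PyTorch terms the effect is the same as handing a weight vector to a weighted cross-entropy, shown here as an illustration of the mechanism rather than the mmseg loss module itself:

import torch
import torch.nn as nn

class_weight = torch.tensor([
    2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
    10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
    10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
    10.396974, 10.055647])

# 255 is the padding/ignore label used by the dataset pipelines above (seg_pad_val=255).
criterion = nn.CrossEntropyLoss(weight=class_weight, ignore_index=255)

logits = torch.randn(2, 19, 64, 64)         # N x C x H x W class scores
target = torch.randint(0, 19, (2, 64, 64))  # N x H x W ground-truth class ids
loss = criterion(logits, target)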
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py
ADDED
@@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='DAHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        pam_channels=64,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
extensions/microsoftexcel-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py
ADDED
@@ -0,0 +1,44 @@
# model settings
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='open-mmlab://resnet50_v1c',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='ASPPHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        dilations=(1, 12, 24, 36),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=1024,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    # model training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'))
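These _base_ fragments are composed by small top-level configs rather than edited directly. A hypothetical composition is sketched below; the file name, its location, and the schedule fragment are illustrative, and only the three _base_ files referenced are part of this commit:

# e.g. configs/deeplabv3/deeplabv3_r50-d8_512x512_voc12aug.py (hypothetical path)
_base_ = [
    '../_base_/models/deeplabv3_r50-d8.py',
    '../_base_/datasets/pascal_voc12_aug.py',
    '../_base_/default_runtime.py',
    # '../_base_/schedules/schedule_20k.py',  # schedule fragment, not part of this diff
]
# Override what the generic model fragment leaves at its 19-class default:
model = dict(
    decode_head=dict(num_classes=21),
    auxiliary_head=dict(num_classes=21))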