`_.
+
+ Args:
+ loss_type (str, optional): Binary or multi-class loss.
+ Default: 'multi_class'. Options are "binary" and "multi_class".
+ classes (str | list[int], optional): Classes chosen to calculate loss.
+ 'all' for all classes, 'present' for classes present in labels, or
+ a list of classes to average. Default: 'present'.
+ per_image (bool, optional): If per_image is True, compute the loss per
+ image instead of per batch. Default: False.
+ reduction (str, optional): The method used to reduce the loss. Options
+ are "none", "mean" and "sum". This parameter only works when
+ per_image is True. Default: 'mean'.
+ class_weight (list[float] | str, optional): Weight of each class. If in
+ str format, read them from a file. Defaults to None.
+ loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+ """
+
+ def __init__(self,
+ loss_type='multi_class',
+ classes='present',
+ per_image=False,
+ reduction='mean',
+ class_weight=None,
+ loss_weight=1.0):
+ super(LovaszLoss, self).__init__()
+ assert loss_type in ('binary', 'multi_class'), "loss_type should be \
+ 'binary' or 'multi_class'."
+
+ if loss_type == 'binary':
+ self.cls_criterion = lovasz_hinge
+ else:
+ self.cls_criterion = lovasz_softmax
+ assert classes in ('all', 'present') or mmcv.is_list_of(classes, int)
+ if not per_image:
+ assert reduction == 'none', "reduction should be 'none' when \
+ per_image is False."
+
+ self.classes = classes
+ self.per_image = per_image
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+ self.class_weight = get_class_weight(class_weight)
+
+ def forward(self,
+ cls_score,
+ label,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None,
+ **kwargs):
+ """Forward function."""
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (
+ reduction_override if reduction_override else self.reduction)
+ if self.class_weight is not None:
+ class_weight = cls_score.new_tensor(self.class_weight)
+ else:
+ class_weight = None
+
+ # if multi-class loss, transform logits to probs
+ if self.cls_criterion == lovasz_softmax:
+ cls_score = F.softmax(cls_score, dim=1)
+
+ loss_cls = self.loss_weight * self.cls_criterion(
+ cls_score,
+ label,
+ self.classes,
+ self.per_image,
+ class_weight=class_weight,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ **kwargs)
+ return loss_cls
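A minimal usage sketch of the loss class above, assuming the module is importable as `annotator.mmpkg.mmseg.models.losses.lovasz_loss` (in an mmseg config it would normally be built through the LOSSES registry instead):

```python
import torch

# Assumed import path for the class defined in this file.
from annotator.mmpkg.mmseg.models.losses.lovasz_loss import LovaszLoss

# Multi-class logits of shape (N, C, H, W) and integer labels of shape (N, H, W).
cls_score = torch.randn(2, 4, 8, 8)
label = torch.randint(0, 4, (2, 8, 8))

# With per_image=False the constructor requires reduction='none'; the loss is
# then computed over the whole batch and returned as a scalar.
criterion = LovaszLoss(loss_type='multi_class', per_image=False, reduction='none')
loss = criterion(cls_score, label)
print(loss)  # scalar tensor: Lovasz-Softmax loss scaled by loss_weight
```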
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/losses/utils.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/losses/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2afb477a153ba9dead71066fa66ee024482afd82
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/losses/utils.py
@@ -0,0 +1,121 @@
+import functools
+
+import annotator.mmpkg.mmcv as mmcv
+import numpy as np
+import torch.nn.functional as F
+
+
+def get_class_weight(class_weight):
+ """Get class weight for loss function.
+
+ Args:
+ class_weight (list[float] | str | None): If class_weight is a str,
+ take it as a file name and read from it.
+ """
+ if isinstance(class_weight, str):
+ # take it as a file path
+ if class_weight.endswith('.npy'):
+ class_weight = np.load(class_weight)
+ else:
+ # pkl, json or yaml
+ class_weight = mmcv.load(class_weight)
+
+ return class_weight
+
+
+def reduce_loss(loss, reduction):
+ """Reduce loss as specified.
+
+ Args:
+ loss (Tensor): Elementwise loss tensor.
+ reduction (str): Options are "none", "mean" and "sum".
+
+ Return:
+ Tensor: Reduced loss tensor.
+ """
+ reduction_enum = F._Reduction.get_enum(reduction)
+ # none: 0, elementwise_mean:1, sum: 2
+ if reduction_enum == 0:
+ return loss
+ elif reduction_enum == 1:
+ return loss.mean()
+ elif reduction_enum == 2:
+ return loss.sum()
+
+
+def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
+ """Apply element-wise weight and reduce loss.
+
+ Args:
+ loss (Tensor): Element-wise loss.
+ weight (Tensor): Element-wise weights.
+ reduction (str): Same as built-in losses of PyTorch.
+ avg_factor (float): Average factor when computing the mean of losses.
+
+ Returns:
+ Tensor: Processed loss values.
+ """
+ # if weight is specified, apply element-wise weight
+ if weight is not None:
+ assert weight.dim() == loss.dim()
+ if weight.dim() > 1:
+ assert weight.size(1) == 1 or weight.size(1) == loss.size(1)
+ loss = loss * weight
+
+ # if avg_factor is not specified, just reduce the loss
+ if avg_factor is None:
+ loss = reduce_loss(loss, reduction)
+ else:
+ # if reduction is mean, then average the loss by avg_factor
+ if reduction == 'mean':
+ loss = loss.sum() / avg_factor
+ # if reduction is 'none', then do nothing, otherwise raise an error
+ elif reduction != 'none':
+ raise ValueError('avg_factor can not be used with reduction="sum"')
+ return loss
+
+
+def weighted_loss(loss_func):
+ """Create a weighted version of a given loss function.
+
+ To use this decorator, the loss function must have the signature like
+ `loss_func(pred, target, **kwargs)`. The function only needs to compute
+ element-wise loss without any reduction. This decorator will add weight
+ and reduction arguments to the function. The decorated function will have
+ the signature like `loss_func(pred, target, weight=None, reduction='mean',
+ avg_factor=None, **kwargs)`.
+
+ :Example:
+
+ >>> import torch
+ >>> @weighted_loss
+ >>> def l1_loss(pred, target):
+ >>> return (pred - target).abs()
+
+ >>> pred = torch.Tensor([0, 2, 3])
+ >>> target = torch.Tensor([1, 1, 1])
+ >>> weight = torch.Tensor([1, 0, 1])
+
+ >>> l1_loss(pred, target)
+ tensor(1.3333)
+ >>> l1_loss(pred, target, weight)
+ tensor(1.)
+ >>> l1_loss(pred, target, reduction='none')
+ tensor([1., 1., 2.])
+ >>> l1_loss(pred, target, weight, avg_factor=2)
+ tensor(1.5000)
+ """
+
+ @functools.wraps(loss_func)
+ def wrapper(pred,
+ target,
+ weight=None,
+ reduction='mean',
+ avg_factor=None,
+ **kwargs):
+ # get element-wise loss
+ loss = loss_func(pred, target, **kwargs)
+ loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+ return loss
+
+ return wrapper
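A small illustration of how `weight_reduce_loss` combines the element-wise weight with `avg_factor`; the import path mirrors this file and is assumed:

```python
import torch

from annotator.mmpkg.mmseg.models.losses.utils import weight_reduce_loss  # assumed path

loss = torch.tensor([1.0, 2.0, 3.0, 4.0])    # element-wise losses
weight = torch.tensor([1.0, 1.0, 0.0, 0.0])  # mask out the last two elements

# plain 'mean': average the weighted losses over all elements -> (1 + 2) / 4
print(weight_reduce_loss(loss, weight, reduction='mean'))                # tensor(0.7500)

# with avg_factor: sum of the weighted losses divided by avg_factor -> (1 + 2) / 2
print(weight_reduce_loss(loss, weight, reduction='mean', avg_factor=2))  # tensor(1.5000)
```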
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/__init__.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b9d3d5b3fe80247642d962edd6fb787537d01d6
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/__init__.py
@@ -0,0 +1,4 @@
+from .fpn import FPN
+from .multilevel_neck import MultiLevelNeck
+
+__all__ = ['FPN', 'MultiLevelNeck']
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/fpn.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba47bbe1a0225587315627ac288e5ddf6497a244
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/fpn.py
@@ -0,0 +1,212 @@
+import torch.nn as nn
+import torch.nn.functional as F
+from annotator.mmpkg.mmcv.cnn import ConvModule, xavier_init
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class FPN(nn.Module):
+ """Feature Pyramid Network.
+
+ This is an implementation of - Feature Pyramid Networks for Object
+ Detection (https://arxiv.org/abs/1612.03144)
+
+ Args:
+ in_channels (List[int]): Number of input channels per scale.
+ out_channels (int): Number of output channels (used at each scale)
+ num_outs (int): Number of output scales.
+ start_level (int): Index of the start input backbone level used to
+ build the feature pyramid. Default: 0.
+ end_level (int): Index of the end input backbone level (exclusive) to
+ build the feature pyramid. Default: -1, which means the last level.
+ add_extra_convs (bool | str): If bool, it decides whether to add conv
+ layers on top of the original feature maps. Default to False.
+ If True, its actual mode is specified by `extra_convs_on_inputs`.
+ If str, it specifies the source feature map of the extra convs.
+ Only the following options are allowed
+
+ - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+ - 'on_lateral': Last feature map after lateral convs.
+ - 'on_output': The last output feature map after fpn convs.
+ extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs
+ on the original feature from the backbone. If True,
+ it is equivalent to `add_extra_convs='on_input'`. If False, it is
+ equivalent to setting `add_extra_convs='on_output'`. Default: False.
+ relu_before_extra_convs (bool): Whether to apply relu before the extra
+ conv. Default: False.
+ no_norm_on_lateral (bool): Whether to apply norm on lateral.
+ Default: False.
+ conv_cfg (dict): Config dict for convolution layer. Default: None.
+ norm_cfg (dict): Config dict for normalization layer. Default: None.
+ act_cfg (dict): Config dict for activation layer in ConvModule.
+ Default: None.
+ upsample_cfg (dict): Config dict for interpolate layer.
+ Default: `dict(mode='nearest')`
+
+ Example:
+ >>> import torch
+ >>> in_channels = [2, 3, 5, 7]
+ >>> scales = [340, 170, 84, 43]
+ >>> inputs = [torch.rand(1, c, s, s)
+ ... for c, s in zip(in_channels, scales)]
+ >>> self = FPN(in_channels, 11, len(in_channels)).eval()
+ >>> outputs = self.forward(inputs)
+ >>> for i in range(len(outputs)):
+ ... print(f'outputs[{i}].shape = {outputs[i].shape}')
+ outputs[0].shape = torch.Size([1, 11, 340, 340])
+ outputs[1].shape = torch.Size([1, 11, 170, 170])
+ outputs[2].shape = torch.Size([1, 11, 84, 84])
+ outputs[3].shape = torch.Size([1, 11, 43, 43])
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ num_outs,
+ start_level=0,
+ end_level=-1,
+ add_extra_convs=False,
+ extra_convs_on_inputs=False,
+ relu_before_extra_convs=False,
+ no_norm_on_lateral=False,
+ conv_cfg=None,
+ norm_cfg=None,
+ act_cfg=None,
+ upsample_cfg=dict(mode='nearest')):
+ super(FPN, self).__init__()
+ assert isinstance(in_channels, list)
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.num_ins = len(in_channels)
+ self.num_outs = num_outs
+ self.relu_before_extra_convs = relu_before_extra_convs
+ self.no_norm_on_lateral = no_norm_on_lateral
+ self.fp16_enabled = False
+ self.upsample_cfg = upsample_cfg.copy()
+
+ if end_level == -1:
+ self.backbone_end_level = self.num_ins
+ assert num_outs >= self.num_ins - start_level
+ else:
+ # if end_level < inputs, no extra level is allowed
+ self.backbone_end_level = end_level
+ assert end_level <= len(in_channels)
+ assert num_outs == end_level - start_level
+ self.start_level = start_level
+ self.end_level = end_level
+ self.add_extra_convs = add_extra_convs
+ assert isinstance(add_extra_convs, (str, bool))
+ if isinstance(add_extra_convs, str):
+ # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
+ assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
+ elif add_extra_convs: # True
+ if extra_convs_on_inputs:
+ # For compatibility with previous release
+ # TODO: deprecate `extra_convs_on_inputs`
+ self.add_extra_convs = 'on_input'
+ else:
+ self.add_extra_convs = 'on_output'
+
+ self.lateral_convs = nn.ModuleList()
+ self.fpn_convs = nn.ModuleList()
+
+ for i in range(self.start_level, self.backbone_end_level):
+ l_conv = ConvModule(
+ in_channels[i],
+ out_channels,
+ 1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
+ act_cfg=act_cfg,
+ inplace=False)
+ fpn_conv = ConvModule(
+ out_channels,
+ out_channels,
+ 3,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg,
+ inplace=False)
+
+ self.lateral_convs.append(l_conv)
+ self.fpn_convs.append(fpn_conv)
+
+ # add extra conv layers (e.g., RetinaNet)
+ extra_levels = num_outs - self.backbone_end_level + self.start_level
+ if self.add_extra_convs and extra_levels >= 1:
+ for i in range(extra_levels):
+ if i == 0 and self.add_extra_convs == 'on_input':
+ in_channels = self.in_channels[self.backbone_end_level - 1]
+ else:
+ in_channels = out_channels
+ extra_fpn_conv = ConvModule(
+ in_channels,
+ out_channels,
+ 3,
+ stride=2,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg,
+ inplace=False)
+ self.fpn_convs.append(extra_fpn_conv)
+
+ # default init_weights for conv(msra) and norm in ConvModule
+ def init_weights(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ xavier_init(m, distribution='uniform')
+
+ def forward(self, inputs):
+ assert len(inputs) == len(self.in_channels)
+
+ # build laterals
+ laterals = [
+ lateral_conv(inputs[i + self.start_level])
+ for i, lateral_conv in enumerate(self.lateral_convs)
+ ]
+
+ # build top-down path
+ used_backbone_levels = len(laterals)
+ for i in range(used_backbone_levels - 1, 0, -1):
+ # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
+ # it cannot co-exist with `size` in `F.interpolate`.
+ if 'scale_factor' in self.upsample_cfg:
+ laterals[i - 1] += F.interpolate(laterals[i],
+ **self.upsample_cfg)
+ else:
+ prev_shape = laterals[i - 1].shape[2:]
+ laterals[i - 1] += F.interpolate(
+ laterals[i], size=prev_shape, **self.upsample_cfg)
+
+ # build outputs
+ # part 1: from original levels
+ outs = [
+ self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
+ ]
+ # part 2: add extra levels
+ if self.num_outs > len(outs):
+ # use max pool to get more levels on top of outputs
+ # (e.g., Faster R-CNN, Mask R-CNN)
+ if not self.add_extra_convs:
+ for i in range(self.num_outs - used_backbone_levels):
+ outs.append(F.max_pool2d(outs[-1], 1, stride=2))
+ # add conv layers on top of original feature maps (RetinaNet)
+ else:
+ if self.add_extra_convs == 'on_input':
+ extra_source = inputs[self.backbone_end_level - 1]
+ elif self.add_extra_convs == 'on_lateral':
+ extra_source = laterals[-1]
+ elif self.add_extra_convs == 'on_output':
+ extra_source = outs[-1]
+ else:
+ raise NotImplementedError
+ outs.append(self.fpn_convs[used_backbone_levels](extra_source))
+ for i in range(used_backbone_levels + 1, self.num_outs):
+ if self.relu_before_extra_convs:
+ outs.append(self.fpn_convs[i](F.relu(outs[-1])))
+ else:
+ outs.append(self.fpn_convs[i](outs[-1]))
+ return tuple(outs)
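Building on the example in the docstring above, a hedged sketch of the `add_extra_convs='on_input'` path, where two extra stride-2 levels are produced on top of the four backbone levels (import path assumed):

```python
import torch

from annotator.mmpkg.mmseg.models.necks.fpn import FPN  # assumed import path

in_channels = [2, 3, 5, 7]
scales = [32, 16, 8, 4]
inputs = [torch.rand(1, c, s, s) for c, s in zip(in_channels, scales)]

# num_outs=6 asks for two levels beyond the backbone; with 'on_input' they are
# built by stride-2 3x3 convs starting from the last backbone feature map.
neck = FPN(in_channels, out_channels=8, num_outs=6,
           add_extra_convs='on_input').eval()
outs = neck(inputs)
for i, out in enumerate(outs):
    print(f'outs[{i}].shape = {out.shape}')
# outs[0..3]: (1, 8, 32, 32) ... (1, 8, 4, 4), then (1, 8, 2, 2) and (1, 8, 1, 1)
```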
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/multilevel_neck.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/multilevel_neck.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b86c073cd1a72354d2426846125e80f7ab20dbc
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/necks/multilevel_neck.py
@@ -0,0 +1,70 @@
+import torch.nn as nn
+import torch.nn.functional as F
+from annotator.mmpkg.mmcv.cnn import ConvModule
+
+from ..builder import NECKS
+
+
+@NECKS.register_module()
+class MultiLevelNeck(nn.Module):
+ """MultiLevelNeck.
+
+ A neck structure that connects the ViT backbone and the decode heads.
+
+ Args:
+ in_channels (List[int]): Number of input channels per scale.
+ out_channels (int): Number of output channels (used at each scale).
+ scales (List[int]): Scale factors for each input feature map.
+ norm_cfg (dict): Config dict for normalization layer. Default: None.
+ act_cfg (dict): Config dict for activation layer in ConvModule.
+ Default: None.
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ scales=[0.5, 1, 2, 4],
+ norm_cfg=None,
+ act_cfg=None):
+ super(MultiLevelNeck, self).__init__()
+ assert isinstance(in_channels, list)
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.scales = scales
+ self.num_outs = len(scales)
+ self.lateral_convs = nn.ModuleList()
+ self.convs = nn.ModuleList()
+ for in_channel in in_channels:
+ self.lateral_convs.append(
+ ConvModule(
+ in_channel,
+ out_channels,
+ kernel_size=1,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+ for _ in range(self.num_outs):
+ self.convs.append(
+ ConvModule(
+ out_channels,
+ out_channels,
+ kernel_size=3,
+ padding=1,
+ stride=1,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+
+ def forward(self, inputs):
+ assert len(inputs) == len(self.in_channels)
+ inputs = [
+ lateral_conv(inputs[i])
+ for i, lateral_conv in enumerate(self.lateral_convs)
+ ]
+ # when a single feature map is given, replicate it to num_outs levels
+ if len(inputs) == 1:
+ inputs = [inputs[0] for _ in range(self.num_outs)]
+ outs = []
+ for i in range(self.num_outs):
+ x_resize = F.interpolate(
+ inputs[i], scale_factor=self.scales[i], mode='bilinear')
+ outs.append(self.convs[i](x_resize))
+ return tuple(outs)
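A hedged sketch of the single-input case handled by the `len(inputs) == 1` branch above: one ViT feature map is replicated and resized into a multi-level pyramid (import path assumed):

```python
import torch

from annotator.mmpkg.mmseg.models.necks.multilevel_neck import MultiLevelNeck  # assumed path

# A single ViT feature map is replicated, resized by each scale factor, and
# refined by a 3x3 conv to form a 4-level pyramid.
neck = MultiLevelNeck(in_channels=[768], out_channels=256,
                      scales=[0.5, 1, 2, 4]).eval()
feat = torch.rand(1, 768, 32, 32)
outs = neck([feat])
print([tuple(o.shape) for o in outs])
# [(1, 256, 16, 16), (1, 256, 32, 32), (1, 256, 64, 64), (1, 256, 128, 128)]
```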
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/__init__.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dca2f09405330743c476e190896bee39c45498ea
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/__init__.py
@@ -0,0 +1,5 @@
+from .base import BaseSegmentor
+from .cascade_encoder_decoder import CascadeEncoderDecoder
+from .encoder_decoder import EncoderDecoder
+
+__all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder']
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/base.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a12d8beb8ea40bfa234197eddb4d3ef40dbfeb6f
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/base.py
@@ -0,0 +1,273 @@
+import logging
+import warnings
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+
+import annotator.mmpkg.mmcv as mmcv
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from annotator.mmpkg.mmcv.runner import auto_fp16
+
+
+class BaseSegmentor(nn.Module):
+ """Base class for segmentors."""
+
+ __metaclass__ = ABCMeta
+
+ def __init__(self):
+ super(BaseSegmentor, self).__init__()
+ self.fp16_enabled = False
+
+ @property
+ def with_neck(self):
+ """bool: whether the segmentor has neck"""
+ return hasattr(self, 'neck') and self.neck is not None
+
+ @property
+ def with_auxiliary_head(self):
+ """bool: whether the segmentor has auxiliary head"""
+ return hasattr(self,
+ 'auxiliary_head') and self.auxiliary_head is not None
+
+ @property
+ def with_decode_head(self):
+ """bool: whether the segmentor has decode head"""
+ return hasattr(self, 'decode_head') and self.decode_head is not None
+
+ @abstractmethod
+ def extract_feat(self, imgs):
+ """Placeholder for extract features from images."""
+ pass
+
+ @abstractmethod
+ def encode_decode(self, img, img_metas):
+ """Placeholder for encode images with backbone and decode into a
+ semantic segmentation map of the same size as input."""
+ pass
+
+ @abstractmethod
+ def forward_train(self, imgs, img_metas, **kwargs):
+ """Placeholder for Forward function for training."""
+ pass
+
+ @abstractmethod
+ def simple_test(self, img, img_meta, **kwargs):
+ """Placeholder for single image test."""
+ pass
+
+ @abstractmethod
+ def aug_test(self, imgs, img_metas, **kwargs):
+ """Placeholder for augmentation test."""
+ pass
+
+ def init_weights(self, pretrained=None):
+ """Initialize the weights in segmentor.
+
+ Args:
+ pretrained (str, optional): Path to pre-trained weights.
+ Defaults to None.
+ """
+ if pretrained is not None:
+ logger = logging.getLogger()
+ logger.info(f'load model from: {pretrained}')
+
+ def forward_test(self, imgs, img_metas, **kwargs):
+ """
+ Args:
+ imgs (List[Tensor]): the outer list indicates test-time
+ augmentations and inner Tensor should have a shape NxCxHxW,
+ which contains all images in the batch.
+ img_metas (List[List[dict]]): the outer list indicates test-time
+ augs (multiscale, flip, etc.) and the inner list indicates
+ images in a batch.
+ """
+ for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]:
+ if not isinstance(var, list):
+ raise TypeError(f'{name} must be a list, but got '
+ f'{type(var)}')
+
+ num_augs = len(imgs)
+ if num_augs != len(img_metas):
+ raise ValueError(f'num of augmentations ({len(imgs)}) != '
+ f'num of image meta ({len(img_metas)})')
+ # all images in the same aug batch should have the same ori_shape and
+ # pad_shape
+ for img_meta in img_metas:
+ ori_shapes = [_['ori_shape'] for _ in img_meta]
+ assert all(shape == ori_shapes[0] for shape in ori_shapes)
+ img_shapes = [_['img_shape'] for _ in img_meta]
+ assert all(shape == img_shapes[0] for shape in img_shapes)
+ pad_shapes = [_['pad_shape'] for _ in img_meta]
+ assert all(shape == pad_shapes[0] for shape in pad_shapes)
+
+ if num_augs == 1:
+ return self.simple_test(imgs[0], img_metas[0], **kwargs)
+ else:
+ return self.aug_test(imgs, img_metas, **kwargs)
+
+ @auto_fp16(apply_to=('img', ))
+ def forward(self, img, img_metas, return_loss=True, **kwargs):
+ """Calls either :func:`forward_train` or :func:`forward_test` depending
+ on whether ``return_loss`` is ``True``.
+
+ Note this setting will change the expected inputs. When
+ ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
+ and List[dict]), and when ``return_loss=False``, img and img_meta
+ should be double nested (i.e. List[Tensor], List[List[dict]]), with
+ the outer list indicating test time augmentations.
+ """
+ if return_loss:
+ return self.forward_train(img, img_metas, **kwargs)
+ else:
+ return self.forward_test(img, img_metas, **kwargs)
+
+ def train_step(self, data_batch, optimizer, **kwargs):
+ """The iteration step during training.
+
+ This method defines an iteration step during training, except for the
+ back propagation and optimizer updating, which are done in an optimizer
+ hook. Note that in some complicated cases or models, the whole process
+ including back propagation and optimizer updating is also defined in
+ this method, such as GAN.
+
+ Args:
+ data_batch (dict): The output of dataloader.
+ optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
+ runner is passed to ``train_step()``. This argument is unused
+ and reserved.
+
+ Returns:
+ dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
+ ``num_samples``.
+ ``loss`` is a tensor for back propagation, which can be a
+ weighted sum of multiple losses.
+ ``log_vars`` contains all the variables to be sent to the
+ logger.
+ ``num_samples`` indicates the batch size (when the model is
+ DDP, it means the batch size on each GPU), which is used for
+ averaging the logs.
+ """
+ losses = self(**data_batch)
+ loss, log_vars = self._parse_losses(losses)
+
+ outputs = dict(
+ loss=loss,
+ log_vars=log_vars,
+ num_samples=len(data_batch['img_metas']))
+
+ return outputs
+
+ def val_step(self, data_batch, **kwargs):
+ """The iteration step during validation.
+
+ This method shares the same signature as :func:`train_step`, but used
+ during val epochs. Note that the evaluation after training epochs is
+ not implemented with this method, but an evaluation hook.
+ """
+ output = self(**data_batch, **kwargs)
+ return output
+
+ @staticmethod
+ def _parse_losses(losses):
+ """Parse the raw outputs (losses) of the network.
+
+ Args:
+ losses (dict): Raw output of the network, which usually contain
+ losses and other necessary information.
+
+ Returns:
+ tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor
+ which may be a weighted sum of all losses, log_vars contains
+ all the variables to be sent to the logger.
+ """
+ log_vars = OrderedDict()
+ for loss_name, loss_value in losses.items():
+ if isinstance(loss_value, torch.Tensor):
+ log_vars[loss_name] = loss_value.mean()
+ elif isinstance(loss_value, list):
+ log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+ else:
+ raise TypeError(
+ f'{loss_name} is not a tensor or list of tensors')
+
+ loss = sum(_value for _key, _value in log_vars.items()
+ if 'loss' in _key)
+
+ log_vars['loss'] = loss
+ for loss_name, loss_value in log_vars.items():
+ # reduce loss when distributed training
+ if dist.is_available() and dist.is_initialized():
+ loss_value = loss_value.data.clone()
+ dist.all_reduce(loss_value.div_(dist.get_world_size()))
+ log_vars[loss_name] = loss_value.item()
+
+ return loss, log_vars
+
+ def show_result(self,
+ img,
+ result,
+ palette=None,
+ win_name='',
+ show=False,
+ wait_time=0,
+ out_file=None,
+ opacity=0.5):
+ """Draw `result` over `img`.
+
+ Args:
+ img (str or Tensor): The image to be displayed.
+ result (Tensor): The semantic segmentation results to draw over
+ `img`.
+ palette (list[list[int]] | np.ndarray | None): The palette of
+ segmentation map. If None is given, random palette will be
+ generated. Default: None
+ win_name (str): The window name.
+ wait_time (int): Value of waitKey param.
+ Default: 0.
+ show (bool): Whether to show the image.
+ Default: False.
+ out_file (str or None): The filename to write the image.
+ Default: None.
+ opacity(float): Opacity of painted segmentation map.
+ Default 0.5.
+ Must be in (0, 1] range.
+ Returns:
+ img (Tensor): The drawn image, returned only when neither `show`
+ nor `out_file` is specified.
+ """
+ img = mmcv.imread(img)
+ img = img.copy()
+ seg = result[0]
+ if palette is None:
+ if self.PALETTE is None:
+ palette = np.random.randint(
+ 0, 255, size=(len(self.CLASSES), 3))
+ else:
+ palette = self.PALETTE
+ palette = np.array(palette)
+ assert palette.shape[0] == len(self.CLASSES)
+ assert palette.shape[1] == 3
+ assert len(palette.shape) == 2
+ assert 0 < opacity <= 1.0
+ color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
+ for label, color in enumerate(palette):
+ color_seg[seg == label, :] = color
+ # convert to BGR
+ color_seg = color_seg[..., ::-1]
+
+ img = img * (1 - opacity) + color_seg * opacity
+ img = img.astype(np.uint8)
+ # if out_file specified, do not show image in window
+ if out_file is not None:
+ show = False
+
+ if show:
+ mmcv.imshow(img, win_name, wait_time)
+ if out_file is not None:
+ mmcv.imwrite(img, out_file)
+
+ if not (show or out_file):
+ warnings.warn('show==False and out_file is not specified, only '
+ 'result image will be returned')
+ return img
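`_parse_losses` is a static method, so its bookkeeping can be illustrated in isolation; the loss names below are placeholders, not values from this diff, and the import path is assumed:

```python
import torch

from annotator.mmpkg.mmseg.models.segmentors.base import BaseSegmentor  # assumed path

# Every entry whose key contains 'loss' is summed into the total; other
# entries (e.g. accuracies) are only logged. A list of tensors contributes
# the sum of its per-tensor means.
raw_losses = {
    'decode.loss_seg': torch.tensor(0.8),
    'aux.loss_seg': [torch.tensor(0.3), torch.tensor(0.1)],
    'decode.acc_seg': torch.tensor(72.5),
}
loss, log_vars = BaseSegmentor._parse_losses(raw_losses)
print(loss)      # tensor(1.2000) = 0.8 + (0.3 + 0.1)
print(log_vars)  # OrderedDict of plain floats, including the total 'loss'
```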
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/cascade_encoder_decoder.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/cascade_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..74547f0fb01da9fe32c1d142768eb788b7e8673c
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/cascade_encoder_decoder.py
@@ -0,0 +1,98 @@
+from torch import nn
+
+from annotator.mmpkg.mmseg.core import add_prefix
+from annotator.mmpkg.mmseg.ops import resize
+from .. import builder
+from ..builder import SEGMENTORS
+from .encoder_decoder import EncoderDecoder
+
+
+@SEGMENTORS.register_module()
+class CascadeEncoderDecoder(EncoderDecoder):
+ """Cascade Encoder Decoder segmentors.
+
+ CascadeEncoderDecoder is almost the same as EncoderDecoder, except that the
+ decode heads of CascadeEncoderDecoder are cascaded: the output of the
+ previous decode_head is fed as an extra input to the next decode_head.
+ """
+
+ def __init__(self,
+ num_stages,
+ backbone,
+ decode_head,
+ neck=None,
+ auxiliary_head=None,
+ train_cfg=None,
+ test_cfg=None,
+ pretrained=None):
+ self.num_stages = num_stages
+ super(CascadeEncoderDecoder, self).__init__(
+ backbone=backbone,
+ decode_head=decode_head,
+ neck=neck,
+ auxiliary_head=auxiliary_head,
+ train_cfg=train_cfg,
+ test_cfg=test_cfg,
+ pretrained=pretrained)
+
+ def _init_decode_head(self, decode_head):
+ """Initialize ``decode_head``"""
+ assert isinstance(decode_head, list)
+ assert len(decode_head) == self.num_stages
+ self.decode_head = nn.ModuleList()
+ for i in range(self.num_stages):
+ self.decode_head.append(builder.build_head(decode_head[i]))
+ self.align_corners = self.decode_head[-1].align_corners
+ self.num_classes = self.decode_head[-1].num_classes
+
+ def init_weights(self, pretrained=None):
+ """Initialize the weights in backbone and heads.
+
+ Args:
+ pretrained (str, optional): Path to pre-trained weights.
+ Defaults to None.
+ """
+ self.backbone.init_weights(pretrained=pretrained)
+ for i in range(self.num_stages):
+ self.decode_head[i].init_weights()
+ if self.with_auxiliary_head:
+ if isinstance(self.auxiliary_head, nn.ModuleList):
+ for aux_head in self.auxiliary_head:
+ aux_head.init_weights()
+ else:
+ self.auxiliary_head.init_weights()
+
+ def encode_decode(self, img, img_metas):
+ """Encode images with backbone and decode into a semantic segmentation
+ map of the same size as input."""
+ x = self.extract_feat(img)
+ out = self.decode_head[0].forward_test(x, img_metas, self.test_cfg)
+ for i in range(1, self.num_stages):
+ out = self.decode_head[i].forward_test(x, out, img_metas,
+ self.test_cfg)
+ out = resize(
+ input=out,
+ size=img.shape[2:],
+ mode='bilinear',
+ align_corners=self.align_corners)
+ return out
+
+ def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg):
+ """Run forward function and calculate loss for decode head in
+ training."""
+ losses = dict()
+
+ loss_decode = self.decode_head[0].forward_train(
+ x, img_metas, gt_semantic_seg, self.train_cfg)
+
+ losses.update(add_prefix(loss_decode, 'decode_0'))
+
+ for i in range(1, self.num_stages):
+ # forward test again, maybe unnecessary for most methods.
+ prev_outputs = self.decode_head[i - 1].forward_test(
+ x, img_metas, self.test_cfg)
+ loss_decode = self.decode_head[i].forward_train(
+ x, prev_outputs, img_metas, gt_semantic_seg, self.train_cfg)
+ losses.update(add_prefix(loss_decode, f'decode_{i}'))
+
+ return losses
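A hedged, config-style sketch of how a two-stage cascade is typically declared; the backbone and head settings below are illustrative placeholders, not values taken from this diff. The list of decode heads must have length `num_stages`, and stage `i` consumes the test-time output of stage `i - 1`:

```python
# Illustrative placeholder config (not from this diff): an OCRNet-style
# two-stage cascade where the first head's prediction feeds the second head.
cascade_cfg = dict(
    type='CascadeEncoderDecoder',
    num_stages=2,
    backbone=dict(type='ResNetV1c', depth=50),  # placeholder backbone
    decode_head=[
        dict(type='FCNHead', in_channels=1024, channels=256, num_classes=19),
        dict(type='OCRHead', in_channels=2048, channels=512, num_classes=19),
    ],
    train_cfg=dict(),
    test_cfg=dict(mode='whole'),
)
```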
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/encoder_decoder.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..30c25f35a15e65e45f9221a3f19ace8579f73301
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/segmentors/encoder_decoder.py
@@ -0,0 +1,298 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from annotator.mmpkg.mmseg.core import add_prefix
+from annotator.mmpkg.mmseg.ops import resize
+from .. import builder
+from ..builder import SEGMENTORS
+from .base import BaseSegmentor
+
+
+@SEGMENTORS.register_module()
+class EncoderDecoder(BaseSegmentor):
+ """Encoder Decoder segmentors.
+
+ EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+ Note that auxiliary_head is only used for deep supervision during training,
+ which could be dumped during inference.
+ """
+
+ def __init__(self,
+ backbone,
+ decode_head,
+ neck=None,
+ auxiliary_head=None,
+ train_cfg=None,
+ test_cfg=None,
+ pretrained=None):
+ super(EncoderDecoder, self).__init__()
+ self.backbone = builder.build_backbone(backbone)
+ if neck is not None:
+ self.neck = builder.build_neck(neck)
+ self._init_decode_head(decode_head)
+ self._init_auxiliary_head(auxiliary_head)
+
+ self.train_cfg = train_cfg
+ self.test_cfg = test_cfg
+
+ self.init_weights(pretrained=pretrained)
+
+ assert self.with_decode_head
+
+ def _init_decode_head(self, decode_head):
+ """Initialize ``decode_head``"""
+ self.decode_head = builder.build_head(decode_head)
+ self.align_corners = self.decode_head.align_corners
+ self.num_classes = self.decode_head.num_classes
+
+ def _init_auxiliary_head(self, auxiliary_head):
+ """Initialize ``auxiliary_head``"""
+ if auxiliary_head is not None:
+ if isinstance(auxiliary_head, list):
+ self.auxiliary_head = nn.ModuleList()
+ for head_cfg in auxiliary_head:
+ self.auxiliary_head.append(builder.build_head(head_cfg))
+ else:
+ self.auxiliary_head = builder.build_head(auxiliary_head)
+
+ def init_weights(self, pretrained=None):
+ """Initialize the weights in backbone and heads.
+
+ Args:
+ pretrained (str, optional): Path to pre-trained weights.
+ Defaults to None.
+ """
+
+ super(EncoderDecoder, self).init_weights(pretrained)
+ self.backbone.init_weights(pretrained=pretrained)
+ self.decode_head.init_weights()
+ if self.with_auxiliary_head:
+ if isinstance(self.auxiliary_head, nn.ModuleList):
+ for aux_head in self.auxiliary_head:
+ aux_head.init_weights()
+ else:
+ self.auxiliary_head.init_weights()
+
+ def extract_feat(self, img):
+ """Extract features from images."""
+ x = self.backbone(img)
+ if self.with_neck:
+ x = self.neck(x)
+ return x
+
+ def encode_decode(self, img, img_metas):
+ """Encode images with backbone and decode into a semantic segmentation
+ map of the same size as input."""
+ x = self.extract_feat(img)
+ out = self._decode_head_forward_test(x, img_metas)
+ out = resize(
+ input=out,
+ size=img.shape[2:],
+ mode='bilinear',
+ align_corners=self.align_corners)
+ return out
+
+ def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg):
+ """Run forward function and calculate loss for decode head in
+ training."""
+ losses = dict()
+ loss_decode = self.decode_head.forward_train(x, img_metas,
+ gt_semantic_seg,
+ self.train_cfg)
+
+ losses.update(add_prefix(loss_decode, 'decode'))
+ return losses
+
+ def _decode_head_forward_test(self, x, img_metas):
+ """Run forward function and calculate loss for decode head in
+ inference."""
+ seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg)
+ return seg_logits
+
+ def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg):
+ """Run forward function and calculate loss for auxiliary head in
+ training."""
+ losses = dict()
+ if isinstance(self.auxiliary_head, nn.ModuleList):
+ for idx, aux_head in enumerate(self.auxiliary_head):
+ loss_aux = aux_head.forward_train(x, img_metas,
+ gt_semantic_seg,
+ self.train_cfg)
+ losses.update(add_prefix(loss_aux, f'aux_{idx}'))
+ else:
+ loss_aux = self.auxiliary_head.forward_train(
+ x, img_metas, gt_semantic_seg, self.train_cfg)
+ losses.update(add_prefix(loss_aux, 'aux'))
+
+ return losses
+
+ def forward_dummy(self, img):
+ """Dummy forward function."""
+ seg_logit = self.encode_decode(img, None)
+
+ return seg_logit
+
+ def forward_train(self, img, img_metas, gt_semantic_seg):
+ """Forward function for training.
+
+ Args:
+ img (Tensor): Input images.
+ img_metas (list[dict]): List of image info dict where each dict
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+ For details on the values of these keys see
+ `mmseg/datasets/pipelines/formatting.py:Collect`.
+ gt_semantic_seg (Tensor): Semantic segmentation masks
+ used if the architecture supports semantic segmentation task.
+
+ Returns:
+ dict[str, Tensor]: a dictionary of loss components
+ """
+
+ x = self.extract_feat(img)
+
+ losses = dict()
+
+ loss_decode = self._decode_head_forward_train(x, img_metas,
+ gt_semantic_seg)
+ losses.update(loss_decode)
+
+ if self.with_auxiliary_head:
+ loss_aux = self._auxiliary_head_forward_train(
+ x, img_metas, gt_semantic_seg)
+ losses.update(loss_aux)
+
+ return losses
+
+ # TODO refactor
+ def slide_inference(self, img, img_meta, rescale):
+ """Inference by sliding-window with overlap.
+
+ If h_crop > h_img or w_crop > w_img, the small patch will be used to
+ decode without padding.
+ """
+
+ h_stride, w_stride = self.test_cfg.stride
+ h_crop, w_crop = self.test_cfg.crop_size
+ batch_size, _, h_img, w_img = img.size()
+ num_classes = self.num_classes
+ h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
+ w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
+ preds = img.new_zeros((batch_size, num_classes, h_img, w_img))
+ count_mat = img.new_zeros((batch_size, 1, h_img, w_img))
+ for h_idx in range(h_grids):
+ for w_idx in range(w_grids):
+ y1 = h_idx * h_stride
+ x1 = w_idx * w_stride
+ y2 = min(y1 + h_crop, h_img)
+ x2 = min(x1 + w_crop, w_img)
+ y1 = max(y2 - h_crop, 0)
+ x1 = max(x2 - w_crop, 0)
+ crop_img = img[:, :, y1:y2, x1:x2]
+ crop_seg_logit = self.encode_decode(crop_img, img_meta)
+ preds += F.pad(crop_seg_logit,
+ (int(x1), int(preds.shape[3] - x2), int(y1),
+ int(preds.shape[2] - y2)))
+
+ count_mat[:, :, y1:y2, x1:x2] += 1
+ assert (count_mat == 0).sum() == 0
+ if torch.onnx.is_in_onnx_export():
+ # cast count_mat to constant while exporting to ONNX
+ count_mat = torch.from_numpy(
+ count_mat.cpu().detach().numpy()).to(device=img.device)
+ preds = preds / count_mat
+ if rescale:
+ preds = resize(
+ preds,
+ size=img_meta[0]['ori_shape'][:2],
+ mode='bilinear',
+ align_corners=self.align_corners,
+ warning=False)
+ return preds
+
+ def whole_inference(self, img, img_meta, rescale):
+ """Inference with full image."""
+
+ seg_logit = self.encode_decode(img, img_meta)
+ if rescale:
+ # support dynamic shape for onnx
+ if torch.onnx.is_in_onnx_export():
+ size = img.shape[2:]
+ else:
+ size = img_meta[0]['ori_shape'][:2]
+ seg_logit = resize(
+ seg_logit,
+ size=size,
+ mode='bilinear',
+ align_corners=self.align_corners,
+ warning=False)
+
+ return seg_logit
+
+ def inference(self, img, img_meta, rescale):
+ """Inference with slide/whole style.
+
+ Args:
+ img (Tensor): The input image of shape (N, 3, H, W).
+ img_meta (dict): Image info dict where each dict has: 'img_shape',
+ 'scale_factor', 'flip', and may also contain
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+ For details on the values of these keys see
+ `mmseg/datasets/pipelines/formatting.py:Collect`.
+ rescale (bool): Whether rescale back to original shape.
+
+ Returns:
+ Tensor: The output segmentation map.
+ """
+
+ assert self.test_cfg.mode in ['slide', 'whole']
+ ori_shape = img_meta[0]['ori_shape']
+ assert all(_['ori_shape'] == ori_shape for _ in img_meta)
+ if self.test_cfg.mode == 'slide':
+ seg_logit = self.slide_inference(img, img_meta, rescale)
+ else:
+ seg_logit = self.whole_inference(img, img_meta, rescale)
+ output = F.softmax(seg_logit, dim=1)
+ flip = img_meta[0]['flip']
+ if flip:
+ flip_direction = img_meta[0]['flip_direction']
+ assert flip_direction in ['horizontal', 'vertical']
+ if flip_direction == 'horizontal':
+ output = output.flip(dims=(3, ))
+ elif flip_direction == 'vertical':
+ output = output.flip(dims=(2, ))
+
+ return output
+
+ def simple_test(self, img, img_meta, rescale=True):
+ """Simple test with single image."""
+ seg_logit = self.inference(img, img_meta, rescale)
+ seg_pred = seg_logit.argmax(dim=1)
+ if torch.onnx.is_in_onnx_export():
+ # our inference backend only support 4D output
+ seg_pred = seg_pred.unsqueeze(0)
+ return seg_pred
+ seg_pred = seg_pred.cpu().numpy()
+ # unravel batch dim
+ seg_pred = list(seg_pred)
+ return seg_pred
+
+ def aug_test(self, imgs, img_metas, rescale=True):
+ """Test with augmentations.
+
+ Only rescale=True is supported.
+ """
+ # aug_test rescale all imgs back to ori_shape for now
+ assert rescale
+ # to save memory, we get augmented seg logit inplace
+ seg_logit = self.inference(imgs[0], img_metas[0], rescale)
+ for i in range(1, len(imgs)):
+ cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale)
+ seg_logit += cur_seg_logit
+ seg_logit /= len(imgs)
+ seg_pred = seg_logit.argmax(dim=1)
+ seg_pred = seg_pred.cpu().numpy()
+ # unravel batch dim
+ seg_pred = list(seg_pred)
+ return seg_pred
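The grid arithmetic inside `slide_inference` is easy to check in isolation. The helper below is a standalone sketch (not part of the diff) that reproduces the window placement: crops are laid out every `stride` pixels, and the last window in each direction is shifted back so it never runs past the image border.

```python
def slide_windows(h_img, w_img, crop_size, stride):
    """Reproduce the (y1, y2, x1, x2) crops visited by slide_inference."""
    h_crop, w_crop = crop_size
    h_stride, w_stride = stride
    h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
    w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
    windows = []
    for h_idx in range(h_grids):
        for w_idx in range(w_grids):
            y1, x1 = h_idx * h_stride, w_idx * w_stride
            y2, x2 = min(y1 + h_crop, h_img), min(x1 + w_crop, w_img)
            y1, x1 = max(y2 - h_crop, 0), max(x2 - w_crop, 0)
            windows.append((y1, y2, x1, x2))
    return windows

# A 1024x1024 image with 512x512 crops and stride 341 gives a 3x3 grid whose
# last row/column is shifted back to end exactly at the border.
print(slide_windows(1024, 1024, (512, 512), (341, 341)))
```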
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/__init__.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d3bdd349b9f2ae499a2fcb2ac1d2e3c77befebe
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/__init__.py
@@ -0,0 +1,13 @@
+from .drop import DropPath
+from .inverted_residual import InvertedResidual, InvertedResidualV3
+from .make_divisible import make_divisible
+from .res_layer import ResLayer
+from .se_layer import SELayer
+from .self_attention_block import SelfAttentionBlock
+from .up_conv_block import UpConvBlock
+from .weight_init import trunc_normal_
+
+__all__ = [
+ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual',
+ 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_'
+]
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/drop.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/drop.py
new file mode 100644
index 0000000000000000000000000000000000000000..4520b0ff407d2a95a864086bdbca0065f222aa63
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/drop.py
@@ -0,0 +1,31 @@
+"""Modified from https://github.com/rwightman/pytorch-image-
+models/blob/master/timm/models/layers/drop.py."""
+
+import torch
+from torch import nn
+
+
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
+ residual blocks).
+
+ Args:
+ drop_prob (float): Drop rate for paths of model. Dropout rate has
+ to be between 0 and 1. Default: 0.
+ """
+
+ def __init__(self, drop_prob=0.):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+ self.keep_prob = 1 - drop_prob
+
+ def forward(self, x):
+ if self.drop_prob == 0. or not self.training:
+ return x
+ shape = (x.shape[0], ) + (1, ) * (
+ x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = self.keep_prob + torch.rand(
+ shape, dtype=x.dtype, device=x.device)
+ random_tensor.floor_() # binarize
+ output = x.div(self.keep_prob) * random_tensor
+ return output
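A short behavioral sketch of `DropPath`, assuming the vendored packages are importable (the `__init__.py` shown earlier exports it):

```python
import torch

from annotator.mmpkg.mmseg.models.utils import DropPath

drop = DropPath(drop_prob=0.2)
x = torch.ones(4, 3, 8, 8)

drop.eval()
print(torch.equal(drop(x), x))  # True: DropPath is the identity at inference

drop.train()
out = drop(x)
# In training, each sample in the batch is independently either zeroed or
# rescaled by 1 / keep_prob, so the expected output matches the input.
print(out[:, 0, 0, 0])  # e.g. tensor([1.2500, 1.2500, 0.0000, 1.2500])
```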
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/inverted_residual.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/inverted_residual.py
new file mode 100644
index 0000000000000000000000000000000000000000..2df5ebd7c94c0a66b0d05ef9e200ddbeabfa79f6
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/inverted_residual.py
@@ -0,0 +1,208 @@
+from annotator.mmpkg.mmcv.cnn import ConvModule
+from torch import nn
+from torch.utils import checkpoint as cp
+
+from .se_layer import SELayer
+
+
+class InvertedResidual(nn.Module):
+ """InvertedResidual block for MobileNetV2.
+
+ Args:
+ in_channels (int): The input channels of the InvertedResidual block.
+ out_channels (int): The output channels of the InvertedResidual block.
+ stride (int): Stride of the middle (first) 3x3 convolution.
+ expand_ratio (int): Adjusts number of channels of the hidden layer
+ in InvertedResidual by this amount.
+ dilation (int): Dilation rate of depthwise conv. Default: 1
+ conv_cfg (dict): Config dict for convolution layer.
+ Default: None, which means using conv2d.
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='BN').
+ act_cfg (dict): Config dict for activation layer.
+ Default: dict(type='ReLU6').
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+ memory while slowing down the training speed. Default: False.
+
+ Returns:
+ Tensor: The output tensor.
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ stride,
+ expand_ratio,
+ dilation=1,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='ReLU6'),
+ with_cp=False):
+ super(InvertedResidual, self).__init__()
+ self.stride = stride
+ assert stride in [1, 2], f'stride must be in [1, 2]. ' \
+ f'But received {stride}.'
+ self.with_cp = with_cp
+ self.use_res_connect = self.stride == 1 and in_channels == out_channels
+ hidden_dim = int(round(in_channels * expand_ratio))
+
+ layers = []
+ if expand_ratio != 1:
+ layers.append(
+ ConvModule(
+ in_channels=in_channels,
+ out_channels=hidden_dim,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+ layers.extend([
+ ConvModule(
+ in_channels=hidden_dim,
+ out_channels=hidden_dim,
+ kernel_size=3,
+ stride=stride,
+ padding=dilation,
+ dilation=dilation,
+ groups=hidden_dim,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg),
+ ConvModule(
+ in_channels=hidden_dim,
+ out_channels=out_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=None)
+ ])
+ self.conv = nn.Sequential(*layers)
+
+ def forward(self, x):
+
+ def _inner_forward(x):
+ if self.use_res_connect:
+ return x + self.conv(x)
+ else:
+ return self.conv(x)
+
+ if self.with_cp and x.requires_grad:
+ out = cp.checkpoint(_inner_forward, x)
+ else:
+ out = _inner_forward(x)
+
+ return out
+
+
+class InvertedResidualV3(nn.Module):
+ """Inverted Residual Block for MobileNetV3.
+
+ Args:
+ in_channels (int): The input channels of this Module.
+ out_channels (int): The output channels of this Module.
+ mid_channels (int): The input channels of the depthwise convolution.
+ kernel_size (int): The kernel size of the depthwise convolution.
+ Default: 3.
+ stride (int): The stride of the depthwise convolution. Default: 1.
+ se_cfg (dict): Config dict for se layer. Default: None, which means no
+ se layer.
+ with_expand_conv (bool): Use expand conv or not. If set False,
+ mid_channels must be the same with in_channels. Default: True.
+ conv_cfg (dict): Config dict for convolution layer. Default: None,
+ which means using conv2d.
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='BN').
+ act_cfg (dict): Config dict for activation layer.
+ Default: dict(type='ReLU').
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+ memory while slowing down the training speed. Default: False.
+
+ Returns:
+ Tensor: The output tensor.
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ mid_channels,
+ kernel_size=3,
+ stride=1,
+ se_cfg=None,
+ with_expand_conv=True,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='ReLU'),
+ with_cp=False):
+ super(InvertedResidualV3, self).__init__()
+ self.with_res_shortcut = (stride == 1 and in_channels == out_channels)
+ assert stride in [1, 2]
+ self.with_cp = with_cp
+ self.with_se = se_cfg is not None
+ self.with_expand_conv = with_expand_conv
+
+ if self.with_se:
+ assert isinstance(se_cfg, dict)
+ if not self.with_expand_conv:
+ assert mid_channels == in_channels
+
+ if self.with_expand_conv:
+ self.expand_conv = ConvModule(
+ in_channels=in_channels,
+ out_channels=mid_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ self.depthwise_conv = ConvModule(
+ in_channels=mid_channels,
+ out_channels=mid_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=kernel_size // 2,
+ groups=mid_channels,
+ conv_cfg=dict(
+ type='Conv2dAdaptivePadding') if stride == 2 else conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+
+ if self.with_se:
+ self.se = SELayer(**se_cfg)
+
+ self.linear_conv = ConvModule(
+ in_channels=mid_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=None)
+
+ def forward(self, x):
+
+ def _inner_forward(x):
+ out = x
+
+ if self.with_expand_conv:
+ out = self.expand_conv(out)
+
+ out = self.depthwise_conv(out)
+
+ if self.with_se:
+ out = self.se(out)
+
+ out = self.linear_conv(out)
+
+ if self.with_res_shortcut:
+ return x + out
+ else:
+ return out
+
+ if self.with_cp and x.requires_grad:
+ out = cp.checkpoint(_inner_forward, x)
+ else:
+ out = _inner_forward(x)
+
+ return out
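A hedged usage sketch of the MobileNetV2-style block above, assuming the vendored mmcv (and hence `ConvModule`) is available on the path:

```python
import torch

from annotator.mmpkg.mmseg.models.utils import InvertedResidual

# Optional 1x1 expand -> 3x3 depthwise -> 1x1 linear projection, with a
# residual connection because stride == 1 and in_channels == out_channels.
block = InvertedResidual(in_channels=32, out_channels=32, stride=1, expand_ratio=6)
x = torch.rand(1, 32, 56, 56)
print(block(x).shape)  # torch.Size([1, 32, 56, 56])
```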
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/make_divisible.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/make_divisible.py
new file mode 100644
index 0000000000000000000000000000000000000000..75ad756052529f52fe83bb95dd1f0ecfc9a13078
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/make_divisible.py
@@ -0,0 +1,27 @@
+def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
+ """Make divisible function.
+
+ This function rounds the channel number to the nearest value that can be
+ divisible by the divisor. It is taken from the original tf repo. It ensures
+ that all layers have a channel number that is divisible by divisor. It can
+ be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa
+
+ Args:
+ value (int): The original channel number.
+ divisor (int): The divisor to fully divide the channel number.
+ min_value (int): The minimum value of the output channel.
+ Default: None, which means the minimum value equals the divisor.
+ min_ratio (float): The minimum ratio of the rounded channel number to
+ the original channel number. Default: 0.9.
+
+ Returns:
+ int: The modified output channel number.
+ """
+
+ if min_value is None:
+ min_value = divisor
+ new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+ # Make sure that round down does not go down by more than (1-min_ratio).
+ if new_value < min_ratio * value:
+ new_value += divisor
+ return new_value
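Worked examples of the rounding rule above (divisor 8, default `min_ratio=0.9`), assuming the package is importable:

```python
from annotator.mmpkg.mmseg.models.utils import make_divisible

print(make_divisible(32, 8))  # 32 - already divisible
print(make_divisible(33, 8))  # 32 - rounded down; 32 >= 0.9 * 33, so accepted
print(make_divisible(10, 8))  # 16 - rounding down to 8 loses > 10%, so round up
```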
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/res_layer.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/res_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d41075a57356b4fd802bc4ff199e55e63678b589
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/res_layer.py
@@ -0,0 +1,94 @@
+from annotator.mmpkg.mmcv.cnn import build_conv_layer, build_norm_layer
+from torch import nn as nn
+
+
+class ResLayer(nn.Sequential):
+ """ResLayer to build ResNet style backbone.
+
+ Args:
+ block (nn.Module): block used to build ResLayer.
+ inplanes (int): inplanes of block.
+ planes (int): planes of block.
+ num_blocks (int): number of blocks.
+ stride (int): stride of the first block. Default: 1
+ avg_down (bool): Use AvgPool instead of stride conv when
+ downsampling in the bottleneck. Default: False
+ conv_cfg (dict): dictionary to construct and config conv layer.
+ Default: None
+ norm_cfg (dict): dictionary to construct and config norm layer.
+ Default: dict(type='BN')
+ multi_grid (int | None): Multi grid dilation rates of last
+ stage. Default: None
+ contract_dilation (bool): Whether to contract the first dilation of each
+ layer. Default: False
+ """
+
+ def __init__(self,
+ block,
+ inplanes,
+ planes,
+ num_blocks,
+ stride=1,
+ dilation=1,
+ avg_down=False,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ multi_grid=None,
+ contract_dilation=False,
+ **kwargs):
+ self.block = block
+
+ downsample = None
+ if stride != 1 or inplanes != planes * block.expansion:
+ downsample = []
+ conv_stride = stride
+ if avg_down:
+ conv_stride = 1
+ downsample.append(
+ nn.AvgPool2d(
+ kernel_size=stride,
+ stride=stride,
+ ceil_mode=True,
+ count_include_pad=False))
+ downsample.extend([
+ build_conv_layer(
+ conv_cfg,
+ inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=conv_stride,
+ bias=False),
+ build_norm_layer(norm_cfg, planes * block.expansion)[1]
+ ])
+ downsample = nn.Sequential(*downsample)
+
+ layers = []
+ if multi_grid is None:
+ if dilation > 1 and contract_dilation:
+ first_dilation = dilation // 2
+ else:
+ first_dilation = dilation
+ else:
+ first_dilation = multi_grid[0]
+ layers.append(
+ block(
+ inplanes=inplanes,
+ planes=planes,
+ stride=stride,
+ dilation=first_dilation,
+ downsample=downsample,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ **kwargs))
+ inplanes = planes * block.expansion
+ for i in range(1, num_blocks):
+ layers.append(
+ block(
+ inplanes=inplanes,
+ planes=planes,
+ stride=1,
+ dilation=dilation if multi_grid is None else multi_grid[i],
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ **kwargs))
+ super(ResLayer, self).__init__(*layers)
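A hedged usage sketch of `ResLayer`; the `BasicBlock` import path is an assumption, and any residual block exposing `expansion` and accepting the standard `(inplanes, planes, stride, dilation, downsample, conv_cfg, norm_cfg)` keywords would work here:

```python
import torch

from annotator.mmpkg.mmseg.models.utils import ResLayer
from annotator.mmpkg.mmseg.models.backbones.resnet import BasicBlock  # assumed path

# A ResNet stage of two BasicBlocks; the first block downsamples by stride 2,
# so a 1x1 conv + norm shortcut is built automatically.
layer = ResLayer(BasicBlock, inplanes=64, planes=128, num_blocks=2, stride=2)
x = torch.rand(1, 64, 56, 56)
print(layer(x).shape)  # torch.Size([1, 128, 28, 28])
```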
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/se_layer.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/se_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..42ab005e1fe2211e9ecb651d31de128cf95cfec7
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/se_layer.py
@@ -0,0 +1,57 @@
+import annotator.mmpkg.mmcv as mmcv
+import torch.nn as nn
+from annotator.mmpkg.mmcv.cnn import ConvModule
+
+from .make_divisible import make_divisible
+
+
+class SELayer(nn.Module):
+ """Squeeze-and-Excitation Module.
+
+ Args:
+ channels (int): The input (and output) channels of the SE layer.
+ ratio (int): Squeeze ratio in SELayer, the intermediate channel will be
+ ``int(channels/ratio)``. Default: 16.
+ conv_cfg (None or dict): Config dict for convolution layer.
+ Default: None, which means using conv2d.
+ act_cfg (dict or Sequence[dict]): Config dict for activation layer.
+ If act_cfg is a dict, two activation layers will be configured
+ by this dict. If act_cfg is a sequence of dicts, the first
+ activation layer will be configured by the first dict and the
+ second activation layer will be configured by the second dict.
+ Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0,
+ divisor=6.0)).
+ """
+
+ def __init__(self,
+ channels,
+ ratio=16,
+ conv_cfg=None,
+ act_cfg=(dict(type='ReLU'),
+ dict(type='HSigmoid', bias=3.0, divisor=6.0))):
+ super(SELayer, self).__init__()
+ if isinstance(act_cfg, dict):
+ act_cfg = (act_cfg, act_cfg)
+ assert len(act_cfg) == 2
+ assert mmcv.is_tuple_of(act_cfg, dict)
+ self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+ self.conv1 = ConvModule(
+ in_channels=channels,
+ out_channels=make_divisible(channels // ratio, 8),
+ kernel_size=1,
+ stride=1,
+ conv_cfg=conv_cfg,
+ act_cfg=act_cfg[0])
+ self.conv2 = ConvModule(
+ in_channels=make_divisible(channels // ratio, 8),
+ out_channels=channels,
+ kernel_size=1,
+ stride=1,
+ conv_cfg=conv_cfg,
+ act_cfg=act_cfg[1])
+
+ def forward(self, x):
+ out = self.global_avgpool(x)
+ out = self.conv1(out)
+ out = self.conv2(out)
+ return x * out
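A brief usage sketch of the SE layer above, assuming the vendored mmcv is importable:

```python
import torch

from annotator.mmpkg.mmseg.models.utils import SELayer

# Global average pool -> 1x1 reduce (ReLU) -> 1x1 expand (HSigmoid); the
# resulting per-channel gates rescale the input.
se = SELayer(channels=64, ratio=16)
x = torch.rand(2, 64, 32, 32)
print(se(x).shape)  # torch.Size([2, 64, 32, 32]) - same shape, gated channels
```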
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/self_attention_block.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/self_attention_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..a342e2b29ad53916c98d0342bde8f0f6cb10197a
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/self_attention_block.py
@@ -0,0 +1,159 @@
+import torch
+from annotator.mmpkg.mmcv.cnn import ConvModule, constant_init
+from torch import nn as nn
+from torch.nn import functional as F
+
+
+class SelfAttentionBlock(nn.Module):
+ """General self-attention block/non-local block.
+
+ Please refer to https://arxiv.org/abs/1706.03762 for details about key,
+ query and value.
+
+ Args:
+ key_in_channels (int): Input channels of key feature.
+ query_in_channels (int): Input channels of query feature.
+ channels (int): Output channels of key/query transform.
+ out_channels (int): Output channels.
+ share_key_query (bool): Whether share projection weight between key
+ and query projection.
+ query_downsample (nn.Module): Query downsample module.
+ key_downsample (nn.Module): Key downsample module.
+ key_query_num_convs (int): Number of convs for key/query projection.
+ value_num_convs (int): Number of convs for value projection.
+ matmul_norm (bool): Whether normalize attention map with sqrt of
+ channels
+ with_out (bool): Whether use out projection.
+ conv_cfg (dict|None): Config of conv layers.
+ norm_cfg (dict|None): Config of norm layers.
+ act_cfg (dict|None): Config of activation layers.
+ """
+
+ def __init__(self, key_in_channels, query_in_channels, channels,
+ out_channels, share_key_query, query_downsample,
+ key_downsample, key_query_num_convs, value_out_num_convs,
+ key_query_norm, value_out_norm, matmul_norm, with_out,
+ conv_cfg, norm_cfg, act_cfg):
+ super(SelfAttentionBlock, self).__init__()
+ if share_key_query:
+ assert key_in_channels == query_in_channels
+ self.key_in_channels = key_in_channels
+ self.query_in_channels = query_in_channels
+ self.out_channels = out_channels
+ self.channels = channels
+ self.share_key_query = share_key_query
+ self.conv_cfg = conv_cfg
+ self.norm_cfg = norm_cfg
+ self.act_cfg = act_cfg
+ self.key_project = self.build_project(
+ key_in_channels,
+ channels,
+ num_convs=key_query_num_convs,
+ use_conv_module=key_query_norm,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ if share_key_query:
+ self.query_project = self.key_project
+ else:
+ self.query_project = self.build_project(
+ query_in_channels,
+ channels,
+ num_convs=key_query_num_convs,
+ use_conv_module=key_query_norm,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ self.value_project = self.build_project(
+ key_in_channels,
+ channels if with_out else out_channels,
+ num_convs=value_out_num_convs,
+ use_conv_module=value_out_norm,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ if with_out:
+ self.out_project = self.build_project(
+ channels,
+ out_channels,
+ num_convs=value_out_num_convs,
+ use_conv_module=value_out_norm,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ else:
+ self.out_project = None
+
+ self.query_downsample = query_downsample
+ self.key_downsample = key_downsample
+ self.matmul_norm = matmul_norm
+
+ self.init_weights()
+
+ def init_weights(self):
+ """Initialize weight of later layer."""
+ if self.out_project is not None:
+ if not isinstance(self.out_project, ConvModule):
+ constant_init(self.out_project, 0)
+
+ def build_project(self, in_channels, channels, num_convs, use_conv_module,
+ conv_cfg, norm_cfg, act_cfg):
+ """Build projection layer for key/query/value/out."""
+ if use_conv_module:
+ convs = [
+ ConvModule(
+ in_channels,
+ channels,
+ 1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ ]
+ for _ in range(num_convs - 1):
+ convs.append(
+ ConvModule(
+ channels,
+ channels,
+ 1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+ else:
+ convs = [nn.Conv2d(in_channels, channels, 1)]
+ for _ in range(num_convs - 1):
+ convs.append(nn.Conv2d(channels, channels, 1))
+ if len(convs) > 1:
+ convs = nn.Sequential(*convs)
+ else:
+ convs = convs[0]
+ return convs
+
+ def forward(self, query_feats, key_feats):
+ """Forward function."""
+ batch_size = query_feats.size(0)
+ query = self.query_project(query_feats)
+ if self.query_downsample is not None:
+ query = self.query_downsample(query)
+ query = query.reshape(*query.shape[:2], -1)
+ query = query.permute(0, 2, 1).contiguous()
+
+ key = self.key_project(key_feats)
+ value = self.value_project(key_feats)
+ if self.key_downsample is not None:
+ key = self.key_downsample(key)
+ value = self.key_downsample(value)
+ key = key.reshape(*key.shape[:2], -1)
+ value = value.reshape(*value.shape[:2], -1)
+ value = value.permute(0, 2, 1).contiguous()
+
+ sim_map = torch.matmul(query, key)
+ if self.matmul_norm:
+ sim_map = (self.channels**-.5) * sim_map
+ sim_map = F.softmax(sim_map, dim=-1)
+
+ context = torch.matmul(sim_map, value)
+ context = context.permute(0, 2, 1).contiguous()
+ context = context.reshape(batch_size, -1, *query_feats.shape[2:])
+ if self.out_project is not None:
+ context = self.out_project(context)
+ return context
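
The forward pass above is plain scaled dot-product attention over flattened spatial positions. The sketch below reproduces that computation with ad-hoc torch tensors and 1x1 convolutions; the names and sizes are illustrative only and are not part of this module.

    import torch
    import torch.nn.functional as F

    batch, channels, h, w = 2, 16, 8, 8
    query_feats = torch.randn(batch, channels, h, w)
    key_feats = torch.randn(batch, channels, h, w)

    # 1x1 convolutions stand in for the key/query/value projections
    q_proj = torch.nn.Conv2d(channels, channels, 1)
    k_proj = torch.nn.Conv2d(channels, channels, 1)
    v_proj = torch.nn.Conv2d(channels, channels, 1)

    query = q_proj(query_feats).reshape(batch, channels, -1).permute(0, 2, 1)  # (B, HW, C)
    key = k_proj(key_feats).reshape(batch, channels, -1)                       # (B, C, HW)
    value = v_proj(key_feats).reshape(batch, channels, -1).permute(0, 2, 1)    # (B, HW, C)

    sim = torch.matmul(query, key) * channels ** -0.5   # matmul_norm=True scaling
    attn = F.softmax(sim, dim=-1)                       # attention over key positions
    context = torch.matmul(attn, value)                 # (B, HW, C)
    context = context.permute(0, 2, 1).reshape(batch, channels, h, w)
    print(context.shape)  # torch.Size([2, 16, 8, 8])
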
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/up_conv_block.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/up_conv_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..86328011a9704d17e9f9d0d54994719ead5caa56
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/up_conv_block.py
@@ -0,0 +1,101 @@
+import torch
+import torch.nn as nn
+from annotator.mmpkg.mmcv.cnn import ConvModule, build_upsample_layer
+
+
+class UpConvBlock(nn.Module):
+ """Upsample convolution block in decoder for UNet.
+
+ This upsample convolution block consists of one upsample module
+ followed by one convolution block. The upsample module expands the
+ high-level low-resolution feature map, and the convolution block fuses
+ it with the low-level high-resolution feature map from the encoder.
+
+ Args:
+ conv_block (nn.Sequential): Sequential of convolutional layers.
+ in_channels (int): Number of input channels of the high-level
+ low-resolution feature map from decoder.
+ skip_channels (int): Number of input channels of the low-level
+ high-resolution feature map from encoder.
+ out_channels (int): Number of output channels.
+ num_convs (int): Number of convolutional layers in the conv_block.
+ Default: 2.
+ stride (int): Stride of convolutional layer in conv_block. Default: 1.
+ dilation (int): Dilation rate of convolutional layer in conv_block.
+ Default: 1.
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+ memory while slowing down the training speed. Default: False.
+ conv_cfg (dict | None): Config dict for convolution layer.
+ Default: None.
+ norm_cfg (dict | None): Config dict for normalization layer.
+ Default: dict(type='BN').
+ act_cfg (dict | None): Config dict for activation layer in ConvModule.
+ Default: dict(type='ReLU').
+ upsample_cfg (dict): The upsample config of the upsample module in
+ decoder. Default: dict(type='InterpConv'). If the size of
+ high-level feature map is the same as that of skip feature map
+ (low-level feature map from encoder), the high-level feature map does
+ not need to be upsampled and upsample_cfg should be None.
+ dcn (bool): Use deformable convolution in convolutional layer or not.
+ Default: None.
+ plugins (dict): plugins for convolutional layers. Default: None.
+ """
+
+ def __init__(self,
+ conv_block,
+ in_channels,
+ skip_channels,
+ out_channels,
+ num_convs=2,
+ stride=1,
+ dilation=1,
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='ReLU'),
+ upsample_cfg=dict(type='InterpConv'),
+ dcn=None,
+ plugins=None):
+ super(UpConvBlock, self).__init__()
+ assert dcn is None, 'Not implemented yet.'
+ assert plugins is None, 'Not implemented yet.'
+
+ self.conv_block = conv_block(
+ in_channels=2 * skip_channels,
+ out_channels=out_channels,
+ num_convs=num_convs,
+ stride=stride,
+ dilation=dilation,
+ with_cp=with_cp,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg,
+ dcn=None,
+ plugins=None)
+ if upsample_cfg is not None:
+ self.upsample = build_upsample_layer(
+ cfg=upsample_cfg,
+ in_channels=in_channels,
+ out_channels=skip_channels,
+ with_cp=with_cp,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ else:
+ self.upsample = ConvModule(
+ in_channels,
+ skip_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+
+ def forward(self, skip, x):
+ """Forward function."""
+
+ x = self.upsample(x)
+ out = torch.cat([skip, x], dim=1)
+ out = self.conv_block(out)
+
+ return out
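
As the docstring describes, the block upsamples the decoder feature to the skip resolution, concatenates the encoder skip, and fuses the result with a conv block. A minimal standalone sketch of that data flow; TinyConvBlock and the plain nn.Upsample path below are stand-ins for illustration, not the mmcv modules this file actually builds:

    import torch
    import torch.nn as nn

    class TinyConvBlock(nn.Module):
        # stand-in for the conv_block passed to UpConvBlock
        def __init__(self, in_channels, out_channels):
            super().__init__()
            self.conv = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 3, padding=1),
                nn.ReLU(inplace=True))

        def forward(self, x):
            return self.conv(x)

    in_channels, skip_channels, out_channels = 128, 64, 64
    x = torch.randn(1, in_channels, 16, 16)       # high-level, low-resolution
    skip = torch.randn(1, skip_channels, 32, 32)  # encoder skip, high-resolution

    upsample = nn.Sequential(                     # stands in for InterpConv
        nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
        nn.Conv2d(in_channels, skip_channels, 1))
    fuse = TinyConvBlock(2 * skip_channels, out_channels)

    out = fuse(torch.cat([skip, upsample(x)], dim=1))
    print(out.shape)  # torch.Size([1, 64, 32, 32])
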
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/weight_init.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/weight_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..38141ba3d61f64ddfc0a31574b4648cbad96d7dd
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/models/utils/weight_init.py
@@ -0,0 +1,62 @@
+"""Modified from https://github.com/rwightman/pytorch-image-
+models/blob/master/timm/models/layers/drop.py."""
+
+import math
+import warnings
+
+import torch
+
+
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+ """Reference: https://people.sc.fsu.edu/~jburkardt/presentations
+ /truncated_normal.pdf"""
+
+ def norm_cdf(x):
+ # Computes standard normal cumulative distribution function
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
+ warnings.warn(
+ 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
+ 'The distribution of values may be incorrect.',
+ stacklevel=2)
+
+ with torch.no_grad():
+ # Values are generated by using a truncated uniform distribution and
+ # then using the inverse CDF for the normal distribution.
+ # Get upper and lower cdf values
+ lower_bound = norm_cdf((a - mean) / std)
+ upper_bound = norm_cdf((b - mean) / std)
+
+ # Uniformly fill tensor with values from [l, u], then translate to
+ # [2l-1, 2u-1].
+ tensor.uniform_(2 * lower_bound - 1, 2 * upper_bound - 1)
+
+ # Use inverse cdf transform for normal distribution to get truncated
+ # standard normal
+ tensor.erfinv_()
+
+ # Transform to proper mean, std
+ tensor.mul_(std * math.sqrt(2.))
+ tensor.add_(mean)
+
+ # Clamp to ensure it's in the proper range
+ tensor.clamp_(min=a, max=b)
+ return tensor
+
+
+def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+ r"""Fills the input Tensor with values drawn from a truncated
+ normal distribution. The values are effectively drawn from the
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+ with values outside :math:`[a, b]` redrawn until they are within
+ the bounds. The method used for generating the random values works
+ best when :math:`a \leq \text{mean} \leq b`.
+ Args:
+ tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`
+ mean (float): the mean of the normal distribution
+ std (float): the standard deviation of the normal distribution
+ a (float): the minimum cutoff value
+ b (float): the maximum cutoff value
+ """
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
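
A quick sanity check of the sampler above (a sketch; `trunc_normal_` is the function defined in this file): samples stay inside [a, b], and the empirical std comes out a bit below the requested value because of the truncation.

    import torch

    w = torch.empty(256, 256)
    trunc_normal_(w, mean=0., std=0.02, a=-0.04, b=0.04)
    assert float(w.min()) >= -0.04 and float(w.max()) <= 0.04
    print(w.mean().item(), w.std().item())  # ~0.0 and a bit below 0.02
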
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/__init__.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bec51c75b9363a9a19e9fb5c35f4e7dbd6f7751c
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/__init__.py
@@ -0,0 +1,4 @@
+from .encoding import Encoding
+from .wrappers import Upsample, resize
+
+__all__ = ['Upsample', 'resize', 'Encoding']
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/encoding.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eb3629a6426550b8e4c537ee1ff4341893e489e
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/encoding.py
@@ -0,0 +1,74 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+
+class Encoding(nn.Module):
+ """Encoding Layer: a learnable residual encoder.
+
+ Input is of shape (batch_size, channels, height, width).
+ Output is of shape (batch_size, num_codes, channels).
+
+ Args:
+ channels: dimension of the features or feature channels
+ num_codes: number of code words
+ """
+
+ def __init__(self, channels, num_codes):
+ super(Encoding, self).__init__()
+ # init codewords and smoothing factor
+ self.channels, self.num_codes = channels, num_codes
+ std = 1. / ((num_codes * channels)**0.5)
+ # [num_codes, channels]
+ self.codewords = nn.Parameter(
+ torch.empty(num_codes, channels,
+ dtype=torch.float).uniform_(-std, std),
+ requires_grad=True)
+ # [num_codes]
+ self.scale = nn.Parameter(
+ torch.empty(num_codes, dtype=torch.float).uniform_(-1, 0),
+ requires_grad=True)
+
+ @staticmethod
+ def scaled_l2(x, codewords, scale):
+ num_codes, channels = codewords.size()
+ batch_size = x.size(0)
+ reshaped_scale = scale.view((1, 1, num_codes))
+ expanded_x = x.unsqueeze(2).expand(
+ (batch_size, x.size(1), num_codes, channels))
+ reshaped_codewords = codewords.view((1, 1, num_codes, channels))
+
+ scaled_l2_norm = reshaped_scale * (
+ expanded_x - reshaped_codewords).pow(2).sum(dim=3)
+ return scaled_l2_norm
+
+ @staticmethod
+ def aggregate(assignment_weights, x, codewords):
+ num_codes, channels = codewords.size()
+ reshaped_codewords = codewords.view((1, 1, num_codes, channels))
+ batch_size = x.size(0)
+
+ expanded_x = x.unsqueeze(2).expand(
+ (batch_size, x.size(1), num_codes, channels))
+ encoded_feat = (assignment_weights.unsqueeze(3) *
+ (expanded_x - reshaped_codewords)).sum(dim=1)
+ return encoded_feat
+
+ def forward(self, x):
+ assert x.dim() == 4 and x.size(1) == self.channels
+ # [batch_size, channels, height, width]
+ batch_size = x.size(0)
+ # [batch_size, height x width, channels]
+ x = x.view(batch_size, self.channels, -1).transpose(1, 2).contiguous()
+ # assignment_weights: [batch_size, height x width, num_codes]
+ assignment_weights = F.softmax(
+ self.scaled_l2(x, self.codewords, self.scale), dim=2)
+ # aggregate
+ encoded_feat = self.aggregate(assignment_weights, x, self.codewords)
+ return encoded_feat
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(Nx{self.channels}xHxW =>Nx{self.num_codes}' \
+ f'x{self.channels})'
+ return repr_str
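
Shape sketch for the layer above with toy sizes (`Encoding` is the class defined in this file): each of the H*W feature vectors is soft-assigned to the codewords and the weighted residuals are summed per codeword.

    import torch

    layer = Encoding(channels=32, num_codes=8)
    feats = torch.randn(2, 32, 16, 16)   # (batch, channels, H, W)
    encoded = layer(feats)
    print(encoded.shape)                 # torch.Size([2, 8, 32]) = (batch, num_codes, channels)
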
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/wrappers.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ed9a0cb8d7c0e0ec2748dd89c652756653cac78
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/ops/wrappers.py
@@ -0,0 +1,50 @@
+import warnings
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def resize(input,
+ size=None,
+ scale_factor=None,
+ mode='nearest',
+ align_corners=None,
+ warning=True):
+ if warning:
+ if size is not None and align_corners:
+ input_h, input_w = tuple(int(x) for x in input.shape[2:])
+ output_h, output_w = tuple(int(x) for x in size)
+ if output_h > input_h or output_w > input_w:
+ if ((output_h > 1 and output_w > 1 and input_h > 1
+ and input_w > 1) and (output_h - 1) % (input_h - 1)
+ and (output_w - 1) % (input_w - 1)):
+ warnings.warn(
+ f'When align_corners={align_corners}, '
+ 'the output would be better aligned if '
+ f'input size {(input_h, input_w)} is `x+1` and '
+ f'out size {(output_h, output_w)} is `nx+1`')
+ return F.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+class Upsample(nn.Module):
+
+ def __init__(self,
+ size=None,
+ scale_factor=None,
+ mode='nearest',
+ align_corners=None):
+ super(Upsample, self).__init__()
+ self.size = size
+ if isinstance(scale_factor, tuple):
+ self.scale_factor = tuple(float(factor) for factor in scale_factor)
+ else:
+ self.scale_factor = float(scale_factor) if scale_factor else None
+ self.mode = mode
+ self.align_corners = align_corners
+
+ def forward(self, x):
+ if not self.size:
+ size = [int(t * self.scale_factor) for t in x.shape[-2:]]
+ else:
+ size = self.size
+ return resize(x, size, None, self.mode, self.align_corners)
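
Usage sketch for the wrappers above (`resize` is re-exported from annotator.mmpkg.mmseg.ops). With align_corners=True the warning stays quiet only when the output size has the form n*(input-1)+1, e.g. 63 for a 32-pixel input:

    import torch
    from annotator.mmpkg.mmseg.ops import resize

    x = torch.randn(1, 3, 32, 32)
    y = resize(x, size=(63, 63), mode='bilinear', align_corners=True)   # 63 = 2*(32-1)+1
    z = resize(x, size=(64, 64), mode='bilinear', align_corners=False)
    print(y.shape, z.shape)
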
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/__init__.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac489e2dbbc0e6fa87f5088b4edcc20f8cadc1a6
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/__init__.py
@@ -0,0 +1,4 @@
+from .collect_env import collect_env
+from .logger import get_root_logger
+
+__all__ = ['get_root_logger', 'collect_env']
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/collect_env.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/collect_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..015d5a6b4f3ff31859cca36584879f646b3864d4
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/collect_env.py
@@ -0,0 +1,17 @@
+from annotator.mmpkg.mmcv.utils import collect_env as collect_base_env
+from annotator.mmpkg.mmcv.utils import get_git_hash
+
+import annotator.mmpkg.mmseg as mmseg
+
+
+def collect_env():
+ """Collect the information of the running environments."""
+ env_info = collect_base_env()
+ env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}'
+
+ return env_info
+
+
+if __name__ == '__main__':
+ for name, val in collect_env().items():
+ print('{}: {}'.format(name, val))
diff --git a/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/logger.py b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c37733358e3e21479b41f54220bfe34b482009c
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/mmpkg/mmseg/utils/logger.py
@@ -0,0 +1,27 @@
+import logging
+
+from annotator.mmpkg.mmcv.utils import get_logger
+
+
+def get_root_logger(log_file=None, log_level=logging.INFO):
+ """Get the root logger.
+
+ The logger will be initialized if it has not been initialized. By default a
+ StreamHandler will be added. If `log_file` is specified, a FileHandler will
+ also be added. The name of the root logger is the top-level package name,
+ e.g., "mmseg".
+
+ Args:
+ log_file (str | None): The log filename. If specified, a FileHandler
+ will be added to the root logger.
+ log_level (int): The root logger level. Note that only the process of
+ rank 0 is affected, while other processes will set the level to
+ "Error" and be silent most of the time.
+
+ Returns:
+ logging.Logger: The root logger.
+ """
+
+ logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level)
+
+ return logger
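
Usage sketch, assuming the vendored package path used throughout this extension:

    import logging
    from annotator.mmpkg.mmseg.utils import get_root_logger

    logger = get_root_logger(log_file='work_dirs/run.log', log_level=logging.INFO)
    logger.info('training started')
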
diff --git a/extensions/sd-webui-controlnet/annotator/openpose/__init__.py b/extensions/sd-webui-controlnet/annotator/openpose/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ed4b41dcdab8a0a7fde2f58314e84a8aca25f53
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/openpose/__init__.py
@@ -0,0 +1,62 @@
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
+
+import torch
+import numpy as np
+from . import util
+from .body import Body
+from .hand import Hand
+from modules.paths import models_path
+
+body_estimation = None
+hand_estimation = None
+
+body_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth"
+hand_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth"
+modeldir = os.path.join(models_path, "openpose")
+old_modeldir = os.path.dirname(os.path.realpath(__file__))
+
+def unload_openpose_model():
+ global body_estimation, hand_estimation
+ if body_estimation is not None:
+ body_estimation.model.cpu()
+ hand_estimation.model.cpu()
+
+def apply_openpose(oriImg, hand=False):
+ global body_estimation, hand_estimation
+ if body_estimation is None:
+ body_modelpath = os.path.join(modeldir, "body_pose_model.pth")
+ hand_modelpath = os.path.join(modeldir, "hand_pose_model.pth")
+ old_body_modelpath = os.path.join(old_modeldir, "body_pose_model.pth")
+ old_hand_modelpath = os.path.join(old_modeldir, "hand_pose_model.pth")
+
+ if os.path.exists(old_body_modelpath):
+ body_modelpath = old_body_modelpath
+ elif not os.path.exists(body_modelpath):
+ from basicsr.utils.download_util import load_file_from_url
+ load_file_from_url(body_model_path, model_dir=modeldir)
+
+ if os.path.exists(old_hand_modelpath):
+ hand_modelpath = old_hand_modelpath
+ elif not os.path.exists(hand_modelpath):
+ from basicsr.utils.download_util import load_file_from_url
+ load_file_from_url(hand_model_path, model_dir=modeldir)
+
+ body_estimation = Body(body_modelpath)
+ hand_estimation = Hand(hand_modelpath)
+
+ oriImg = oriImg[:, :, ::-1].copy()
+ with torch.no_grad():
+ candidate, subset = body_estimation(oriImg)
+ canvas = np.zeros_like(oriImg)
+ canvas = util.draw_bodypose(canvas, candidate, subset)
+ if hand:
+ hands_list = util.handDetect(candidate, subset, oriImg)
+ all_hand_peaks = []
+ for x, y, w, is_left in hands_list:
+ peaks = hand_estimation(oriImg[y:y+w, x:x+w, :])
+ peaks[:, 0] = np.where(peaks[:, 0] == 0, peaks[:, 0], peaks[:, 0] + x)
+ peaks[:, 1] = np.where(peaks[:, 1] == 0, peaks[:, 1], peaks[:, 1] + y)
+ all_hand_peaks.append(peaks)
+ canvas = util.draw_handpose(canvas, all_hand_peaks)
+ return canvas, dict(candidate=candidate.tolist(), subset=subset.tolist())
diff --git a/extensions/sd-webui-controlnet/annotator/openpose/body.py b/extensions/sd-webui-controlnet/annotator/openpose/body.py
new file mode 100644
index 0000000000000000000000000000000000000000..f85ee675e93b14b56d0ad8ef6c648707b6d35637
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/openpose/body.py
@@ -0,0 +1,221 @@
+import os
+import cv2
+import numpy as np
+import math
+import time
+from scipy.ndimage.filters import gaussian_filter
+import matplotlib.pyplot as plt
+import matplotlib
+import torch
+from torchvision import transforms
+
+from . import util
+from .model import bodypose_model
+from modules import devices
+from modules.paths import models_path
+
+class Body(object):
+ def __init__(self, model_path):
+ self.model = bodypose_model()
+ self.model = self.model.to(devices.get_device_for("controlnet"))
+ model_dict = util.transfer(self.model, torch.load(model_path))
+ self.model.load_state_dict(model_dict)
+ self.model.eval()
+
+ def __call__(self, oriImg):
+ self.model = self.model.to(devices.get_device_for("controlnet"))
+
+ # scale_search = [0.5, 1.0, 1.5, 2.0]
+ scale_search = [0.5]
+ boxsize = 368
+ stride = 8
+ padValue = 128
+ thre1 = 0.1
+ thre2 = 0.05
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
+ paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
+
+ for m in range(len(multiplier)):
+ scale = multiplier[m]
+ imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
+ im = np.ascontiguousarray(im)
+
+ data = torch.from_numpy(im).float()
+ data = data.to(devices.get_device_for("controlnet"))
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
+ with torch.no_grad():
+ Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
+ Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
+ Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
+
+ # extract outputs, resize, and remove padding
+ # heatmap = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[1]].data), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
+
+ # paf = np.transpose(np.squeeze(net.blobs[output_blobs.keys()[0]].data), (1, 2, 0)) # output 0 is PAFs
+ paf = np.transpose(np.squeeze(Mconv7_stage6_L1), (1, 2, 0)) # output 0 is PAFs
+ paf = cv2.resize(paf, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
+ paf = paf[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ paf = cv2.resize(paf, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
+
+ heatmap_avg += heatmap / len(multiplier)
+ paf_avg += paf / len(multiplier)
+
+ all_peaks = []
+ peak_counter = 0
+
+ for part in range(18):
+ map_ori = heatmap_avg[:, :, part]
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
+
+ map_left = np.zeros(one_heatmap.shape)
+ map_left[1:, :] = one_heatmap[:-1, :]
+ map_right = np.zeros(one_heatmap.shape)
+ map_right[:-1, :] = one_heatmap[1:, :]
+ map_up = np.zeros(one_heatmap.shape)
+ map_up[:, 1:] = one_heatmap[:, :-1]
+ map_down = np.zeros(one_heatmap.shape)
+ map_down[:, :-1] = one_heatmap[:, 1:]
+
+ peaks_binary = np.logical_and.reduce(
+ (one_heatmap >= map_left, one_heatmap >= map_right, one_heatmap >= map_up, one_heatmap >= map_down, one_heatmap > thre1))
+ peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0])) # note reverse
+ peaks_with_score = [x + (map_ori[x[1], x[0]],) for x in peaks]
+ peak_id = range(peak_counter, peak_counter + len(peaks))
+ peaks_with_score_and_id = [peaks_with_score[i] + (peak_id[i],) for i in range(len(peak_id))]
+
+ all_peaks.append(peaks_with_score_and_id)
+ peak_counter += len(peaks)
+
+ # find connection in the specified sequence, center 29 is in the position 15
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
+ [1, 16], [16, 18], [3, 17], [6, 18]]
+ # the middle joints heatmap correpondence
+ mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], \
+ [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], \
+ [55, 56], [37, 38], [45, 46]]
+
+ connection_all = []
+ special_k = []
+ mid_num = 10
+
+ for k in range(len(mapIdx)):
+ score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
+ candA = all_peaks[limbSeq[k][0] - 1]
+ candB = all_peaks[limbSeq[k][1] - 1]
+ nA = len(candA)
+ nB = len(candB)
+ indexA, indexB = limbSeq[k]
+ if (nA != 0 and nB != 0):
+ connection_candidate = []
+ for i in range(nA):
+ for j in range(nB):
+ vec = np.subtract(candB[j][:2], candA[i][:2])
+ norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
+ norm = max(0.001, norm)
+ vec = np.divide(vec, norm)
+
+ startend = list(zip(np.linspace(candA[i][0], candB[j][0], num=mid_num), \
+ np.linspace(candA[i][1], candB[j][1], num=mid_num)))
+
+ vec_x = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0] \
+ for I in range(len(startend))])
+ vec_y = np.array([score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1] \
+ for I in range(len(startend))])
+
+ score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
+ score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
+ 0.5 * oriImg.shape[0] / norm - 1, 0)
+ criterion1 = len(np.nonzero(score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
+ criterion2 = score_with_dist_prior > 0
+ if criterion1 and criterion2:
+ connection_candidate.append(
+ [i, j, score_with_dist_prior, score_with_dist_prior + candA[i][2] + candB[j][2]])
+
+ connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
+ connection = np.zeros((0, 5))
+ for c in range(len(connection_candidate)):
+ i, j, s = connection_candidate[c][0:3]
+ if (i not in connection[:, 3] and j not in connection[:, 4]):
+ connection = np.vstack([connection, [candA[i][3], candB[j][3], s, i, j]])
+ if (len(connection) >= min(nA, nB)):
+ break
+
+ connection_all.append(connection)
+ else:
+ special_k.append(k)
+ connection_all.append([])
+
+ # last number in each row is the total parts number of that person
+ # the second last number in each row is the score of the overall configuration
+ subset = -1 * np.ones((0, 20))
+ candidate = np.array([item for sublist in all_peaks for item in sublist])
+
+ for k in range(len(mapIdx)):
+ if k not in special_k:
+ partAs = connection_all[k][:, 0]
+ partBs = connection_all[k][:, 1]
+ indexA, indexB = np.array(limbSeq[k]) - 1
+
+ for i in range(len(connection_all[k])): # = 1:size(temp,1)
+ found = 0
+ subset_idx = [-1, -1]
+ for j in range(len(subset)): # 1:size(subset,1):
+ if subset[j][indexA] == partAs[i] or subset[j][indexB] == partBs[i]:
+ subset_idx[found] = j
+ found += 1
+
+ if found == 1:
+ j = subset_idx[0]
+ if subset[j][indexB] != partBs[i]:
+ subset[j][indexB] = partBs[i]
+ subset[j][-1] += 1
+ subset[j][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
+ elif found == 2: # if found 2 and disjoint, merge them
+ j1, j2 = subset_idx
+ membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
+ if len(np.nonzero(membership == 2)[0]) == 0: # merge
+ subset[j1][:-2] += (subset[j2][:-2] + 1)
+ subset[j1][-2:] += subset[j2][-2:]
+ subset[j1][-2] += connection_all[k][i][2]
+ subset = np.delete(subset, j2, 0)
+ else: # as like found == 1
+ subset[j1][indexB] = partBs[i]
+ subset[j1][-1] += 1
+ subset[j1][-2] += candidate[partBs[i].astype(int), 2] + connection_all[k][i][2]
+
+ # if find no partA in the subset, create a new subset
+ elif not found and k < 17:
+ row = -1 * np.ones(20)
+ row[indexA] = partAs[i]
+ row[indexB] = partBs[i]
+ row[-1] = 2
+ row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
+ subset = np.vstack([subset, row])
+ # delete some rows of subset which has few parts occur
+ deleteIdx = []
+ for i in range(len(subset)):
+ if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
+ deleteIdx.append(i)
+ subset = np.delete(subset, deleteIdx, axis=0)
+
+ # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
+ # candidate: x, y, score, id
+ return candidate, subset
+
+if __name__ == "__main__":
+ body_estimation = Body(os.path.join(models_path, "openpose", "body_pose_model.pth"))
+
+ test_image = '../images/ski.jpg'
+ oriImg = cv2.imread(test_image) # B,G,R order
+ candidate, subset = body_estimation(oriImg)
+ canvas = util.draw_bodypose(oriImg, candidate, subset)
+ plt.imshow(canvas[:, :, [2, 1, 0]])
+ plt.show()
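
The (candidate, subset) format described in the comments above can be read back as follows; the numbers are made up for illustration.

    import numpy as np

    candidate = np.array([[120.0, 80.0, 0.91, 0],    # x, y, score, id
                          [118.0, 140.0, 0.88, 1]])
    subset = np.array([[0., 1.] + [-1.] * 16 + [1.79, 2.]])  # one person, 20 columns

    person = subset[0]
    for part, idx in enumerate(person[:18].astype(int)):
        if idx >= 0:
            x, y, score = candidate[idx][:3]
            print(f'part {part}: ({x:.0f}, {y:.0f}) score={score:.2f}')
    print('total score:', person[-2], 'num parts:', person[-1])
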
diff --git a/extensions/sd-webui-controlnet/annotator/openpose/hand.py b/extensions/sd-webui-controlnet/annotator/openpose/hand.py
new file mode 100644
index 0000000000000000000000000000000000000000..b44baa35e4b266e5da64eb095d9c2cead09e71b6
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/openpose/hand.py
@@ -0,0 +1,87 @@
+import os
+import cv2
+import json
+import numpy as np
+import math
+import time
+from scipy.ndimage.filters import gaussian_filter
+import matplotlib.pyplot as plt
+import matplotlib
+import torch
+from skimage.measure import label
+
+from .model import handpose_model
+from . import util
+from modules import devices
+from modules.paths import models_path
+
+class Hand(object):
+ def __init__(self, model_path):
+ self.model = handpose_model()
+ self.model = self.model.to(devices.get_device_for("controlnet"))
+ model_dict = util.transfer(self.model, torch.load(model_path))
+ self.model.load_state_dict(model_dict)
+ self.model.eval()
+
+ def __call__(self, oriImg):
+ self.model = self.model.to(devices.get_device_for("controlnet"))
+
+ scale_search = [0.5, 1.0, 1.5, 2.0]
+ # scale_search = [0.5]
+ boxsize = 368
+ stride = 8
+ padValue = 128
+ thre = 0.05
+ multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
+ heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 22))
+ # paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
+
+ for m in range(len(multiplier)):
+ scale = multiplier[m]
+ imageToTest = cv2.resize(oriImg, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
+ imageToTest_padded, pad = util.padRightDownCorner(imageToTest, stride, padValue)
+ im = np.transpose(np.float32(imageToTest_padded[:, :, :, np.newaxis]), (3, 2, 0, 1)) / 256 - 0.5
+ im = np.ascontiguousarray(im)
+
+ data = torch.from_numpy(im).float()
+ data = data.to(devices.get_device_for("controlnet"))
+ # data = data.permute([2, 0, 1]).unsqueeze(0).float()
+ with torch.no_grad():
+ output = self.model(data).cpu().numpy()
+ # output = self.model(data).numpy()
+
+ # extract outputs, resize, and remove padding
+ heatmap = np.transpose(np.squeeze(output), (1, 2, 0)) # output 1 is heatmaps
+ heatmap = cv2.resize(heatmap, (0, 0), fx=stride, fy=stride, interpolation=cv2.INTER_CUBIC)
+ heatmap = heatmap[:imageToTest_padded.shape[0] - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
+ heatmap = cv2.resize(heatmap, (oriImg.shape[1], oriImg.shape[0]), interpolation=cv2.INTER_CUBIC)
+
+ heatmap_avg += heatmap / len(multiplier)
+
+ all_peaks = []
+ for part in range(21):
+ map_ori = heatmap_avg[:, :, part]
+ one_heatmap = gaussian_filter(map_ori, sigma=3)
+ binary = np.ascontiguousarray(one_heatmap > thre, dtype=np.uint8)
+ # all peaks below the threshold
+ if np.sum(binary) == 0:
+ all_peaks.append([0, 0])
+ continue
+ label_img, label_numbers = label(binary, return_num=True, connectivity=binary.ndim)
+ max_index = np.argmax([np.sum(map_ori[label_img == i]) for i in range(1, label_numbers + 1)]) + 1
+ label_img[label_img != max_index] = 0
+ map_ori[label_img == 0] = 0
+
+ y, x = util.npmax(map_ori)
+ all_peaks.append([x, y])
+ return np.array(all_peaks)
+
+if __name__ == "__main__":
+ hand_estimation = Hand(os.path.join(models_path, "openpose", "hand_pose_model.pth"))
+
+ # test_image = '../images/hand.jpg'
+ test_image = '../images/hand.jpg'
+ oriImg = cv2.imread(test_image) # B,G,R order
+ peaks = hand_estimation(oriImg)
+ canvas = util.draw_handpose(oriImg, [peaks], True)
+ cv2.imshow('', canvas)
+ cv2.waitKey(0)
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/annotator/openpose/model.py b/extensions/sd-webui-controlnet/annotator/openpose/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dfc80de827a17beccb9b0f3f7588545be78c9de
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/openpose/model.py
@@ -0,0 +1,219 @@
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+
+def make_layers(block, no_relu_layers):
+ layers = []
+ for layer_name, v in block.items():
+ if 'pool' in layer_name:
+ layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
+ padding=v[2])
+ layers.append((layer_name, layer))
+ else:
+ conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
+ kernel_size=v[2], stride=v[3],
+ padding=v[4])
+ layers.append((layer_name, conv2d))
+ if layer_name not in no_relu_layers:
+ layers.append(('relu_'+layer_name, nn.ReLU(inplace=True)))
+
+ return nn.Sequential(OrderedDict(layers))
+
+class bodypose_model(nn.Module):
+ def __init__(self):
+ super(bodypose_model, self).__init__()
+
+ # these layers have no relu layer
+ no_relu_layers = ['conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',\
+ 'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',\
+ 'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',\
+ 'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2']
+ blocks = {}
+ block0 = OrderedDict([
+ ('conv1_1', [3, 64, 3, 1, 1]),
+ ('conv1_2', [64, 64, 3, 1, 1]),
+ ('pool1_stage1', [2, 2, 0]),
+ ('conv2_1', [64, 128, 3, 1, 1]),
+ ('conv2_2', [128, 128, 3, 1, 1]),
+ ('pool2_stage1', [2, 2, 0]),
+ ('conv3_1', [128, 256, 3, 1, 1]),
+ ('conv3_2', [256, 256, 3, 1, 1]),
+ ('conv3_3', [256, 256, 3, 1, 1]),
+ ('conv3_4', [256, 256, 3, 1, 1]),
+ ('pool3_stage1', [2, 2, 0]),
+ ('conv4_1', [256, 512, 3, 1, 1]),
+ ('conv4_2', [512, 512, 3, 1, 1]),
+ ('conv4_3_CPM', [512, 256, 3, 1, 1]),
+ ('conv4_4_CPM', [256, 128, 3, 1, 1])
+ ])
+
+
+ # Stage 1
+ block1_1 = OrderedDict([
+ ('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
+ ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
+ ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])
+ ])
+
+ block1_2 = OrderedDict([
+ ('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
+ ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
+ ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])
+ ])
+ blocks['block1_1'] = block1_1
+ blocks['block1_2'] = block1_2
+
+ self.model0 = make_layers(block0, no_relu_layers)
+
+ # Stages 2 - 6
+ for i in range(2, 7):
+ blocks['block%d_1' % i] = OrderedDict([
+ ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
+ ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
+ ])
+
+ blocks['block%d_2' % i] = OrderedDict([
+ ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
+ ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
+ ])
+
+ for k in blocks.keys():
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+ self.model1_1 = blocks['block1_1']
+ self.model2_1 = blocks['block2_1']
+ self.model3_1 = blocks['block3_1']
+ self.model4_1 = blocks['block4_1']
+ self.model5_1 = blocks['block5_1']
+ self.model6_1 = blocks['block6_1']
+
+ self.model1_2 = blocks['block1_2']
+ self.model2_2 = blocks['block2_2']
+ self.model3_2 = blocks['block3_2']
+ self.model4_2 = blocks['block4_2']
+ self.model5_2 = blocks['block5_2']
+ self.model6_2 = blocks['block6_2']
+
+
+ def forward(self, x):
+
+ out1 = self.model0(x)
+
+ out1_1 = self.model1_1(out1)
+ out1_2 = self.model1_2(out1)
+ out2 = torch.cat([out1_1, out1_2, out1], 1)
+
+ out2_1 = self.model2_1(out2)
+ out2_2 = self.model2_2(out2)
+ out3 = torch.cat([out2_1, out2_2, out1], 1)
+
+ out3_1 = self.model3_1(out3)
+ out3_2 = self.model3_2(out3)
+ out4 = torch.cat([out3_1, out3_2, out1], 1)
+
+ out4_1 = self.model4_1(out4)
+ out4_2 = self.model4_2(out4)
+ out5 = torch.cat([out4_1, out4_2, out1], 1)
+
+ out5_1 = self.model5_1(out5)
+ out5_2 = self.model5_2(out5)
+ out6 = torch.cat([out5_1, out5_2, out1], 1)
+
+ out6_1 = self.model6_1(out6)
+ out6_2 = self.model6_2(out6)
+
+ return out6_1, out6_2
+
+class handpose_model(nn.Module):
+ def __init__(self):
+ super(handpose_model, self).__init__()
+
+ # these layers have no relu layer
+ no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3',\
+ 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
+ # stage 1
+ block1_0 = OrderedDict([
+ ('conv1_1', [3, 64, 3, 1, 1]),
+ ('conv1_2', [64, 64, 3, 1, 1]),
+ ('pool1_stage1', [2, 2, 0]),
+ ('conv2_1', [64, 128, 3, 1, 1]),
+ ('conv2_2', [128, 128, 3, 1, 1]),
+ ('pool2_stage1', [2, 2, 0]),
+ ('conv3_1', [128, 256, 3, 1, 1]),
+ ('conv3_2', [256, 256, 3, 1, 1]),
+ ('conv3_3', [256, 256, 3, 1, 1]),
+ ('conv3_4', [256, 256, 3, 1, 1]),
+ ('pool3_stage1', [2, 2, 0]),
+ ('conv4_1', [256, 512, 3, 1, 1]),
+ ('conv4_2', [512, 512, 3, 1, 1]),
+ ('conv4_3', [512, 512, 3, 1, 1]),
+ ('conv4_4', [512, 512, 3, 1, 1]),
+ ('conv5_1', [512, 512, 3, 1, 1]),
+ ('conv5_2', [512, 512, 3, 1, 1]),
+ ('conv5_3_CPM', [512, 128, 3, 1, 1])
+ ])
+
+ block1_1 = OrderedDict([
+ ('conv6_1_CPM', [128, 512, 1, 1, 0]),
+ ('conv6_2_CPM', [512, 22, 1, 1, 0])
+ ])
+
+ blocks = {}
+ blocks['block1_0'] = block1_0
+ blocks['block1_1'] = block1_1
+
+ # stage 2-6
+ for i in range(2, 7):
+ blocks['block%d' % i] = OrderedDict([
+ ('Mconv1_stage%d' % i, [150, 128, 7, 1, 3]),
+ ('Mconv2_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv3_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv4_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv5_stage%d' % i, [128, 128, 7, 1, 3]),
+ ('Mconv6_stage%d' % i, [128, 128, 1, 1, 0]),
+ ('Mconv7_stage%d' % i, [128, 22, 1, 1, 0])
+ ])
+
+ for k in blocks.keys():
+ blocks[k] = make_layers(blocks[k], no_relu_layers)
+
+ self.model1_0 = blocks['block1_0']
+ self.model1_1 = blocks['block1_1']
+ self.model2 = blocks['block2']
+ self.model3 = blocks['block3']
+ self.model4 = blocks['block4']
+ self.model5 = blocks['block5']
+ self.model6 = blocks['block6']
+
+ def forward(self, x):
+ out1_0 = self.model1_0(x)
+ out1_1 = self.model1_1(out1_0)
+ concat_stage2 = torch.cat([out1_1, out1_0], 1)
+ out_stage2 = self.model2(concat_stage2)
+ concat_stage3 = torch.cat([out_stage2, out1_0], 1)
+ out_stage3 = self.model3(concat_stage3)
+ concat_stage4 = torch.cat([out_stage3, out1_0], 1)
+ out_stage4 = self.model4(concat_stage4)
+ concat_stage5 = torch.cat([out_stage4, out1_0], 1)
+ out_stage5 = self.model5(concat_stage5)
+ concat_stage6 = torch.cat([out_stage5, out1_0], 1)
+ out_stage6 = self.model6(concat_stage6)
+ return out_stage6
+
+
diff --git a/extensions/sd-webui-controlnet/annotator/openpose/util.py b/extensions/sd-webui-controlnet/annotator/openpose/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f91ae0e65abaf0cbd62d803f56498991141e61b
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/openpose/util.py
@@ -0,0 +1,164 @@
+import math
+import numpy as np
+import matplotlib
+import cv2
+
+
+def padRightDownCorner(img, stride, padValue):
+ h = img.shape[0]
+ w = img.shape[1]
+
+ pad = 4 * [None]
+ pad[0] = 0 # up
+ pad[1] = 0 # left
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
+
+ img_padded = img
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
+
+ return img_padded, pad
+
+# transfer caffe model weights to pytorch, matching layer names
+def transfer(model, model_weights):
+ transfered_model_weights = {}
+ for weights_name in model.state_dict().keys():
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
+ return transfered_model_weights
+
+# draw the body keypoints and limbs
+def draw_bodypose(canvas, candidate, subset):
+ stickwidth = 4
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
+ [1, 16], [16, 18], [3, 17], [6, 18]]
+
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
+ for i in range(18):
+ for n in range(len(subset)):
+ index = int(subset[n][i])
+ if index == -1:
+ continue
+ x, y = candidate[index][0:2]
+ cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
+ for i in range(17):
+ for n in range(len(subset)):
+ index = subset[n][np.array(limbSeq[i]) - 1]
+ if -1 in index:
+ continue
+ cur_canvas = canvas.copy()
+ Y = candidate[index.astype(int), 0]
+ X = candidate[index.astype(int), 1]
+ mX = np.mean(X)
+ mY = np.mean(Y)
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
+ cv2.fillConvexPoly(cur_canvas, polygon, colors[i])
+ canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0)
+ # plt.imsave("preview.jpg", canvas[:, :, [2, 1, 0]])
+ # plt.imshow(canvas[:, :, [2, 1, 0]])
+ return canvas
+
+
+# image drawn by opencv is not good.
+def draw_handpose(canvas, all_hand_peaks, show_number=False):
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
+
+ for peaks in all_hand_peaks:
+ for ie, e in enumerate(edges):
+ if np.sum(np.all(peaks[e], axis=1)==0)==0:
+ x1, y1 = peaks[e[0]]
+ x2, y2 = peaks[e[1]]
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie/float(len(edges)), 1.0, 1.0])*255, thickness=2)
+
+ for i, keypoint in enumerate(peaks):
+ x, y = keypoint
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
+ if show_number:
+ cv2.putText(canvas, str(i), (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (0, 0, 0), lineType=cv2.LINE_AA)
+ return canvas
+
+# detect hand according to body pose keypoints
+# please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
+def handDetect(candidate, subset, oriImg):
+ # right hand: wrist 4, elbow 3, shoulder 2
+ # left hand: wrist 7, elbow 6, shoulder 5
+ ratioWristElbow = 0.33
+ detect_result = []
+ image_height, image_width = oriImg.shape[0:2]
+ for person in subset.astype(int):
+ # if any of three not detected
+ has_left = np.sum(person[[5, 6, 7]] == -1) == 0
+ has_right = np.sum(person[[2, 3, 4]] == -1) == 0
+ if not (has_left or has_right):
+ continue
+ hands = []
+ #left hand
+ if has_left:
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
+ x1, y1 = candidate[left_shoulder_index][:2]
+ x2, y2 = candidate[left_elbow_index][:2]
+ x3, y3 = candidate[left_wrist_index][:2]
+ hands.append([x1, y1, x2, y2, x3, y3, True])
+ # right hand
+ if has_right:
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
+ x1, y1 = candidate[right_shoulder_index][:2]
+ x2, y2 = candidate[right_elbow_index][:2]
+ x3, y3 = candidate[right_wrist_index][:2]
+ hands.append([x1, y1, x2, y2, x3, y3, False])
+
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
+ x = x3 + ratioWristElbow * (x3 - x2)
+ y = y3 + ratioWristElbow * (y3 - y2)
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
+ # x-y refers to the center --> offset to topLeft point
+ # handRectangle.x -= handRectangle.width / 2.f;
+ # handRectangle.y -= handRectangle.height / 2.f;
+ x -= width / 2
+ y -= width / 2 # width = height
+ # overflow the image
+ if x < 0: x = 0
+ if y < 0: y = 0
+ width1 = width
+ width2 = width
+ if x + width > image_width: width1 = image_width - x
+ if y + width > image_height: width2 = image_height - y
+ width = min(width1, width2)
+ # require the hand box to be at least 20 pixels wide
+ if width >= 20:
+ detect_result.append([int(x), int(y), int(width), is_left])
+
+ '''
+ return value: [[x, y, w, True if left hand else False]].
+ width == height since the network requires square input.
+ x, y are the coordinates of the top-left corner.
+ '''
+ return detect_result
+
+# get max index of 2d array
+def npmax(array):
+ arrayindex = array.argmax(1)
+ arrayvalue = array.max(1)
+ i = arrayvalue.argmax()
+ j = arrayindex[i]
+ return i, j
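
A worked example of the hand-box rule spelled out in the handDetect comments (coordinates are made up): the box center is pushed past the wrist by 0.33 of the elbow-to-wrist vector, and the square side comes from the limb lengths.

    import math

    shoulder, elbow, wrist = (200.0, 100.0), (230.0, 160.0), (250.0, 210.0)
    ratio = 0.33
    x = wrist[0] + ratio * (wrist[0] - elbow[0])   # 256.6
    y = wrist[1] + ratio * (wrist[1] - elbow[1])   # 226.5
    d_we = math.dist(wrist, elbow)                 # ~53.9
    d_es = math.dist(elbow, shoulder)              # ~67.1
    width = 1.5 * max(d_we, 0.9 * d_es)            # ~90.6
    x, y = x - width / 2, y - width / 2            # move from center to top-left
    print(round(x), round(y), round(width))        # 211 181 91
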
diff --git a/extensions/sd-webui-controlnet/annotator/pidinet/__init__.py b/extensions/sd-webui-controlnet/annotator/pidinet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..02554d532ebd18e22e1fa33f2e6d18aebc1f7fd9
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/pidinet/__init__.py
@@ -0,0 +1,46 @@
+import os
+import torch
+import numpy as np
+from einops import rearrange
+from annotator.pidinet.model import pidinet
+from modules import devices
+from modules.paths import models_path
+from scripts.utils import load_state_dict
+
+netNetwork = None
+remote_model_path = "https://github.com/TencentARC/T2I-Adapter/raw/main/models/table5_pidinet.pth"
+modeldir = os.path.join(models_path, "pidinet")
+old_modeldir = os.path.dirname(os.path.realpath(__file__))
+
+def apply_pidinet(input_image):
+ global netNetwork
+ if netNetwork is None:
+ modelpath = os.path.join(modeldir, "table5_pidinet.pth")
+ old_modelpath = os.path.join(old_modeldir, "table5_pidinet.pth")
+ if os.path.exists(old_modelpath):
+ modelpath = old_modelpath
+ elif not os.path.exists(modelpath):
+ from basicsr.utils.download_util import load_file_from_url
+ load_file_from_url(remote_model_path, model_dir=modeldir)
+ netNetwork = pidinet()
+ ckp = load_state_dict(modelpath)
+ netNetwork.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()})
+
+ netNetwork = netNetwork.to(devices.get_device_for("controlnet"))
+ netNetwork.eval()
+ assert input_image.ndim == 3
+ input_image = input_image[:, :, ::-1].copy()
+ with torch.no_grad():
+ image_pidi = torch.from_numpy(input_image).float().to(devices.get_device_for("controlnet"))
+ image_pidi = image_pidi / 255.0
+ image_pidi = rearrange(image_pidi, 'h w c -> 1 c h w')
+ edge = netNetwork(image_pidi)[-1]
+ edge = edge>0.5
+ edge = (edge * 255.0).clip(0, 255).cpu().numpy().astype(np.uint8)
+
+ return edge[0][0]
+
+def unload_pid_model():
+ global netNetwork
+ if netNetwork is not None:
+ netNetwork.cpu()
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/annotator/pidinet/model.py b/extensions/sd-webui-controlnet/annotator/pidinet/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaed557dea23d56f2aea6045f6a350bee03083ce
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/pidinet/model.py
@@ -0,0 +1,653 @@
+"""
+Author: Zhuo Su, Wenzhe Liu
+Date: Feb 18, 2021
+"""
+
+import math
+
+import cv2
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from basicsr.utils import img2tensor
+
+nets = {
+ 'baseline': {
+ 'layer0': 'cv',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cv',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cv',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cv',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'c-v15': {
+ 'layer0': 'cd',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cv',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cv',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cv',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'a-v15': {
+ 'layer0': 'ad',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cv',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cv',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cv',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'r-v15': {
+ 'layer0': 'rd',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cv',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cv',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cv',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'cvvv4': {
+ 'layer0': 'cd',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'cd',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'cd',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'cd',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'avvv4': {
+ 'layer0': 'ad',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'ad',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'ad',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'ad',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'rvvv4': {
+ 'layer0': 'rd',
+ 'layer1': 'cv',
+ 'layer2': 'cv',
+ 'layer3': 'cv',
+ 'layer4': 'rd',
+ 'layer5': 'cv',
+ 'layer6': 'cv',
+ 'layer7': 'cv',
+ 'layer8': 'rd',
+ 'layer9': 'cv',
+ 'layer10': 'cv',
+ 'layer11': 'cv',
+ 'layer12': 'rd',
+ 'layer13': 'cv',
+ 'layer14': 'cv',
+ 'layer15': 'cv',
+ },
+ 'cccv4': {
+ 'layer0': 'cd',
+ 'layer1': 'cd',
+ 'layer2': 'cd',
+ 'layer3': 'cv',
+ 'layer4': 'cd',
+ 'layer5': 'cd',
+ 'layer6': 'cd',
+ 'layer7': 'cv',
+ 'layer8': 'cd',
+ 'layer9': 'cd',
+ 'layer10': 'cd',
+ 'layer11': 'cv',
+ 'layer12': 'cd',
+ 'layer13': 'cd',
+ 'layer14': 'cd',
+ 'layer15': 'cv',
+ },
+ 'aaav4': {
+ 'layer0': 'ad',
+ 'layer1': 'ad',
+ 'layer2': 'ad',
+ 'layer3': 'cv',
+ 'layer4': 'ad',
+ 'layer5': 'ad',
+ 'layer6': 'ad',
+ 'layer7': 'cv',
+ 'layer8': 'ad',
+ 'layer9': 'ad',
+ 'layer10': 'ad',
+ 'layer11': 'cv',
+ 'layer12': 'ad',
+ 'layer13': 'ad',
+ 'layer14': 'ad',
+ 'layer15': 'cv',
+ },
+ 'rrrv4': {
+ 'layer0': 'rd',
+ 'layer1': 'rd',
+ 'layer2': 'rd',
+ 'layer3': 'cv',
+ 'layer4': 'rd',
+ 'layer5': 'rd',
+ 'layer6': 'rd',
+ 'layer7': 'cv',
+ 'layer8': 'rd',
+ 'layer9': 'rd',
+ 'layer10': 'rd',
+ 'layer11': 'cv',
+ 'layer12': 'rd',
+ 'layer13': 'rd',
+ 'layer14': 'rd',
+ 'layer15': 'cv',
+ },
+ 'c16': {
+ 'layer0': 'cd',
+ 'layer1': 'cd',
+ 'layer2': 'cd',
+ 'layer3': 'cd',
+ 'layer4': 'cd',
+ 'layer5': 'cd',
+ 'layer6': 'cd',
+ 'layer7': 'cd',
+ 'layer8': 'cd',
+ 'layer9': 'cd',
+ 'layer10': 'cd',
+ 'layer11': 'cd',
+ 'layer12': 'cd',
+ 'layer13': 'cd',
+ 'layer14': 'cd',
+ 'layer15': 'cd',
+ },
+ 'a16': {
+ 'layer0': 'ad',
+ 'layer1': 'ad',
+ 'layer2': 'ad',
+ 'layer3': 'ad',
+ 'layer4': 'ad',
+ 'layer5': 'ad',
+ 'layer6': 'ad',
+ 'layer7': 'ad',
+ 'layer8': 'ad',
+ 'layer9': 'ad',
+ 'layer10': 'ad',
+ 'layer11': 'ad',
+ 'layer12': 'ad',
+ 'layer13': 'ad',
+ 'layer14': 'ad',
+ 'layer15': 'ad',
+ },
+ 'r16': {
+ 'layer0': 'rd',
+ 'layer1': 'rd',
+ 'layer2': 'rd',
+ 'layer3': 'rd',
+ 'layer4': 'rd',
+ 'layer5': 'rd',
+ 'layer6': 'rd',
+ 'layer7': 'rd',
+ 'layer8': 'rd',
+ 'layer9': 'rd',
+ 'layer10': 'rd',
+ 'layer11': 'rd',
+ 'layer12': 'rd',
+ 'layer13': 'rd',
+ 'layer14': 'rd',
+ 'layer15': 'rd',
+ },
+ 'carv4': {
+ 'layer0': 'cd',
+ 'layer1': 'ad',
+ 'layer2': 'rd',
+ 'layer3': 'cv',
+ 'layer4': 'cd',
+ 'layer5': 'ad',
+ 'layer6': 'rd',
+ 'layer7': 'cv',
+ 'layer8': 'cd',
+ 'layer9': 'ad',
+ 'layer10': 'rd',
+ 'layer11': 'cv',
+ 'layer12': 'cd',
+ 'layer13': 'ad',
+ 'layer14': 'rd',
+ 'layer15': 'cv',
+ },
+ }
+
+def createConvFunc(op_type):
+ assert op_type in ['cv', 'cd', 'ad', 'rd'], 'unknown op type: %s' % str(op_type)
+ if op_type == 'cv':
+ return F.conv2d
+
+ if op_type == 'cd':
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
+ assert dilation in [1, 2], 'dilation for cd_conv should be in 1 or 2'
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for cd_conv should be 3x3'
+ assert padding == dilation, 'padding for cd_conv set wrong'
+
+ weights_c = weights.sum(dim=[2, 3], keepdim=True)
+ yc = F.conv2d(x, weights_c, stride=stride, padding=0, groups=groups)
+ y = F.conv2d(x, weights, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
+ return y - yc
+ return func
+ elif op_type == 'ad':
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
+ assert dilation in [1, 2], 'dilation for ad_conv should be in 1 or 2'
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for ad_conv should be 3x3'
+ assert padding == dilation, 'padding for ad_conv set wrong'
+
+ shape = weights.shape
+ weights = weights.view(shape[0], shape[1], -1)
+ weights_conv = (weights - weights[:, :, [3, 0, 1, 6, 4, 2, 7, 8, 5]]).view(shape) # clock-wise
+ y = F.conv2d(x, weights_conv, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
+ return y
+ return func
+ elif op_type == 'rd':
+ def func(x, weights, bias=None, stride=1, padding=0, dilation=1, groups=1):
+ assert dilation in [1, 2], 'dilation for rd_conv should be in 1 or 2'
+ assert weights.size(2) == 3 and weights.size(3) == 3, 'kernel size for rd_conv should be 3x3'
+ padding = 2 * dilation
+
+ shape = weights.shape
+ if weights.is_cuda:
+ buffer = torch.cuda.FloatTensor(shape[0], shape[1], 5 * 5).fill_(0)
+ else:
+ buffer = torch.zeros(shape[0], shape[1], 5 * 5)
+ weights = weights.view(shape[0], shape[1], -1)
+ buffer[:, :, [0, 2, 4, 10, 14, 20, 22, 24]] = weights[:, :, 1:]
+ buffer[:, :, [6, 7, 8, 11, 13, 16, 17, 18]] = -weights[:, :, 1:]
+ buffer[:, :, 12] = 0
+ buffer = buffer.view(shape[0], shape[1], 5, 5)
+ y = F.conv2d(x, buffer, bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
+ return y
+ return func
+ else:
+ print('impossible to be here unless you force that')
+ return None
+
+class Conv2d(nn.Module):
+ def __init__(self, pdc, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=False):
+ super(Conv2d, self).__init__()
+ if in_channels % groups != 0:
+ raise ValueError('in_channels must be divisible by groups')
+ if out_channels % groups != 0:
+ raise ValueError('out_channels must be divisible by groups')
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.padding = padding
+ self.dilation = dilation
+ self.groups = groups
+ self.weight = nn.Parameter(torch.Tensor(out_channels, in_channels // groups, kernel_size, kernel_size))
+ if bias:
+ self.bias = nn.Parameter(torch.Tensor(out_channels))
+ else:
+ self.register_parameter('bias', None)
+ self.reset_parameters()
+ self.pdc = pdc
+
+ def reset_parameters(self):
+ nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+ if self.bias is not None:
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
+ bound = 1 / math.sqrt(fan_in)
+ nn.init.uniform_(self.bias, -bound, bound)
+
+ def forward(self, input):
+
+ return self.pdc(input, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
+
+class CSAM(nn.Module):
+ """
+ Compact Spatial Attention Module
+ """
+ def __init__(self, channels):
+ super(CSAM, self).__init__()
+
+ mid_channels = 4
+ self.relu1 = nn.ReLU()
+ self.conv1 = nn.Conv2d(channels, mid_channels, kernel_size=1, padding=0)
+ self.conv2 = nn.Conv2d(mid_channels, 1, kernel_size=3, padding=1, bias=False)
+ self.sigmoid = nn.Sigmoid()
+ nn.init.constant_(self.conv1.bias, 0)
+
+ def forward(self, x):
+ y = self.relu1(x)
+ y = self.conv1(y)
+ y = self.conv2(y)
+ y = self.sigmoid(y)
+
+ return x * y
+
+class CDCM(nn.Module):
+ """
+ Compact Dilation Convolution based Module
+ """
+ def __init__(self, in_channels, out_channels):
+ super(CDCM, self).__init__()
+
+ self.relu1 = nn.ReLU()
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)
+ self.conv2_1 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=5, padding=5, bias=False)
+ self.conv2_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=7, padding=7, bias=False)
+ self.conv2_3 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=9, padding=9, bias=False)
+ self.conv2_4 = nn.Conv2d(out_channels, out_channels, kernel_size=3, dilation=11, padding=11, bias=False)
+ nn.init.constant_(self.conv1.bias, 0)
+
+ def forward(self, x):
+ x = self.relu1(x)
+ x = self.conv1(x)
+ x1 = self.conv2_1(x)
+ x2 = self.conv2_2(x)
+ x3 = self.conv2_3(x)
+ x4 = self.conv2_4(x)
+ return x1 + x2 + x3 + x4
+
+
+class MapReduce(nn.Module):
+ """
+ Reduce feature maps into a single edge map
+ """
+ def __init__(self, channels):
+ super(MapReduce, self).__init__()
+ self.conv = nn.Conv2d(channels, 1, kernel_size=1, padding=0)
+ nn.init.constant_(self.conv.bias, 0)
+
+ def forward(self, x):
+ return self.conv(x)
+
+
+class PDCBlock(nn.Module):
+ def __init__(self, pdc, inplane, ouplane, stride=1):
+ super(PDCBlock, self).__init__()
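+ # depthwise pixel-difference 3x3 -> ReLU -> pointwise 1x1, with a residual connection
+ # (1x1 shortcut plus 2x2 max-pool when the block downsamples)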
+ self.stride = stride
+
+ if self.stride > 1:
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+ self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
+ self.conv1 = Conv2d(pdc, inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
+ self.relu2 = nn.ReLU()
+ self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)
+
+ def forward(self, x):
+ if self.stride > 1:
+ x = self.pool(x)
+ y = self.conv1(x)
+ y = self.relu2(y)
+ y = self.conv2(y)
+ if self.stride > 1:
+ x = self.shortcut(x)
+ y = y + x
+ return y
+
+class PDCBlock_converted(nn.Module):
+ """
+ CPDC, APDC can be converted to vanilla 3x3 convolution
+ RPDC can be converted to vanilla 5x5 convolution
+ """
+ def __init__(self, pdc, inplane, ouplane, stride=1):
+ super(PDCBlock_converted, self).__init__()
+ self.stride = stride
+
+ if self.stride > 1:
+ self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+ self.shortcut = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0)
+ if pdc == 'rd':
+ self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=5, padding=2, groups=inplane, bias=False)
+ else:
+ self.conv1 = nn.Conv2d(inplane, inplane, kernel_size=3, padding=1, groups=inplane, bias=False)
+ self.relu2 = nn.ReLU()
+ self.conv2 = nn.Conv2d(inplane, ouplane, kernel_size=1, padding=0, bias=False)
+
+ def forward(self, x):
+ if self.stride > 1:
+ x = self.pool(x)
+ y = self.conv1(x)
+ y = self.relu2(y)
+ y = self.conv2(y)
+ if self.stride > 1:
+ x = self.shortcut(x)
+ y = y + x
+ return y
+
+class PiDiNet(nn.Module):
+ def __init__(self, inplane, pdcs, dil=None, sa=False, convert=False):
+ super(PiDiNet, self).__init__()
+ self.sa = sa
+ if dil is not None:
+ assert isinstance(dil, int), 'dil should be an int'
+ self.dil = dil
+
+ self.fuseplanes = []
+
+ self.inplane = inplane
+ if convert:
+ if pdcs[0] == 'rd':
+ init_kernel_size = 5
+ init_padding = 2
+ else:
+ init_kernel_size = 3
+ init_padding = 1
+ self.init_block = nn.Conv2d(3, self.inplane,
+ kernel_size=init_kernel_size, padding=init_padding, bias=False)
+ block_class = PDCBlock_converted
+ else:
+ self.init_block = Conv2d(pdcs[0], 3, self.inplane, kernel_size=3, padding=1)
+ block_class = PDCBlock
+
+ self.block1_1 = block_class(pdcs[1], self.inplane, self.inplane)
+ self.block1_2 = block_class(pdcs[2], self.inplane, self.inplane)
+ self.block1_3 = block_class(pdcs[3], self.inplane, self.inplane)
+ self.fuseplanes.append(self.inplane) # C
+
+ inplane = self.inplane
+ self.inplane = self.inplane * 2
+ self.block2_1 = block_class(pdcs[4], inplane, self.inplane, stride=2)
+ self.block2_2 = block_class(pdcs[5], self.inplane, self.inplane)
+ self.block2_3 = block_class(pdcs[6], self.inplane, self.inplane)
+ self.block2_4 = block_class(pdcs[7], self.inplane, self.inplane)
+ self.fuseplanes.append(self.inplane) # 2C
+
+ inplane = self.inplane
+ self.inplane = self.inplane * 2
+ self.block3_1 = block_class(pdcs[8], inplane, self.inplane, stride=2)
+ self.block3_2 = block_class(pdcs[9], self.inplane, self.inplane)
+ self.block3_3 = block_class(pdcs[10], self.inplane, self.inplane)
+ self.block3_4 = block_class(pdcs[11], self.inplane, self.inplane)
+ self.fuseplanes.append(self.inplane) # 4C
+
+ self.block4_1 = block_class(pdcs[12], self.inplane, self.inplane, stride=2)
+ self.block4_2 = block_class(pdcs[13], self.inplane, self.inplane)
+ self.block4_3 = block_class(pdcs[14], self.inplane, self.inplane)
+ self.block4_4 = block_class(pdcs[15], self.inplane, self.inplane)
+ self.fuseplanes.append(self.inplane) # 4C
+
+ self.conv_reduces = nn.ModuleList()
+ if self.sa and self.dil is not None:
+ self.attentions = nn.ModuleList()
+ self.dilations = nn.ModuleList()
+ for i in range(4):
+ self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
+ self.attentions.append(CSAM(self.dil))
+ self.conv_reduces.append(MapReduce(self.dil))
+ elif self.sa:
+ self.attentions = nn.ModuleList()
+ for i in range(4):
+ self.attentions.append(CSAM(self.fuseplanes[i]))
+ self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
+ elif self.dil is not None:
+ self.dilations = nn.ModuleList()
+ for i in range(4):
+ self.dilations.append(CDCM(self.fuseplanes[i], self.dil))
+ self.conv_reduces.append(MapReduce(self.dil))
+ else:
+ for i in range(4):
+ self.conv_reduces.append(MapReduce(self.fuseplanes[i]))
+
+ self.classifier = nn.Conv2d(4, 1, kernel_size=1) # has bias
+ nn.init.constant_(self.classifier.weight, 0.25)
+ nn.init.constant_(self.classifier.bias, 0)
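+ # the fusion conv starts as a plain average of the four side outputs (4 x 0.25)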
+
+ # print('initialization done')
+
+ def get_weights(self):
+ conv_weights = []
+ bn_weights = []
+ relu_weights = []
+ for pname, p in self.named_parameters():
+ if 'bn' in pname:
+ bn_weights.append(p)
+ elif 'relu' in pname:
+ relu_weights.append(p)
+ else:
+ conv_weights.append(p)
+
+ return conv_weights, bn_weights, relu_weights
+
+ def forward(self, x):
+ H, W = x.size()[2:]
+
+ x = self.init_block(x)
+
+ x1 = self.block1_1(x)
+ x1 = self.block1_2(x1)
+ x1 = self.block1_3(x1)
+
+ x2 = self.block2_1(x1)
+ x2 = self.block2_2(x2)
+ x2 = self.block2_3(x2)
+ x2 = self.block2_4(x2)
+
+ x3 = self.block3_1(x2)
+ x3 = self.block3_2(x3)
+ x3 = self.block3_3(x3)
+ x3 = self.block3_4(x3)
+
+ x4 = self.block4_1(x3)
+ x4 = self.block4_2(x4)
+ x4 = self.block4_3(x4)
+ x4 = self.block4_4(x4)
+
+ x_fuses = []
+ if self.sa and self.dil is not None:
+ for i, xi in enumerate([x1, x2, x3, x4]):
+ x_fuses.append(self.attentions[i](self.dilations[i](xi)))
+ elif self.sa:
+ for i, xi in enumerate([x1, x2, x3, x4]):
+ x_fuses.append(self.attentions[i](xi))
+ elif self.dil is not None:
+ for i, xi in enumerate([x1, x2, x3, x4]):
+ x_fuses.append(self.dilations[i](xi))
+ else:
+ x_fuses = [x1, x2, x3, x4]
+
+ e1 = self.conv_reduces[0](x_fuses[0])
+ e1 = F.interpolate(e1, (H, W), mode="bilinear", align_corners=False)
+
+ e2 = self.conv_reduces[1](x_fuses[1])
+ e2 = F.interpolate(e2, (H, W), mode="bilinear", align_corners=False)
+
+ e3 = self.conv_reduces[2](x_fuses[2])
+ e3 = F.interpolate(e3, (H, W), mode="bilinear", align_corners=False)
+
+ e4 = self.conv_reduces[3](x_fuses[3])
+ e4 = F.interpolate(e4, (H, W), mode="bilinear", align_corners=False)
+
+ outputs = [e1, e2, e3, e4]
+
+ output = self.classifier(torch.cat(outputs, dim=1))
+ #if not self.training:
+ # return torch.sigmoid(output)
+
+ outputs.append(output)
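+ # outputs: four side edge maps plus the fused map; callers typically take the last (fused) one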
+ outputs = [torch.sigmoid(r) for r in outputs]
+ return outputs
+
+def config_model(model):
+ model_options = list(nets.keys())
+ assert model in model_options, \
+ 'unrecognized model, please choose from %s' % str(model_options)
+
+ # print(str(nets[model]))
+
+ pdcs = []
+ for i in range(16):
+ layer_name = 'layer%d' % i
+ op = nets[model][layer_name]
+ pdcs.append(createConvFunc(op))
+
+ return pdcs
+
+def pidinet():
+ pdcs = config_model('carv4')
+ dil = 24 #if args.dil else None
+ return PiDiNet(60, pdcs, dil=dil, sa=True)
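+ # 'carv4' picks the per-layer pixel-difference ops from the nets table; dil=24 and sa=True
+ # enable the CDCM dilation and CSAM attention branches on every stage output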
+
+
+if __name__ == '__main__':
+ model = pidinet()
+ ckp = torch.load('table5_pidinet.pth')['state_dict']
+ model.load_state_dict({k.replace('module.',''):v for k, v in ckp.items()})
+ im = cv2.imread('examples/test_my/cat_v4.png')
+ im = img2tensor(im).unsqueeze(0)/255.
+ res = model(im)[-1]
+ res = res>0.5
+ res = res.float()
+ res = (res[0,0].cpu().data.numpy()*255.).astype(np.uint8)
+ print(res.shape)
+ cv2.imwrite('edge.png', res)
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/__init__.py b/extensions/sd-webui-controlnet/annotator/uniformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ee8b4edaf1c2153fb9436706e572f5aa49d9803
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/__init__.py
@@ -0,0 +1,56 @@
+import os
+from modules.paths import models_path
+from modules import devices
+from annotator.uniformer.inference import init_segmentor, inference_segmentor, show_result_pyplot
+
+try:
+ from mmseg.core.evaluation import get_palette
+except ImportError:
+ from annotator.mmpkg.mmseg.core.evaluation import get_palette
+
+modeldir = os.path.join(models_path, "uniformer")
+checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth"
+config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "upernet_global_small.py")
+old_modeldir = os.path.dirname(os.path.realpath(__file__))
+model = None
+
+def unload_uniformer_model():
+ global model
+ if model is not None:
+ model = model.cpu()
+
+def apply_uniformer(img):
+ global model
+ if model is None:
+ modelpath = os.path.join(modeldir, "upernet_global_small.pth")
+ old_modelpath = os.path.join(old_modeldir, "upernet_global_small.pth")
+ if os.path.exists(old_modelpath):
+ modelpath = old_modelpath
+ elif not os.path.exists(modelpath):
+ from basicsr.utils.download_util import load_file_from_url
+ load_file_from_url(checkpoint_file, model_dir=modeldir)
+
+ model = init_segmentor(config_file, modelpath)
+ model = model.to(devices.get_device_for("controlnet"))
+
+ if devices.get_device_for("controlnet").type == 'mps':
+ # adaptive_avg_pool2d can fail on MPS, workaround with CPU
+ import torch.nn.functional
+
+ orig_adaptive_avg_pool2d = torch.nn.functional.adaptive_avg_pool2d
+ def cpu_if_exception(input, *args, **kwargs):
+ try:
+ return orig_adaptive_avg_pool2d(input, *args, **kwargs)
+ except Exception:
+ return orig_adaptive_avg_pool2d(input.cpu(), *args, **kwargs).to(input.device)
+
+ try:
+ torch.nn.functional.adaptive_avg_pool2d = cpu_if_exception
+ result = inference_segmentor(model, img)
+ finally:
+ torch.nn.functional.adaptive_avg_pool2d = orig_adaptive_avg_pool2d
+ else:
+ result = inference_segmentor(model, img)
+
+ res_img = show_result_pyplot(model, img, result, get_palette('ade'), opacity=1)
+ return res_img
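+
+# Usage sketch: apply_uniformer(img) lazily downloads and initialises the segmentor on first
+# call, then returns the predicted segmentation rendered with the ADE20K palette (opacity=1).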
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/ade20k.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/ade20k.py
new file mode 100644
index 0000000000000000000000000000000000000000..efc8b4bb20c981f3db6df7eb52b3dc0744c94cc0
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/ade20k.py
@@ -0,0 +1,54 @@
+# dataset settings
+dataset_type = 'ADE20KDataset'
+data_root = 'data/ade/ADEChallengeData2016'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (512, 512)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', reduce_zero_label=True),
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2048, 512),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/chase_db1.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/chase_db1.py
new file mode 100644
index 0000000000000000000000000000000000000000..298594ea925f87f22b37094a2ec50e370aec96a0
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/chase_db1.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'ChaseDB1Dataset'
+data_root = 'data/CHASE_DB1'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_scale = (960, 999)
+crop_size = (128, 128)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=40000,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..f21867c63e1835f6fceb61f066e802fd8fd2a735
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes.py
@@ -0,0 +1,54 @@
+# dataset settings
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (512, 1024)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2048, 1024),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=2,
+ workers_per_gpu=2,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='leftImg8bit/train',
+ ann_dir='gtFine/train',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='leftImg8bit/val',
+ ann_dir='gtFine/val',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='leftImg8bit/val',
+ ann_dir='gtFine/val',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes_769x769.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes_769x769.py
new file mode 100644
index 0000000000000000000000000000000000000000..336c7b254fe392b4703039fec86a83acdbd2e1a5
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/cityscapes_769x769.py
@@ -0,0 +1,35 @@
+_base_ = './cityscapes.py'
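+# inherits ./cityscapes.py; only the pipelines redefined below override the base config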
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (769, 769)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2049, 1025),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ train=dict(pipeline=train_pipeline),
+ val=dict(pipeline=test_pipeline),
+ test=dict(pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/drive.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/drive.py
new file mode 100644
index 0000000000000000000000000000000000000000..06e8ff606e0d2a4514ec8b7d2c6c436a32efcbf4
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/drive.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'DRIVEDataset'
+data_root = 'data/DRIVE'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_scale = (584, 565)
+crop_size = (64, 64)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=40000,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/hrf.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/hrf.py
new file mode 100644
index 0000000000000000000000000000000000000000..242d790eb1b83e75cf6b7eaa7a35c674099311ad
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/hrf.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'HRFDataset'
+data_root = 'data/HRF'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_scale = (2336, 3504)
+crop_size = (256, 256)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=40000,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff65bad1b86d7e3a5980bb5b9fc55798dc8df5f4
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context.py
@@ -0,0 +1,60 @@
+# dataset settings
+dataset_type = 'PascalContextDataset'
+data_root = 'data/VOCdevkit/VOC2010/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+img_scale = (520, 520)
+crop_size = (480, 480)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/train.txt',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/val.txt',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/val.txt',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context_59.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context_59.py
new file mode 100644
index 0000000000000000000000000000000000000000..37585abab89834b95cd5bdd993b994fca1db65f6
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_context_59.py
@@ -0,0 +1,60 @@
+# dataset settings
+dataset_type = 'PascalContextDataset59'
+data_root = 'data/VOCdevkit/VOC2010/'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+img_scale = (520, 520)
+crop_size = (480, 480)
+
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations', reduce_zero_label=True),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/train.txt',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/val.txt',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClassContext',
+ split='ImageSets/SegmentationContext/val.txt',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba1d42d0c5781f56dc177d860d856bb34adce555
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12.py
@@ -0,0 +1,57 @@
+# dataset settings
+dataset_type = 'PascalVOCDataset'
+data_root = 'data/VOCdevkit/VOC2012'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+crop_size = (512, 512)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=(2048, 512),
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img']),
+ ])
+]
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClass',
+ split='ImageSets/Segmentation/train.txt',
+ pipeline=train_pipeline),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClass',
+ split='ImageSets/Segmentation/val.txt',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='JPEGImages',
+ ann_dir='SegmentationClass',
+ split='ImageSets/Segmentation/val.txt',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f23b6717d53ad29f02dd15046802a2631a5076b
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py
@@ -0,0 +1,9 @@
+_base_ = './pascal_voc12.py'
+# dataset settings
+data = dict(
+ train=dict(
+ ann_dir=['SegmentationClass', 'SegmentationClassAug'],
+ split=[
+ 'ImageSets/Segmentation/train.txt',
+ 'ImageSets/Segmentation/aug.txt'
+ ]))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/stare.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/stare.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f71b25488cc11a6b4d582ac52b5a24e1ad1cf8e
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/datasets/stare.py
@@ -0,0 +1,59 @@
+# dataset settings
+dataset_type = 'STAREDataset'
+data_root = 'data/STARE'
+img_norm_cfg = dict(
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+img_scale = (605, 700)
+crop_size = (128, 128)
+train_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(type='LoadAnnotations'),
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+ dict(type='RandomFlip', prob=0.5),
+ dict(type='PhotoMetricDistortion'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+ dict(type='DefaultFormatBundle'),
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
+]
+test_pipeline = [
+ dict(type='LoadImageFromFile'),
+ dict(
+ type='MultiScaleFlipAug',
+ img_scale=img_scale,
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
+ flip=False,
+ transforms=[
+ dict(type='Resize', keep_ratio=True),
+ dict(type='RandomFlip'),
+ dict(type='Normalize', **img_norm_cfg),
+ dict(type='ImageToTensor', keys=['img']),
+ dict(type='Collect', keys=['img'])
+ ])
+]
+
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+ train=dict(
+ type='RepeatDataset',
+ times=40000,
+ dataset=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/training',
+ ann_dir='annotations/training',
+ pipeline=train_pipeline)),
+ val=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline),
+ test=dict(
+ type=dataset_type,
+ data_root=data_root,
+ img_dir='images/validation',
+ ann_dir='annotations/validation',
+ pipeline=test_pipeline))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/default_runtime.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/default_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..b564cc4e7e7d9a67dacaaddecb100e4d8f5c005b
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/default_runtime.py
@@ -0,0 +1,14 @@
+# yapf:disable
+log_config = dict(
+ interval=50,
+ hooks=[
+ dict(type='TextLoggerHook', by_epoch=False),
+ # dict(type='TensorboardLoggerHook')
+ ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+cudnn_benchmark = True
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2cb653827e44e6015b3b83bc578003e614a6aa1
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ann_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='ANNHead',
+ in_channels=[1024, 2048],
+ in_index=[2, 3],
+ channels=512,
+ project_channels=256,
+ query_scales=(1, ),
+ key_pool_scales=(1, 3, 6, 8),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8f5316cbcf3896ba9de7ca2c801eba512f01d5e
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/apcnet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='APCHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ pool_scales=(1, 2, 3, 6),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..794148f576b9e215c3c6963e73dffe98204b7717
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ccnet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='CCHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ recurrence=2,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/cgnet.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/cgnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..eff8d9458c877c5db894957e0b1b4597e40da6ab
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/cgnet.py
@@ -0,0 +1,35 @@
+# model settings
+norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='CGNet',
+ norm_cfg=norm_cfg,
+ in_channels=3,
+ num_channels=(32, 64, 128),
+ num_blocks=(3, 21),
+ dilations=(2, 4),
+ reductions=(8, 16)),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=256,
+ in_index=2,
+ channels=256,
+ num_convs=0,
+ concat_input=False,
+ dropout_ratio=0,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ loss_decode=dict(
+ type='CrossEntropyLoss',
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=[
+ 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
+ 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
+ 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
+ 10.396974, 10.055647
+ ])),
+ # model training and testing settings
+ train_cfg=dict(sampler=None),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c934939fac48525f22ad86f489a041dd7db7d09
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/danet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DAHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ pam_channels=64,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7a43bee01422ad4795dd27874e0cd4bb6cbfecf
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='ASPPHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dilations=(1, 12, 24, 36),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cd262999d8b2cb8e14a5c32190ae73f479d8e81
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3_unet_s5-d16.py
@@ -0,0 +1,50 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='UNet',
+ in_channels=3,
+ base_channels=64,
+ num_stages=5,
+ strides=(1, 1, 1, 1, 1),
+ enc_num_convs=(2, 2, 2, 2, 2),
+ dec_num_convs=(2, 2, 2, 2),
+ downsamples=(True, True, True, True),
+ enc_dilations=(1, 1, 1, 1, 1),
+ dec_dilations=(1, 1, 1, 1),
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=norm_cfg,
+ act_cfg=dict(type='ReLU'),
+ upsample_cfg=dict(type='InterpConv'),
+ norm_eval=False),
+ decode_head=dict(
+ type='ASPPHead',
+ in_channels=64,
+ in_index=4,
+ channels=16,
+ dilations=(1, 12, 24, 36),
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=128,
+ in_index=3,
+ channels=64,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide', crop_size=256, stride=170))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..050e39e091d816df9028d23aa3ecf9db74e441e1
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/deeplabv3plus_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DepthwiseSeparableASPPHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dilations=(1, 12, 24, 36),
+ c1_in_channels=256,
+ c1_channels=48,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..d22ba52640bebd805b3b8d07025e276dfb023759
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/dmnet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DMHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ filter_sizes=(1, 3, 5, 7),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..edb4c174c51e34c103737ba39bfc48bf831e561d
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/dnl_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='DNLHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dropout_ratio=0.1,
+ reduction=2,
+ use_scale=True,
+ mode='embedded_gaussian',
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..26adcd430926de0862204a71d345f2543167f27b
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/emanet_r50-d8.py
@@ -0,0 +1,47 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='EMAHead',
+ in_channels=2048,
+ in_index=3,
+ channels=256,
+ ema_channels=512,
+ num_bases=64,
+ num_stages=3,
+ momentum=0.1,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..be777123a886503172a95fe0719e956a147bbd68
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/encnet_r50-d8.py
@@ -0,0 +1,48 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='EncHead',
+ in_channels=[512, 1024, 2048],
+ in_index=(1, 2, 3),
+ channels=512,
+ num_codes=32,
+ use_se_loss=True,
+ add_lateral=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+ loss_se_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fast_scnn.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fast_scnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..32fdeb659355a5ce5ef2cc7c2f30742703811cdf
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fast_scnn.py
@@ -0,0 +1,57 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='FastSCNN',
+ downsample_dw_channels=(32, 48),
+ global_in_channels=64,
+ global_block_channels=(64, 96, 128),
+ global_block_strides=(2, 2, 1),
+ global_out_channels=128,
+ higher_in_channels=64,
+ lower_in_channels=128,
+ fusion_out_channels=128,
+ out_indices=(0, 1, 2),
+ norm_cfg=norm_cfg,
+ align_corners=False),
+ decode_head=dict(
+ type='DepthwiseSeparableFCNHead',
+ in_channels=128,
+ channels=128,
+ concat_input=False,
+ num_classes=19,
+ in_index=-1,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+ auxiliary_head=[
+ dict(
+ type='FCNHead',
+ in_channels=128,
+ channels=32,
+ num_convs=1,
+ num_classes=19,
+ in_index=-2,
+ norm_cfg=norm_cfg,
+ concat_input=False,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+ dict(
+ type='FCNHead',
+ in_channels=64,
+ channels=32,
+ num_convs=1,
+ num_classes=19,
+ in_index=-3,
+ norm_cfg=norm_cfg,
+ concat_input=False,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+ ],
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_hr18.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_hr18.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3e299bc89ada56ca14bbffcbdb08a586b8ed9e9
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_hr18.py
@@ -0,0 +1,52 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://msra/hrnetv2_w18',
+ backbone=dict(
+ type='HRNet',
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144)))),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=[18, 36, 72, 144],
+ in_index=(0, 1, 2, 3),
+ channels=sum([18, 36, 72, 144]),
+ input_transform='resize_concat',
+ kernel_size=1,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=-1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e98f6cc918b6146fc6d613c6918e825ef1355c3
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_r50-d8.py
@@ -0,0 +1,45 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ num_convs=2,
+ concat_input=True,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py
new file mode 100644
index 0000000000000000000000000000000000000000..a33e7972877f902d0e7d18401ca675e3e4e60a18
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fcn_unet_s5-d16.py
@@ -0,0 +1,51 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='UNet',
+ in_channels=3,
+ base_channels=64,
+ num_stages=5,
+ strides=(1, 1, 1, 1, 1),
+ enc_num_convs=(2, 2, 2, 2, 2),
+ dec_num_convs=(2, 2, 2, 2),
+ downsamples=(True, True, True, True),
+ enc_dilations=(1, 1, 1, 1, 1),
+ dec_dilations=(1, 1, 1, 1),
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=norm_cfg,
+ act_cfg=dict(type='ReLU'),
+ upsample_cfg=dict(type='InterpConv'),
+ norm_eval=False),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=64,
+ in_index=4,
+ channels=64,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=128,
+ in_index=3,
+ channels=64,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide', crop_size=256, stride=170))
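
The UNet config above sets test_cfg to mode='slide' (the pspnet_unet config later in this diff does the same), so inference runs on overlapping 256x256 crops taken every 170 pixels and the per-crop logits are merged back into a full-size prediction. A rough sketch of how the crop origins fall out of crop_size=256 and stride=170 (the helper below is illustrative only, not part of mmseg):

def slide_crop_origins(height, width, crop_size=256, stride=170):
    """Top-left corners a slide-mode test pass would visit (sketch)."""
    ys = list(range(0, max(height - crop_size, 0) + 1, stride))
    xs = list(range(0, max(width - crop_size, 0) + 1, stride))
    # make sure the last crop touches the bottom/right border
    if ys[-1] + crop_size < height:
        ys.append(height - crop_size)
    if xs[-1] + crop_size < width:
        xs.append(width - crop_size)
    return [(y, x) for y in ys for x in xs]

# a 512x512 input yields origins 0, 170 and 256 along each axis
print(slide_crop_origins(512, 512))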
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fpn_r50.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fpn_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..86ab327db92e44c14822d65f1c9277cb007f17c1
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fpn_r50.py
@@ -0,0 +1,36 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=4),
+ decode_head=dict(
+ type='FPNHead',
+ in_channels=[256, 256, 256, 256],
+ in_index=[0, 1, 2, 3],
+ feature_strides=[4, 8, 16, 32],
+ channels=128,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8aae98c5991055bfcc08e82ccdc09f8b1d9f8a8d
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/fpn_uniformer.py
@@ -0,0 +1,35 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='UniFormer',
+ embed_dim=[64, 128, 320, 512],
+ layers=[3, 4, 8, 3],
+ head_dim=64,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.1),
+ neck=dict(
+ type='FPN',
+ in_channels=[64, 128, 320, 512],
+ out_channels=256,
+ num_outs=4),
+ decode_head=dict(
+ type='FPNHead',
+ in_channels=[256, 256, 256, 256],
+ in_index=[0, 1, 2, 3],
+ feature_strides=[4, 8, 16, 32],
+ channels=128,
+ dropout_ratio=0.1,
+ num_classes=150,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole')
+)
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d2ad69f5c22adfe79d5fdabf920217628987166
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/gcnet_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='GCHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ ratio=1 / 4.,
+ pooling_type='att',
+ fusion_types=('channel_add', ),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..93258242a90695cc94a7c6bd41562d6a75988771
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py
@@ -0,0 +1,25 @@
+# model settings
+norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='MobileNetV3',
+ arch='large',
+ out_indices=(1, 3, 16),
+ norm_cfg=norm_cfg),
+ decode_head=dict(
+ type='LRASPPHead',
+ in_channels=(16, 24, 960),
+ in_index=(0, 1, 2),
+ channels=128,
+ input_transform='multiple_select',
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ act_cfg=dict(type='ReLU'),
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..5674a39854cafd1f2e363bac99c58ccae62f24da
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/nonlocal_r50-d8.py
@@ -0,0 +1,46 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='NLHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ dropout_ratio=0.1,
+ reduction=2,
+ use_scale=True,
+ mode='embedded_gaussian',
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py
new file mode 100644
index 0000000000000000000000000000000000000000..c60f62a7cdf3f5c5096a7a7e725e8268fddcb057
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_hr18.py
@@ -0,0 +1,68 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='CascadeEncoderDecoder',
+ num_stages=2,
+ pretrained='open-mmlab://msra/hrnetv2_w18',
+ backbone=dict(
+ type='HRNet',
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ extra=dict(
+ stage1=dict(
+ num_modules=1,
+ num_branches=1,
+ block='BOTTLENECK',
+ num_blocks=(4, ),
+ num_channels=(64, )),
+ stage2=dict(
+ num_modules=1,
+ num_branches=2,
+ block='BASIC',
+ num_blocks=(4, 4),
+ num_channels=(18, 36)),
+ stage3=dict(
+ num_modules=4,
+ num_branches=3,
+ block='BASIC',
+ num_blocks=(4, 4, 4),
+ num_channels=(18, 36, 72)),
+ stage4=dict(
+ num_modules=3,
+ num_branches=4,
+ block='BASIC',
+ num_blocks=(4, 4, 4, 4),
+ num_channels=(18, 36, 72, 144)))),
+ decode_head=[
+ dict(
+ type='FCNHead',
+ in_channels=[18, 36, 72, 144],
+ channels=sum([18, 36, 72, 144]),
+ in_index=(0, 1, 2, 3),
+ input_transform='resize_concat',
+ kernel_size=1,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=-1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ dict(
+ type='OCRHead',
+ in_channels=[18, 36, 72, 144],
+ in_index=(0, 1, 2, 3),
+ input_transform='resize_concat',
+ channels=512,
+ ocr_channels=256,
+ dropout_ratio=-1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ ],
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..615aa3ff703942b6c22b2d6e9642504dd3e41ebd
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/ocrnet_r50-d8.py
@@ -0,0 +1,47 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='CascadeEncoderDecoder',
+ num_stages=2,
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=[
+ dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ dict(
+ type='OCRHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ ocr_channels=256,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+ ],
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pointrend_r50.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pointrend_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d323dbf9466d41e0800aa57ef84045f3d874bdf
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pointrend_r50.py
@@ -0,0 +1,56 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='CascadeEncoderDecoder',
+ num_stages=2,
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ neck=dict(
+ type='FPN',
+ in_channels=[256, 512, 1024, 2048],
+ out_channels=256,
+ num_outs=4),
+ decode_head=[
+ dict(
+ type='FPNHead',
+ in_channels=[256, 256, 256, 256],
+ in_index=[0, 1, 2, 3],
+ feature_strides=[4, 8, 16, 32],
+ channels=128,
+ dropout_ratio=-1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='PointHead',
+ in_channels=[256],
+ in_index=[0],
+ channels=256,
+ num_fcs=3,
+ coarse_pred_each_layer=True,
+ dropout_ratio=-1,
+ num_classes=19,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+ ],
+ # model training and testing settings
+ train_cfg=dict(
+ num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75),
+ test_cfg=dict(
+ mode='whole',
+ subdivision_steps=2,
+ subdivision_num_points=8196,
+ scale_factor=2))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..689513fa9d2a40f14bf0ae4ae61f38f0dcc1b3da
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/psanet_r50-d8.py
@@ -0,0 +1,49 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='PSAHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ mask_size=(97, 97),
+ psa_type='bi-direction',
+ compact=False,
+ shrink_factor=2,
+ normalization_factor=1.0,
+ psa_softmax=True,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py
new file mode 100644
index 0000000000000000000000000000000000000000..f451e08ad2eb0732dcb806b1851eb978d4acf136
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pspnet_r50-d8.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 2, 4),
+ strides=(1, 2, 1, 1),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='PSPHead',
+ in_channels=2048,
+ in_index=3,
+ channels=512,
+ pool_scales=(1, 2, 3, 6),
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcff9ec4f41fad158344ecd77313dc14564f3682
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/pspnet_unet_s5-d16.py
@@ -0,0 +1,50 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='UNet',
+ in_channels=3,
+ base_channels=64,
+ num_stages=5,
+ strides=(1, 1, 1, 1, 1),
+ enc_num_convs=(2, 2, 2, 2, 2),
+ dec_num_convs=(2, 2, 2, 2),
+ downsamples=(True, True, True, True),
+ enc_dilations=(1, 1, 1, 1, 1),
+ dec_dilations=(1, 1, 1, 1),
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=norm_cfg,
+ act_cfg=dict(type='ReLU'),
+ upsample_cfg=dict(type='InterpConv'),
+ norm_eval=False),
+ decode_head=dict(
+ type='PSPHead',
+ in_channels=64,
+ in_index=4,
+ channels=16,
+ pool_scales=(1, 2, 3, 6),
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=128,
+ in_index=3,
+ channels=64,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=2,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='slide', crop_size=256, stride=170))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/upernet_r50.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/upernet_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..10974962fdd7136031fd06de1700f497d355ceaa
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/upernet_r50.py
@@ -0,0 +1,44 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained='open-mmlab://resnet50_v1c',
+ backbone=dict(
+ type='ResNetV1c',
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ decode_head=dict(
+ type='UPerHead',
+ in_channels=[256, 512, 1024, 2048],
+ in_index=[0, 1, 2, 3],
+ pool_scales=(1, 2, 3, 6),
+ channels=512,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=1024,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..41aa4db809dc6e2c508e98051f61807d07477903
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/models/upernet_uniformer.py
@@ -0,0 +1,43 @@
+# model settings
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ pretrained=None,
+ backbone=dict(
+ type='UniFormer',
+ embed_dim=[64, 128, 320, 512],
+ layers=[3, 4, 8, 3],
+ head_dim=64,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.1),
+ decode_head=dict(
+ type='UPerHead',
+ in_channels=[64, 128, 320, 512],
+ in_index=[0, 1, 2, 3],
+ pool_scales=(1, 2, 3, 6),
+ channels=512,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=dict(
+ type='FCNHead',
+ in_channels=320,
+ in_index=2,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
\ No newline at end of file
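
None of the files under configs/_base_/models is meant to be trained directly; in the mmseg config system an experiment config lists base files in _base_ and overrides only the fields that differ. A hedged sketch of such a derived config (the model and schedule bases match files added in this diff, while the dataset/runtime bases and the override values are purely illustrative):

# illustrative derived config; dataset/runtime bases are assumed to exist
_base_ = [
    '../_base_/models/upernet_uniformer.py',
    '../_base_/datasets/ade20k.py',        # assumed dataset base
    '../_base_/default_runtime.py',        # assumed runtime base
    '../_base_/schedules/schedule_160k.py',
]
# override only what differs from the base model config
model = dict(
    backbone=dict(drop_path_rate=0.25),
    decode_head=dict(num_classes=150),
    auxiliary_head=dict(num_classes=150),
)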
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py
new file mode 100644
index 0000000000000000000000000000000000000000..52603890b10f25faf8eec9f9e5a4468fae09b811
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_160k.py
@@ -0,0 +1,9 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=160000)
+checkpoint_config = dict(by_epoch=False, interval=16000)
+evaluation = dict(interval=16000, metric='mIoU')
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf780a1b6f6521833c6a5859675147824efa599d
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_20k.py
@@ -0,0 +1,9 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=20000)
+checkpoint_config = dict(by_epoch=False, interval=2000)
+evaluation = dict(interval=2000, metric='mIoU')
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdbf841abcb26eed87bf76ab816aff4bae0630ee
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_40k.py
@@ -0,0 +1,9 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=40000)
+checkpoint_config = dict(by_epoch=False, interval=4000)
+evaluation = dict(interval=4000, metric='mIoU')
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py
new file mode 100644
index 0000000000000000000000000000000000000000..c190cee6bdc7922b688ea75dc8f152fa15c24617
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/configs/_base_/schedules/schedule_80k.py
@@ -0,0 +1,9 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=80000)
+checkpoint_config = dict(by_epoch=False, interval=8000)
+evaluation = dict(interval=8000, metric='mIoU')
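
The four schedule files differ only in max_iters; the checkpoint and evaluation intervals are always max_iters / 10 and every other field is shared. If another iteration budget were ever needed, the pattern could be generated instead of copied; a small sketch (the helper name is made up):

def make_poly_schedule(max_iters):
    """Sketch reproducing the shared fields of the schedule_*k.py configs."""
    interval = max_iters // 10
    return dict(
        optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005),
        optimizer_config=dict(),
        lr_config=dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False),
        runner=dict(type='IterBasedRunner', max_iters=max_iters),
        checkpoint_config=dict(by_epoch=False, interval=interval),
        evaluation=dict(interval=interval, metric='mIoU'),
    )

# schedule_80k.py above corresponds to make_poly_schedule(80000)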
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/inference.py b/extensions/sd-webui-controlnet/annotator/uniformer/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..de5955984b2997e2dee5dbacbdc34816888b4f12
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/inference.py
@@ -0,0 +1,142 @@
+
+import torch
+
+try:
+ import mmcv as mmcv
+ from mmcv.parallel import collate, scatter
+ from mmcv.runner import load_checkpoint
+ from mmseg.datasets.pipelines import Compose
+ from mmseg.models import build_segmentor
+except ImportError:
+ import annotator.mmpkg.mmcv as mmcv
+ from annotator.mmpkg.mmcv.parallel import collate, scatter
+ from annotator.mmpkg.mmcv.runner import load_checkpoint
+ from annotator.mmpkg.mmseg.datasets.pipelines import Compose
+ from annotator.mmpkg.mmseg.models import build_segmentor
+
+def init_segmentor(config, checkpoint=None, device='cuda:0'):
+ """Initialize a segmentor from config file.
+
+ Args:
+ config (str or :obj:`mmcv.Config`): Config file path or the config
+ object.
+ checkpoint (str, optional): Checkpoint path. If left as None, the model
+ will not load any weights.
+ device (str, optional): CPU/CUDA device option. Default 'cuda:0'.
+ Use 'cpu' for loading model on CPU.
+ Returns:
+ nn.Module: The constructed segmentor.
+ """
+ if isinstance(config, str):
+ config = mmcv.Config.fromfile(config)
+ elif not isinstance(config, mmcv.Config):
+ raise TypeError('config must be a filename or Config object, '
+ 'but got {}'.format(type(config)))
+ config.model.pretrained = None
+ config.model.train_cfg = None
+ model = build_segmentor(config.model, test_cfg=config.get('test_cfg'))
+ if checkpoint is not None:
+ checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+ model.CLASSES = checkpoint['meta']['CLASSES']
+ model.PALETTE = checkpoint['meta']['PALETTE']
+ model.cfg = config # save the config in the model for convenience
+ model.to(device)
+ model.eval()
+ return model
+
+
+class LoadImage:
+ """A simple pipeline to load image."""
+
+ def __call__(self, results):
+ """Call function to load images into results.
+
+ Args:
+ results (dict): A result dict contains the file name
+ of the image to be read.
+
+ Returns:
+ dict: ``results`` will be returned containing loaded image.
+ """
+
+ if isinstance(results['img'], str):
+ results['filename'] = results['img']
+ results['ori_filename'] = results['img']
+ else:
+ results['filename'] = None
+ results['ori_filename'] = None
+ img = mmcv.imread(results['img'])
+ results['img'] = img
+ results['img_shape'] = img.shape
+ results['ori_shape'] = img.shape
+ return results
+
+
+def inference_segmentor(model, img):
+ """Inference image(s) with the segmentor.
+
+ Args:
+ model (nn.Module): The loaded segmentor.
+ img (str/ndarray or list[str/ndarray]): Either image files or loaded
+ images.
+
+ Returns:
+ (list[Tensor]): The segmentation result.
+ """
+ cfg = model.cfg
+ device = next(model.parameters()).device # model device
+ # build the data pipeline
+ test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:]
+ test_pipeline = Compose(test_pipeline)
+ # prepare data
+ data = dict(img=img)
+ data = test_pipeline(data)
+ data = collate([data], samples_per_gpu=1)
+ if next(model.parameters()).is_cuda:
+ # scatter to specified GPU
+ data = scatter(data, [device])[0]
+ else:
+ data['img_metas'] = [i.data[0] for i in data['img_metas']]
+
+ # forward the model
+ with torch.no_grad():
+ result = model(return_loss=False, rescale=True, **data)
+ return result
+
+
+def show_result_pyplot(model,
+ img,
+ result,
+ palette=None,
+ fig_size=(15, 10),
+ opacity=0.5,
+ title='',
+ block=True):
+ """Visualize the segmentation results on the image.
+
+ Args:
+ model (nn.Module): The loaded segmentor.
+ img (str or np.ndarray): Image filename or loaded image.
+ result (list): The segmentation result.
+ palette (list[list[int]] | None): The palette of segmentation
+ map. If None is given, random palette will be generated.
+ Default: None
+ fig_size (tuple): Figure size of the pyplot figure.
+ opacity (float): Opacity of painted segmentation map.
+ Default 0.5.
+ Must be in (0, 1] range.
+ title (str): The title of pyplot figure.
+ Default is ''.
+ block (bool): Whether to block the pyplot figure.
+ Default is True.
+ """
+ if hasattr(model, 'module'):
+ model = model.module
+ img = model.show_result(
+ img, result, palette=palette, show=False, opacity=opacity)
+ # plt.figure(figsize=fig_size)
+ # plt.imshow(mmcv.bgr2rgb(img))
+ # plt.title(title)
+ # plt.tight_layout()
+ # plt.show(block=block)
+ return mmcv.bgr2rgb(img)
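
Together, init_segmentor, inference_segmentor and show_result_pyplot cover the whole single-image path: build and load the model, run it on one image, and blend the prediction back onto the input. A usage sketch, assuming the extension root is on sys.path; the config and checkpoint paths are placeholders:

from annotator.uniformer.inference import (init_segmentor, inference_segmentor,
                                            show_result_pyplot)

config_file = 'exp/upernet_global_small/config.py'   # placeholder config path
checkpoint_file = 'upernet_global_small.pth'         # placeholder checkpoint path

model = init_segmentor(config_file, checkpoint_file, device='cuda:0')
result = inference_segmentor(model, 'demo.jpg')      # list with one H x W label map
vis = show_result_pyplot(model, 'demo.jpg', result)  # returns an RGB ndarray;
                                                     # the pyplot calls are commented out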
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/mmcv_custom/__init__.py b/extensions/sd-webui-controlnet/annotator/uniformer/mmcv_custom/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b958738b9fd93bfcec239c550df1d9a44b8c536
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/mmcv_custom/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+
+from .checkpoint import load_checkpoint
+
+__all__ = ['load_checkpoint']
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/mmcv_custom/checkpoint.py b/extensions/sd-webui-controlnet/annotator/uniformer/mmcv_custom/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..48c1b16b53107cb1301edf6cc07ccfe6f7010da6
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/mmcv_custom/checkpoint.py
@@ -0,0 +1,508 @@
+# Copyright (c) Open-MMLab. All rights reserved.
+import io
+import os
+import os.path as osp
+import pkgutil
+import time
+import warnings
+from collections import OrderedDict
+from importlib import import_module
+from tempfile import TemporaryDirectory
+
+import torch
+import torchvision
+from torch.optim import Optimizer
+from torch.utils import model_zoo
+from torch.nn import functional as F
+
+try:
+ import mmcv as mmcv
+ from mmcv.fileio import FileClient
+ from mmcv.fileio import load as load_file
+ from mmcv.parallel import is_module_wrapper
+ from mmcv.utils import mkdir_or_exist
+ from mmcv.runner import get_dist_info
+except ImportError:
+ import annotator.mmpkg.mmcv as mmcv
+ from annotator.mmpkg.mmcv.fileio import FileClient
+ from annotator.mmpkg.mmcv.fileio import load as load_file
+ from annotator.mmpkg.mmcv.parallel import is_module_wrapper
+ from annotator.mmpkg.mmcv.utils import mkdir_or_exist
+ from annotator.mmpkg.mmcv.runner import get_dist_info
+
+ENV_MMCV_HOME = 'MMCV_HOME'
+ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
+DEFAULT_CACHE_DIR = '~/.cache'
+
+
+def _get_mmcv_home():
+ mmcv_home = os.path.expanduser(
+ os.getenv(
+ ENV_MMCV_HOME,
+ os.path.join(
+ os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv')))
+
+ mkdir_or_exist(mmcv_home)
+ return mmcv_home
+
+
+def load_state_dict(module, state_dict, strict=False, logger=None):
+ """Load state_dict to a module.
+
+ This method is modified from :meth:`torch.nn.Module.load_state_dict`.
+ Default value for ``strict`` is set to ``False`` and the message for
+ param mismatch will be shown even if strict is False.
+
+ Args:
+ module (Module): Module that receives the state_dict.
+ state_dict (OrderedDict): Weights.
+ strict (bool): whether to strictly enforce that the keys
+ in :attr:`state_dict` match the keys returned by this module's
+ :meth:`~torch.nn.Module.state_dict` function. Default: ``False``.
+ logger (:obj:`logging.Logger`, optional): Logger to log the error
+ message. If not specified, print function will be used.
+ """
+ unexpected_keys = []
+ all_missing_keys = []
+ err_msg = []
+
+ metadata = getattr(state_dict, '_metadata', None)
+ state_dict = state_dict.copy()
+ if metadata is not None:
+ state_dict._metadata = metadata
+
+ # use _load_from_state_dict to enable checkpoint version control
+ def load(module, prefix=''):
+ # recursively check parallel module in case that the model has a
+ # complicated structure, e.g., nn.Module(nn.Module(DDP))
+ if is_module_wrapper(module):
+ module = module.module
+ local_metadata = {} if metadata is None else metadata.get(
+ prefix[:-1], {})
+ module._load_from_state_dict(state_dict, prefix, local_metadata, True,
+ all_missing_keys, unexpected_keys,
+ err_msg)
+ for name, child in module._modules.items():
+ if child is not None:
+ load(child, prefix + name + '.')
+
+ load(module)
+ load = None # break load->load reference cycle
+
+ # ignore "num_batches_tracked" of BN layers
+ missing_keys = [
+ key for key in all_missing_keys if 'num_batches_tracked' not in key
+ ]
+
+ if unexpected_keys:
+ err_msg.append('unexpected key in source '
+ f'state_dict: {", ".join(unexpected_keys)}\n')
+ if missing_keys:
+ err_msg.append(
+ f'missing keys in source state_dict: {", ".join(missing_keys)}\n')
+
+ rank, _ = get_dist_info()
+ if len(err_msg) > 0 and rank == 0:
+ err_msg.insert(
+ 0, 'The model and loaded state dict do not match exactly\n')
+ err_msg = '\n'.join(err_msg)
+ if strict:
+ raise RuntimeError(err_msg)
+ elif logger is not None:
+ logger.warning(err_msg)
+ else:
+ print(err_msg)
+
+
+def load_url_dist(url, model_dir=None):
+ """In distributed setting, this function only download checkpoint at local
+ rank 0."""
+ rank, world_size = get_dist_info()
+ rank = int(os.environ.get('LOCAL_RANK', rank))
+ if rank == 0:
+ checkpoint = model_zoo.load_url(url, model_dir=model_dir)
+ if world_size > 1:
+ torch.distributed.barrier()
+ if rank > 0:
+ checkpoint = model_zoo.load_url(url, model_dir=model_dir)
+ return checkpoint
+
+
+def load_pavimodel_dist(model_path, map_location=None):
+ """In distributed setting, this function only download checkpoint at local
+ rank 0."""
+ try:
+ from pavi import modelcloud
+ except ImportError:
+ raise ImportError(
+ 'Please install pavi to load checkpoint from modelcloud.')
+ rank, world_size = get_dist_info()
+ rank = int(os.environ.get('LOCAL_RANK', rank))
+ if rank == 0:
+ model = modelcloud.get(model_path)
+ with TemporaryDirectory() as tmp_dir:
+ downloaded_file = osp.join(tmp_dir, model.name)
+ model.download(downloaded_file)
+ checkpoint = torch.load(downloaded_file, map_location=map_location)
+ if world_size > 1:
+ torch.distributed.barrier()
+ if rank > 0:
+ model = modelcloud.get(model_path)
+ with TemporaryDirectory() as tmp_dir:
+ downloaded_file = osp.join(tmp_dir, model.name)
+ model.download(downloaded_file)
+ checkpoint = torch.load(
+ downloaded_file, map_location=map_location)
+ return checkpoint
+
+
+def load_fileclient_dist(filename, backend, map_location):
+ """In distributed setting, this function only download checkpoint at local
+ rank 0."""
+ rank, world_size = get_dist_info()
+ rank = int(os.environ.get('LOCAL_RANK', rank))
+ allowed_backends = ['ceph']
+ if backend not in allowed_backends:
+ raise ValueError(f'Load from Backend {backend} is not supported.')
+ if rank == 0:
+ fileclient = FileClient(backend=backend)
+ buffer = io.BytesIO(fileclient.get(filename))
+ checkpoint = torch.load(buffer, map_location=map_location)
+ if world_size > 1:
+ torch.distributed.barrier()
+ if rank > 0:
+ fileclient = FileClient(backend=backend)
+ buffer = io.BytesIO(fileclient.get(filename))
+ checkpoint = torch.load(buffer, map_location=map_location)
+ return checkpoint
+
+
+def get_torchvision_models():
+ model_urls = dict()
+ for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__):
+ if ispkg:
+ continue
+ _zoo = import_module(f'torchvision.models.{name}')
+ if hasattr(_zoo, 'model_urls'):
+ _urls = getattr(_zoo, 'model_urls')
+ model_urls.update(_urls)
+ return model_urls
+
+
+def get_external_models():
+ mmcv_home = _get_mmcv_home()
+ default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json')
+ default_urls = load_file(default_json_path)
+ assert isinstance(default_urls, dict)
+ external_json_path = osp.join(mmcv_home, 'open_mmlab.json')
+ if osp.exists(external_json_path):
+ external_urls = load_file(external_json_path)
+ assert isinstance(external_urls, dict)
+ default_urls.update(external_urls)
+
+ return default_urls
+
+
+def get_mmcls_models():
+ mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json')
+ mmcls_urls = load_file(mmcls_json_path)
+
+ return mmcls_urls
+
+
+def get_deprecated_model_names():
+ deprecate_json_path = osp.join(mmcv.__path__[0],
+ 'model_zoo/deprecated.json')
+ deprecate_urls = load_file(deprecate_json_path)
+ assert isinstance(deprecate_urls, dict)
+
+ return deprecate_urls
+
+
+def _process_mmcls_checkpoint(checkpoint):
+ state_dict = checkpoint['state_dict']
+ new_state_dict = OrderedDict()
+ for k, v in state_dict.items():
+ if k.startswith('backbone.'):
+ new_state_dict[k[9:]] = v
+ new_checkpoint = dict(state_dict=new_state_dict)
+
+ return new_checkpoint
+
+
+def _load_checkpoint(filename, map_location=None):
+ """Load checkpoint from somewhere (modelzoo, file, url).
+
+ Args:
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+ details.
+ map_location (str | None): Same as :func:`torch.load`. Default: None.
+
+ Returns:
+ dict | OrderedDict: The loaded checkpoint. It can be either an
+ OrderedDict storing model weights or a dict containing other
+ information, which depends on the checkpoint.
+ """
+ if filename.startswith('modelzoo://'):
+ warnings.warn('The URL scheme of "modelzoo://" is deprecated, please '
+ 'use "torchvision://" instead')
+ model_urls = get_torchvision_models()
+ model_name = filename[11:]
+ checkpoint = load_url_dist(model_urls[model_name])
+ elif filename.startswith('torchvision://'):
+ model_urls = get_torchvision_models()
+ model_name = filename[14:]
+ checkpoint = load_url_dist(model_urls[model_name])
+ elif filename.startswith('open-mmlab://'):
+ model_urls = get_external_models()
+ model_name = filename[13:]
+ deprecated_urls = get_deprecated_model_names()
+ if model_name in deprecated_urls:
+ warnings.warn(f'open-mmlab://{model_name} is deprecated in favor '
+ f'of open-mmlab://{deprecated_urls[model_name]}')
+ model_name = deprecated_urls[model_name]
+ model_url = model_urls[model_name]
+ # check if is url
+ if model_url.startswith(('http://', 'https://')):
+ checkpoint = load_url_dist(model_url)
+ else:
+ filename = osp.join(_get_mmcv_home(), model_url)
+ if not osp.isfile(filename):
+ raise IOError(f'{filename} is not a checkpoint file')
+ checkpoint = torch.load(filename, map_location=map_location)
+ elif filename.startswith('mmcls://'):
+ model_urls = get_mmcls_models()
+ model_name = filename[8:]
+ checkpoint = load_url_dist(model_urls[model_name])
+ checkpoint = _process_mmcls_checkpoint(checkpoint)
+ elif filename.startswith(('http://', 'https://')):
+ checkpoint = load_url_dist(filename)
+ elif filename.startswith('pavi://'):
+ model_path = filename[7:]
+ checkpoint = load_pavimodel_dist(model_path, map_location=map_location)
+ elif filename.startswith('s3://'):
+ checkpoint = load_fileclient_dist(
+ filename, backend='ceph', map_location=map_location)
+ else:
+ if not osp.isfile(filename):
+ raise IOError(f'{filename} is not a checkpoint file')
+ checkpoint = torch.load(filename, map_location=map_location)
+ return checkpoint
+
+
+def load_checkpoint(model,
+ filename,
+ map_location='cpu',
+ strict=False,
+ logger=None):
+ """Load checkpoint from a file or URI.
+
+ Args:
+ model (Module): Module to load checkpoint.
+ filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+ ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for
+ details.
+ map_location (str): Same as :func:`torch.load`.
+ strict (bool): Whether to strictly enforce that the keys in the
+ checkpoint's state_dict match the keys of the model.
+ logger (:mod:`logging.Logger` or None): The logger for error message.
+
+ Returns:
+ dict or OrderedDict: The loaded checkpoint.
+ """
+ checkpoint = _load_checkpoint(filename, map_location)
+ # OrderedDict is a subclass of dict
+ if not isinstance(checkpoint, dict):
+ raise RuntimeError(
+ f'No state_dict found in checkpoint file {filename}')
+ # get state_dict from checkpoint
+ if 'state_dict' in checkpoint:
+ state_dict = checkpoint['state_dict']
+ elif 'model' in checkpoint:
+ state_dict = checkpoint['model']
+ else:
+ state_dict = checkpoint
+ # strip prefix of state_dict
+ if list(state_dict.keys())[0].startswith('module.'):
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
+
+ # for MoBY, load model of online branch
+ if sorted(list(state_dict.keys()))[0].startswith('encoder'):
+ state_dict = {k.replace('encoder.', ''): v for k, v in state_dict.items() if k.startswith('encoder.')}
+
+ # reshape absolute position embedding
+ if state_dict.get('absolute_pos_embed') is not None:
+ absolute_pos_embed = state_dict['absolute_pos_embed']
+ N1, L, C1 = absolute_pos_embed.size()
+ N2, C2, H, W = model.absolute_pos_embed.size()
+ if N1 != N2 or C1 != C2 or L != H*W:
+ logger.warning("Error in loading absolute_pos_embed, pass")
+ else:
+ state_dict['absolute_pos_embed'] = absolute_pos_embed.view(N2, H, W, C2).permute(0, 3, 1, 2)
+
+ # interpolate position bias table if needed
+ relative_position_bias_table_keys = [k for k in state_dict.keys() if "relative_position_bias_table" in k]
+ for table_key in relative_position_bias_table_keys:
+ table_pretrained = state_dict[table_key]
+ table_current = model.state_dict()[table_key]
+ L1, nH1 = table_pretrained.size()
+ L2, nH2 = table_current.size()
+ if nH1 != nH2:
+ logger.warning(f"Error in loading {table_key}, pass")
+ else:
+ if L1 != L2:
+ S1 = int(L1 ** 0.5)
+ S2 = int(L2 ** 0.5)
+ table_pretrained_resized = F.interpolate(
+ table_pretrained.permute(1, 0).view(1, nH1, S1, S1),
+ size=(S2, S2), mode='bicubic')
+ state_dict[table_key] = table_pretrained_resized.view(nH2, L2).permute(1, 0)
+
+ # load state_dict
+ load_state_dict(model, state_dict, strict, logger)
+ return checkpoint
+
+
+def weights_to_cpu(state_dict):
+ """Copy a model state_dict to cpu.
+
+ Args:
+ state_dict (OrderedDict): Model weights on GPU.
+
+ Returns:
+ OrderedDict: Model weights on CPU.
+ """
+ state_dict_cpu = OrderedDict()
+ for key, val in state_dict.items():
+ state_dict_cpu[key] = val.cpu()
+ return state_dict_cpu
+
+
+def _save_to_state_dict(module, destination, prefix, keep_vars):
+ """Saves module state to `destination` dictionary.
+
+ This method is modified from :meth:`torch.nn.Module._save_to_state_dict`.
+
+ Args:
+ module (nn.Module): The module to generate state_dict.
+ destination (dict): A dict where state will be stored.
+ prefix (str): The prefix for parameters and buffers used in this
+ module.
+ keep_vars (bool): Whether to keep the variable property of the
+ parameters and buffers. If False, they are detached.
+ """
+ for name, param in module._parameters.items():
+ if param is not None:
+ destination[prefix + name] = param if keep_vars else param.detach()
+ for name, buf in module._buffers.items():
+ # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d
+ if buf is not None:
+ destination[prefix + name] = buf if keep_vars else buf.detach()
+
+
+def get_state_dict(module, destination=None, prefix='', keep_vars=False):
+ """Returns a dictionary containing a whole state of the module.
+
+ Both parameters and persistent buffers (e.g. running averages) are
+ included. Keys are corresponding parameter and buffer names.
+
+ This method is modified from :meth:`torch.nn.Module.state_dict` to
+ recursively check parallel module in case that the model has a complicated
+ structure, e.g., nn.Module(nn.Module(DDP)).
+
+ Args:
+ module (nn.Module): The module to generate state_dict.
+ destination (OrderedDict): Returned dict for the state of the
+ module.
+ prefix (str): Prefix of the key.
+ keep_vars (bool): Whether to keep the variable property of the
+ parameters. Default: False.
+
+ Returns:
+ dict: A dictionary containing a whole state of the module.
+ """
+ # recursively check parallel module in case that the model has a
+ # complicated structure, e.g., nn.Module(nn.Module(DDP))
+ if is_module_wrapper(module):
+ module = module.module
+
+ # below is the same as torch.nn.Module.state_dict()
+ if destination is None:
+ destination = OrderedDict()
+ destination._metadata = OrderedDict()
+ destination._metadata[prefix[:-1]] = local_metadata = dict(
+ version=module._version)
+ _save_to_state_dict(module, destination, prefix, keep_vars)
+ for name, child in module._modules.items():
+ if child is not None:
+ get_state_dict(
+ child, destination, prefix + name + '.', keep_vars=keep_vars)
+ for hook in module._state_dict_hooks.values():
+ hook_result = hook(module, destination, prefix, local_metadata)
+ if hook_result is not None:
+ destination = hook_result
+ return destination
+
+
+def save_checkpoint(model, filename, optimizer=None, meta=None):
+ """Save checkpoint to file.
+
+ The checkpoint will have 3 fields: ``meta``, ``state_dict`` and
+ ``optimizer``. By default ``meta`` will contain version and time info.
+
+ Args:
+ model (Module): Module whose params are to be saved.
+ filename (str): Checkpoint filename.
+ optimizer (:obj:`Optimizer`, optional): Optimizer to be saved.
+ meta (dict, optional): Metadata to be saved in checkpoint.
+ """
+ if meta is None:
+ meta = {}
+ elif not isinstance(meta, dict):
+ raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
+ meta.update(mmcv_version=mmcv.__version__, time=time.asctime())
+
+ if is_module_wrapper(model):
+ model = model.module
+
+ if hasattr(model, 'CLASSES') and model.CLASSES is not None:
+ # save class name to the meta
+ meta.update(CLASSES=model.CLASSES)
+
+ checkpoint = {
+ 'meta': meta,
+ 'state_dict': weights_to_cpu(get_state_dict(model))
+ }
+ # save optimizer state dict in the checkpoint
+ if isinstance(optimizer, Optimizer):
+ checkpoint['optimizer'] = optimizer.state_dict()
+ elif isinstance(optimizer, dict):
+ checkpoint['optimizer'] = {}
+ for name, optim in optimizer.items():
+ checkpoint['optimizer'][name] = optim.state_dict()
+
+ if filename.startswith('pavi://'):
+ try:
+ from pavi import modelcloud
+ from pavi.exception import NodeNotFoundError
+ except ImportError:
+ raise ImportError(
+ 'Please install pavi to load checkpoint from modelcloud.')
+ model_path = filename[7:]
+ root = modelcloud.Folder()
+ model_dir, model_name = osp.split(model_path)
+ try:
+ model = modelcloud.get(model_dir)
+ except NodeNotFoundError:
+ model = root.create_training_model(model_dir)
+ with TemporaryDirectory() as tmp_dir:
+ checkpoint_file = osp.join(tmp_dir, model_name)
+ with open(checkpoint_file, 'wb') as f:
+ torch.save(checkpoint, f)
+ f.flush()
+ model.create_file(checkpoint_file, name=model_name)
+ else:
+ mmcv.mkdir_or_exist(osp.dirname(filename))
+ # immediately flush buffer
+ with open(filename, 'wb') as f:
+ torch.save(checkpoint, f)
+ f.flush()
\ No newline at end of file
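
This customized load_checkpoint mirrors the stock mmcv routine but additionally strips 'module.' and MoBY 'encoder.' prefixes, reshapes the absolute position embedding and interpolates relative position bias tables, so classification weights can be reused for segmentation backbones at a different input resolution. A short sketch of the filename schemes dispatched by _load_checkpoint (the target module and the local path are placeholders):

import torchvision
from annotator.uniformer.mmcv_custom import load_checkpoint

# any nn.Module can be the target; a torchvision ResNet stands in here
model = torchvision.models.resnet50()

# the filename prefix selects the loading path (see _load_checkpoint above):
#   'checkpoints/resnet50.pth'   -> torch.load on a local file
#   'torchvision://resnet50'     -> torchvision model zoo (needs a torchvision
#                                   version that still exposes model_urls)
#   'open-mmlab://resnet50_v1c'  -> entry from open_mmlab.json under MMCV_HOME
checkpoint = load_checkpoint(model, 'checkpoints/resnet50.pth',  # placeholder path
                             map_location='cpu', strict=False)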
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/uniformer.py b/extensions/sd-webui-controlnet/annotator/uniformer/uniformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5726fbe63888e0d7a85563308ffd2ab526fed32
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/uniformer.py
@@ -0,0 +1,426 @@
+# --------------------------------------------------------
+# UniFormer
+# Copyright (c) 2022 SenseTime X-Lab
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Kunchang Li
+# --------------------------------------------------------
+
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+
+from functools import partial
+from collections import OrderedDict
+from timm.models.layers import DropPath, to_2tuple, trunc_normal_
+
+try:
+ from mmseg.utils import get_root_logger
+ from mmseg.models.builder import BACKBONES
+except ImportError:
+ from annotator.mmpkg.mmseg.utils import get_root_logger
+ from annotator.mmpkg.mmseg.models.builder import BACKBONES
+
+from annotator.uniformer.mmcv_custom import load_checkpoint
+
+
+class Mlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Linear(in_features, hidden_features)
+ self.act = act_layer()
+ self.fc2 = nn.Linear(hidden_features, out_features)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class CMlp(nn.Module):
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+ self.fc1 = nn.Conv2d(in_features, hidden_features, 1)
+ self.act = act_layer()
+ self.fc2 = nn.Conv2d(hidden_features, out_features, 1)
+ self.drop = nn.Dropout(drop)
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.drop(x)
+ x = self.fc2(x)
+ x = self.drop(x)
+ return x
+
+
+class CBlock(nn.Module):
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
+ self.norm1 = nn.BatchNorm2d(dim)
+ self.conv1 = nn.Conv2d(dim, dim, 1)
+ self.conv2 = nn.Conv2d(dim, dim, 1)
+ self.attn = nn.Conv2d(dim, dim, 5, padding=2, groups=dim)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = nn.BatchNorm2d(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = CMlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.pos_embed(x)
+ x = x + self.drop_path(self.conv2(self.attn(self.conv1(self.norm1(x)))))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ return x
+
+
+class Attention(nn.Module):
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
+ super().__init__()
+ self.num_heads = num_heads
+ head_dim = dim // num_heads
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
+ self.scale = qk_scale or head_dim ** -0.5
+
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+ self.attn_drop = nn.Dropout(attn_drop)
+ self.proj = nn.Linear(dim, dim)
+ self.proj_drop = nn.Dropout(proj_drop)
+
+ def forward(self, x):
+ B, N, C = x.shape
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
+
+ attn = (q @ k.transpose(-2, -1)) * self.scale
+ attn = attn.softmax(dim=-1)
+ attn = self.attn_drop(attn)
+
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+ x = self.proj(x)
+ x = self.proj_drop(x)
+ return x
+
+
+class SABlock(nn.Module):
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.pos_embed(x)
+ B, N, H, W = x.shape
+ x = x.flatten(2).transpose(1, 2)
+ x = x + self.drop_path(self.attn(self.norm1(x)))
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ x = x.transpose(1, 2).reshape(B, N, H, W)
+ return x
+
+
+def window_partition(x, window_size):
+ """
+ Args:
+ x: (B, H, W, C)
+ window_size (int): window size
+ Returns:
+ windows: (num_windows*B, window_size, window_size, C)
+ """
+ B, H, W, C = x.shape
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+ return windows
+
+
+def window_reverse(windows, window_size, H, W):
+ """
+ Args:
+ windows: (num_windows*B, window_size, window_size, C)
+ window_size (int): Window size
+ H (int): Height of image
+ W (int): Width of image
+ Returns:
+ x: (B, H, W, C)
+ """
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+ return x
+
+
+class SABlock_Windows(nn.Module):
+ def __init__(self, dim, num_heads, window_size=14, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
+ super().__init__()
+ self.window_size=window_size
+ self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
+ self.norm1 = norm_layer(dim)
+ self.attn = Attention(
+ dim,
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ attn_drop=attn_drop, proj_drop=drop)
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+ self.norm2 = norm_layer(dim)
+ mlp_hidden_dim = int(dim * mlp_ratio)
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+ def forward(self, x):
+ x = x + self.pos_embed(x)
+ x = x.permute(0, 2, 3, 1)
+ B, H, W, C = x.shape
+ shortcut = x
+ x = self.norm1(x)
+
+ pad_l = pad_t = 0
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+ _, Hp, Wp, _ = x.shape
+
+ x_windows = window_partition(x, self.window_size) # nW*B, window_size, window_size, C
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
+
+ # W-MSA/SW-MSA
+ attn_windows = self.attn(x_windows) # nW*B, window_size*window_size, C
+
+ # merge windows
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+ x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
+
+ # remove padding added for window partitioning
+ if pad_r > 0 or pad_b > 0:
+ x = x[:, :H, :W, :].contiguous()
+
+ x = shortcut + self.drop_path(x)
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+ x = x.permute(0, 3, 1, 2).reshape(B, C, H, W)
+ return x
+
+
+class PatchEmbed(nn.Module):
+ """ Image to Patch Embedding
+ """
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+ self.norm = nn.LayerNorm(embed_dim)
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, x):
+ B, _, H, W = x.shape
+ x = self.proj(x)
+ B, _, H, W = x.shape
+ x = x.flatten(2).transpose(1, 2)
+ x = self.norm(x)
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
+ return x
+
+
+@BACKBONES.register_module()
+class UniFormer(nn.Module):
+ """ Vision Transformer
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
+ https://arxiv.org/abs/2010.11929
+ """
+ def __init__(self, layers=[3, 4, 8, 3], img_size=224, in_chans=3, num_classes=80, embed_dim=[64, 128, 320, 512],
+ head_dim=64, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
+ drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ pretrained_path=None, use_checkpoint=False, checkpoint_num=[0, 0, 0, 0],
+ windows=False, hybrid=False, window_size=14):
+ """
+ Args:
+ layers (list): number of blocks in each stage
+ img_size (int, tuple): input image size
+ in_chans (int): number of input channels
+ num_classes (int): number of classes for classification head
+ embed_dim (list): embedding dimension of each stage
+ head_dim (int): dimension of attention heads
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+ qkv_bias (bool): enable bias for qkv if True
+ qk_scale (float): override default qk scale of head_dim ** -0.5 if set
+ representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
+ drop_rate (float): dropout rate
+ attn_drop_rate (float): attention dropout rate
+ drop_path_rate (float): stochastic depth rate
+ norm_layer (nn.Module): normalization layer
+ pretrained_path (str): path of pretrained model
+ use_checkpoint (bool): whether to use gradient checkpointing
+ checkpoint_num (list): number of blocks to checkpoint in each stage
+ windows (bool): whether to use window MHRA for all stage3 blocks
+ hybrid (bool): whether to use hybrid MHRA (window + global) in stage3
+ window_size (int): size of window (>14)
+ """
+ super().__init__()
+ self.num_classes = num_classes
+ self.use_checkpoint = use_checkpoint
+ self.checkpoint_num = checkpoint_num
+ self.windows = windows
+ print(f'Use Checkpoint: {self.use_checkpoint}')
+ print(f'Checkpoint Number: {self.checkpoint_num}')
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+
+ self.patch_embed1 = PatchEmbed(
+ img_size=img_size, patch_size=4, in_chans=in_chans, embed_dim=embed_dim[0])
+ self.patch_embed2 = PatchEmbed(
+ img_size=img_size // 4, patch_size=2, in_chans=embed_dim[0], embed_dim=embed_dim[1])
+ self.patch_embed3 = PatchEmbed(
+ img_size=img_size // 8, patch_size=2, in_chans=embed_dim[1], embed_dim=embed_dim[2])
+ self.patch_embed4 = PatchEmbed(
+ img_size=img_size // 16, patch_size=2, in_chans=embed_dim[2], embed_dim=embed_dim[3])
+
+ self.pos_drop = nn.Dropout(p=drop_rate)
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(layers))] # stochastic depth decay rule
+ num_heads = [dim // head_dim for dim in embed_dim]
+ self.blocks1 = nn.ModuleList([
+ CBlock(
+ dim=embed_dim[0], num_heads=num_heads[0], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
+ for i in range(layers[0])])
+        self.norm1 = norm_layer(embed_dim[0])
+ self.blocks2 = nn.ModuleList([
+ CBlock(
+ dim=embed_dim[1], num_heads=num_heads[1], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]], norm_layer=norm_layer)
+ for i in range(layers[1])])
+ self.norm2 = norm_layer(embed_dim[1])
+ if self.windows:
+ print('Use local window for all blocks in stage3')
+ self.blocks3 = nn.ModuleList([
+ SABlock_Windows(
+ dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)
+ for i in range(layers[2])])
+ elif hybrid:
+ print('Use hybrid window for blocks in stage3')
+ block3 = []
+ for i in range(layers[2]):
+ if (i + 1) % 4 == 0:
+ block3.append(SABlock(
+ dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer))
+ else:
+ block3.append(SABlock_Windows(
+ dim=embed_dim[2], num_heads=num_heads[2], window_size=window_size, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer))
+ self.blocks3 = nn.ModuleList(block3)
+ else:
+ print('Use global window for all blocks in stage3')
+ self.blocks3 = nn.ModuleList([
+ SABlock(
+ dim=embed_dim[2], num_heads=num_heads[2], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]], norm_layer=norm_layer)
+ for i in range(layers[2])])
+ self.norm3 = norm_layer(embed_dim[2])
+ self.blocks4 = nn.ModuleList([
+ SABlock(
+ dim=embed_dim[3], num_heads=num_heads[3], mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i+layers[0]+layers[1]+layers[2]], norm_layer=norm_layer)
+ for i in range(layers[3])])
+ self.norm4 = norm_layer(embed_dim[3])
+
+ # Representation layer
+ if representation_size:
+ self.num_features = representation_size
+ self.pre_logits = nn.Sequential(OrderedDict([
+                ('fc', nn.Linear(embed_dim[-1], representation_size)),  # embed_dim is a list here; use the final stage width
+ ('act', nn.Tanh())
+ ]))
+ else:
+ self.pre_logits = nn.Identity()
+
+ self.apply(self._init_weights)
+ self.init_weights(pretrained=pretrained_path)
+
+ def init_weights(self, pretrained):
+ if isinstance(pretrained, str):
+ logger = get_root_logger()
+ load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
+            print(f'Loaded pretrained model from {pretrained}')
+
+    def _init_weights(self, m):
+ if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=.02)
+ if isinstance(m, nn.Linear) and m.bias is not None:
+ nn.init.constant_(m.bias, 0)
+ elif isinstance(m, nn.LayerNorm):
+ nn.init.constant_(m.bias, 0)
+ nn.init.constant_(m.weight, 1.0)
+
+ @torch.jit.ignore
+ def no_weight_decay(self):
+ return {'pos_embed', 'cls_token'}
+
+ def get_classifier(self):
+ return self.head
+
+ def reset_classifier(self, num_classes, global_pool=''):
+ self.num_classes = num_classes
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+ def forward_features(self, x):
+ out = []
+ x = self.patch_embed1(x)
+ x = self.pos_drop(x)
+ for i, blk in enumerate(self.blocks1):
+ if self.use_checkpoint and i < self.checkpoint_num[0]:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ x_out = self.norm1(x.permute(0, 2, 3, 1))
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
+ x = self.patch_embed2(x)
+ for i, blk in enumerate(self.blocks2):
+ if self.use_checkpoint and i < self.checkpoint_num[1]:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ x_out = self.norm2(x.permute(0, 2, 3, 1))
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
+ x = self.patch_embed3(x)
+ for i, blk in enumerate(self.blocks3):
+ if self.use_checkpoint and i < self.checkpoint_num[2]:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ x_out = self.norm3(x.permute(0, 2, 3, 1))
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
+ x = self.patch_embed4(x)
+ for i, blk in enumerate(self.blocks4):
+ if self.use_checkpoint and i < self.checkpoint_num[3]:
+ x = checkpoint.checkpoint(blk, x)
+ else:
+ x = blk(x)
+ x_out = self.norm4(x.permute(0, 2, 3, 1))
+ out.append(x_out.permute(0, 3, 1, 2).contiguous())
+ return tuple(out)
+
+ def forward(self, x):
+ x = self.forward_features(x)
+ return x
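+
+
+# Illustrative note (not part of the upstream UniFormer code): the backbone
+# returns one feature map per stage. For a 3x512x512 input with the default
+# embed_dim=[64, 128, 320, 512], the outputs are roughly:
+#   stage 1: (B, 64, 128, 128)   stride 4
+#   stage 2: (B, 128, 64, 64)    stride 8
+#   stage 3: (B, 320, 32, 32)    stride 16
+#   stage 4: (B, 512, 16, 16)    stride 32
+# which is what the UPerNet config below consumes as decode_head in_channels.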
diff --git a/extensions/sd-webui-controlnet/annotator/uniformer/upernet_global_small.py b/extensions/sd-webui-controlnet/annotator/uniformer/upernet_global_small.py
new file mode 100644
index 0000000000000000000000000000000000000000..16b14768b80035b52a9a975af67c23c1c7693265
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/uniformer/upernet_global_small.py
@@ -0,0 +1,44 @@
+_base_ = [
+ 'configs/_base_/models/upernet_uniformer.py',
+ 'configs/_base_/datasets/ade20k.py',
+ 'configs/_base_/default_runtime.py',
+ 'configs/_base_/schedules/schedule_160k.py'
+]
+
+custom_imports = dict(
+ imports=['annotator.uniformer.uniformer'],
+ allow_failed_imports=False
+)
+
+model = dict(
+ backbone=dict(
+ type='UniFormer',
+ embed_dim=[64, 128, 320, 512],
+ layers=[3, 4, 8, 3],
+ head_dim=64,
+ drop_path_rate=0.25,
+ windows=False,
+ hybrid=False
+ ),
+ decode_head=dict(
+ in_channels=[64, 128, 320, 512],
+ num_classes=150
+ ),
+ auxiliary_head=dict(
+ in_channels=320,
+ num_classes=150
+ ))
+
+# AdamW optimizer, no weight decay for position embedding & layer norm in backbone
+optimizer = dict(_delete_=True, type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01,
+ paramwise_cfg=dict(custom_keys={'absolute_pos_embed': dict(decay_mult=0.),
+ 'relative_position_bias_table': dict(decay_mult=0.),
+ 'norm': dict(decay_mult=0.)}))
+
+lr_config = dict(_delete_=True, policy='poly',
+ warmup='linear',
+ warmup_iters=1500,
+ warmup_ratio=1e-6,
+ power=1.0, min_lr=0.0, by_epoch=False)
+
+data = dict(samples_per_gpu=2)
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/annotator/util.py b/extensions/sd-webui-controlnet/annotator/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cde937016b7a24b4081dc0565b53c16a87939d2
--- /dev/null
+++ b/extensions/sd-webui-controlnet/annotator/util.py
@@ -0,0 +1,34 @@
+import numpy as np
+import cv2
+
+
+def HWC3(x):
+ assert x.dtype == np.uint8
+ if x.ndim == 2:
+ x = x[:, :, None]
+ assert x.ndim == 3
+ H, W, C = x.shape
+ assert C == 1 or C == 3 or C == 4
+ if C == 3:
+ return x
+ if C == 1:
+ return np.concatenate([x, x, x], axis=2)
+ if C == 4:
+ color = x[:, :, 0:3].astype(np.float32)
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
+ y = color * alpha + 255.0 * (1.0 - alpha)
+ y = y.clip(0, 255).astype(np.uint8)
+ return y
+
+
+def resize_image(input_image, resolution):
+ H, W, C = input_image.shape
+ H = float(H)
+ W = float(W)
+ k = float(resolution) / min(H, W)
+ H *= k
+ W *= k
+ H = int(np.round(H / 64.0)) * 64
+ W = int(np.round(W / 64.0)) * 64
+ img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
+ return img
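+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (assumes a local test image "input.png"; purely
+    # illustrative, not used by the extension at runtime): normalize channels,
+    # then resize so both sides become multiples of 64.
+    demo = cv2.imread("input.png")
+    if demo is not None:
+        demo = HWC3(demo)
+        demo = resize_image(demo, 512)
+        print(demo.shape)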
diff --git a/extensions/sd-webui-controlnet/example/api_img2img.ipynb b/extensions/sd-webui-controlnet/example/api_img2img.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..07f244096c292be46c9d92c19c820633d6c1b41c
--- /dev/null
+++ b/extensions/sd-webui-controlnet/example/api_img2img.ipynb
@@ -0,0 +1,105 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# controlnet + img2img\n",
+ "# enable `Allow other script to control this extension` in settings\n",
+ "\n",
+ "import requests\n",
+ "import cv2\n",
+ "from base64 import b64encode\n",
+ "\n",
+ "def readImage(path):\n",
+ " img = cv2.imread(path)\n",
+ " retval, buffer = cv2.imencode('.jpg', img)\n",
+ " b64img = b64encode(buffer).decode(\"utf-8\")\n",
+ " return b64img\n",
+ "\n",
+ "b64img = readImage(\"/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg\")\n",
+ "\n",
+ "class controlnetRequest():\n",
+ " def __init__(self, prompt):\n",
+ " self.url = \"http://localhost:7860/controlnet/img2img\"\n",
+ " self.body = {\n",
+ " \"init_images\": [b64img],\n",
+ " \"prompt\": prompt,\n",
+ " \"negative_prompt\": \"\",\n",
+ " \"seed\": -1,\n",
+ " \"subseed\": -1,\n",
+ " \"subseed_strength\": 0,\n",
+ " \"batch_size\": 1,\n",
+ " \"n_iter\": 1,\n",
+ " \"steps\": 20,\n",
+ " \"cfg_scale\": 7,\n",
+ " \"width\": 512,\n",
+ " \"height\": 768,\n",
+ " \"restore_faces\": True,\n",
+ " \"eta\": 0,\n",
+ " \"sampler_index\": \"Euler a\",\n",
+ " \"controlnet_input_image\": [b64img],\n",
+ " \"controlnet_module\": 'canny',\n",
+ " \"controlnet_model\": 'control_canny-fp16 [e3fe7712]',\n",
+ " \"controlnet_guidance\": 1.0,\n",
+ " }\n",
+ "\n",
+ " def sendRequest(self):\n",
+ " r = requests.post(self.url, json=self.body)\n",
+ " return r.json()\n",
+ "\n",
+ "js = controlnetRequest(\"walter white\").sendRequest()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import io, base64\n",
+ "import matplotlib.pyplot as plt\n",
+ "from PIL import Image\n",
+ "\n",
+ "pil_img = Image.open('/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg')\n",
+ "image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][0])))\n",
+ "mask_image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][1])))\n",
+ "\n",
+ "plt.figure()\n",
+ "f, axarr = plt.subplots(1,3) \n",
+ "axarr[0].imshow(pil_img) \n",
+ "axarr[1].imshow(image) \n",
+ "axarr[2].imshow(mask_image) "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pynb",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/extensions/sd-webui-controlnet/example/api_txt2img.ipynb b/extensions/sd-webui-controlnet/example/api_txt2img.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..a52c8f4de2870a10d01a92ee0a0aa9cecfb038a1
--- /dev/null
+++ b/extensions/sd-webui-controlnet/example/api_txt2img.ipynb
@@ -0,0 +1,104 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# controlnet + txt2img\n",
+ "# enable `Allow other script to control this extension` in settings\n",
+ "\n",
+ "import requests\n",
+ "import cv2\n",
+ "from base64 import b64encode\n",
+ "\n",
+ "def readImage(path):\n",
+ " img = cv2.imread(path)\n",
+ " retval, buffer = cv2.imencode('.jpg', img)\n",
+ " b64img = b64encode(buffer).decode(\"utf-8\")\n",
+ " return b64img\n",
+ "\n",
+ "b64img = readImage(\"/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg\")\n",
+ "\n",
+ "class controlnetRequest():\n",
+ " def __init__(self, prompt):\n",
+ " self.url = \"http://localhost:7860/controlnet/txt2img\"\n",
+ " self.body = {\n",
+ " \"prompt\": prompt,\n",
+ " \"negative_prompt\": \"\",\n",
+ " \"seed\": -1,\n",
+ " \"subseed\": -1,\n",
+ " \"subseed_strength\": 0,\n",
+ " \"batch_size\": 1,\n",
+ " \"n_iter\": 1,\n",
+ " \"steps\": 15,\n",
+ " \"cfg_scale\": 7,\n",
+ " \"width\": 512,\n",
+ " \"height\": 768,\n",
+ " \"restore_faces\": True,\n",
+ " \"eta\": 0,\n",
+ " \"sampler_index\": \"Euler a\",\n",
+ " \"controlnet_input_image\": [b64img],\n",
+ " \"controlnet_module\": 'canny',\n",
+ " \"controlnet_model\": 'control_canny-fp16 [e3fe7712]',\n",
+ " \"controlnet_guidance\": 1.0,\n",
+ " }\n",
+ "\n",
+ " def sendRequest(self):\n",
+ " r = requests.post(self.url, json=self.body)\n",
+ " return r.json()\n",
+ "\n",
+ "js = controlnetRequest(\"walter white\").sendRequest()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import io, base64\n",
+ "import matplotlib.pyplot as plt\n",
+ "from PIL import Image\n",
+ "\n",
+ "pil_img = Image.open('/root/workspace/nahida/0e17302b9bfa15402f783c29c0d1d34f.jpg')\n",
+ "image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][0])))\n",
+ "mask_image = Image.open(io.BytesIO(base64.b64decode(js[\"images\"][1])))\n",
+ "\n",
+ "plt.figure()\n",
+ "f, axarr = plt.subplots(1,3) \n",
+ "axarr[0].imshow(pil_img) \n",
+ "axarr[1].imshow(image) \n",
+ "axarr[2].imshow(mask_image) "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "pynb",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.9"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "d73345514d8c18d9a1da7351d222dbd2834c7f4a09e728a0d1f4c4580fbec206"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/extensions/sd-webui-controlnet/extract_controlnet.py b/extensions/sd-webui-controlnet/extract_controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8752b534098b168c123ff253083d280f2aba907
--- /dev/null
+++ b/extensions/sd-webui-controlnet/extract_controlnet.py
@@ -0,0 +1,25 @@
+import argparse
+import torch
+from safetensors.torch import load_file, save_file
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--src", default=None, type=str, required=True, help="Path to the model to convert.")
+ parser.add_argument("--dst", default=None, type=str, required=True, help="Path to the output model.")
+ args = parser.parse_args()
+
+ assert args.src is not None, "Must provide a model path!"
+ assert args.dst is not None, "Must provide a checkpoint path!"
+
+ if args.src.endswith(".safetensors"):
+ state_dict = load_file(args.src)
+ else:
+ state_dict = torch.load(args.src)
+
+    if any(k.startswith("control_model.") for k in state_dict):
+ state_dict = {k.replace("control_model.", ""): v for k, v in state_dict.items() if k.startswith("control_model.")}
+
+ if args.dst.endswith(".safetensors"):
+ save_file(state_dict, args.dst)
+ else:
+ torch.save({"state_dict": state_dict}, args.dst)
diff --git a/extensions/sd-webui-controlnet/extract_controlnet_diff.py b/extensions/sd-webui-controlnet/extract_controlnet_diff.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc2ca118d7664a9990834e485dc2539d2ce45f24
--- /dev/null
+++ b/extensions/sd-webui-controlnet/extract_controlnet_diff.py
@@ -0,0 +1,91 @@
+import argparse
+import torch
+from safetensors.torch import load_file, save_file
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--sd15", default=None, type=str, required=True, help="Path to the original sd15.")
+ parser.add_argument("--control", default=None, type=str, required=True, help="Path to the sd15 with control.")
+ parser.add_argument("--dst", default=None, type=str, required=True, help="Path to the output difference model.")
+ parser.add_argument("--fp16", action="store_true", help="Save as fp16.")
+ parser.add_argument("--bf16", action="store_true", help="Save as bf16.")
+ args = parser.parse_args()
+
+    assert args.sd15 is not None, "Must provide an original sd15 model path!"
+    assert args.control is not None, "Must provide an sd15-with-control model path!"
+    assert args.dst is not None, "Must provide an output path!"
+
+ # make differences: copy from https://github.com/lllyasviel/ControlNet/blob/main/tool_transfer_control.py
+
+ def get_node_name(name, parent_name):
+ if len(name) <= len(parent_name):
+ return False, ''
+ p = name[:len(parent_name)]
+ if p != parent_name:
+ return False, ''
+ return True, name[len(parent_name):]
+
+ # remove first/cond stage from sd to reduce memory usage
+ def remove_first_and_cond(sd):
+ keys = list(sd.keys())
+ for key in keys:
+ is_first_stage, _ = get_node_name(key, 'first_stage_model')
+ is_cond_stage, _ = get_node_name(key, 'cond_stage_model')
+ if is_first_stage or is_cond_stage:
+ sd.pop(key, None)
+ return sd
+
+ print(f"loading: {args.sd15}")
+ if args.sd15.endswith(".safetensors"):
+ sd15_state_dict = load_file(args.sd15)
+ else:
+ sd15_state_dict = torch.load(args.sd15)
+ sd15_state_dict = sd15_state_dict.pop("state_dict", sd15_state_dict)
+ sd15_state_dict = remove_first_and_cond(sd15_state_dict)
+
+ print(f"loading: {args.control}")
+ if args.control.endswith(".safetensors"):
+ control_state_dict = load_file(args.control)
+ else:
+ control_state_dict = torch.load(args.control)
+ control_state_dict = remove_first_and_cond(control_state_dict)
+
+ # make diff of original and control
+ print(f"create difference")
+ keys = list(control_state_dict.keys())
+ final_state_dict = {"difference": torch.tensor(1.0)} # indicates difference
+ for key in keys:
+ p = control_state_dict.pop(key)
+
+ is_control, node_name = get_node_name(key, 'control_')
+ if not is_control:
+ continue
+
+ sd15_key_name = 'model.diffusion_' + node_name
+ if sd15_key_name in sd15_state_dict: # part of U-Net
+ # print("in sd15", key, sd15_key_name)
+ p_new = p - sd15_state_dict.pop(sd15_key_name)
+ if torch.max(torch.abs(p_new)) < 1e-6: # no difference?
+ print("no diff", key, sd15_key_name)
+ continue
+ else:
+ # print("not in sd15", key, sd15_key_name)
+ p_new = p # hint or zero_conv
+
+ final_state_dict[key] = p_new
+
+ save_dtype = None
+ if args.fp16:
+ save_dtype = torch.float16
+ elif args.bf16:
+ save_dtype = torch.bfloat16
+ if save_dtype is not None:
+ for key in final_state_dict.keys():
+ final_state_dict[key] = final_state_dict[key].to(save_dtype)
+
+ print("saving difference.")
+ if args.dst.endswith(".safetensors"):
+ save_file(final_state_dict, args.dst)
+ else:
+ torch.save({"state_dict": final_state_dict}, args.dst)
+ print("done!")
diff --git a/extensions/sd-webui-controlnet/install.py b/extensions/sd-webui-controlnet/install.py
new file mode 100644
index 0000000000000000000000000000000000000000..f60af764af3478ba51b529d9c89bb7fe4b057655
--- /dev/null
+++ b/extensions/sd-webui-controlnet/install.py
@@ -0,0 +1,10 @@
+import launch
+import os
+
+req_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "requirements.txt")
+
+with open(req_file) as file:
+ for lib in file:
+ lib = lib.strip()
+ if not launch.is_installed(lib):
+ launch.run_pip(f"install {lib}", f"sd-webui-controlnet requirement: {lib}")
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/models/cldm_v15.yaml b/extensions/sd-webui-controlnet/models/cldm_v15.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fde1825577acd46dc90d8d7c6730e22be762fccb
--- /dev/null
+++ b/extensions/sd-webui-controlnet/models/cldm_v15.yaml
@@ -0,0 +1,79 @@
+model:
+ target: cldm.cldm.ControlLDM
+ params:
+ linear_start: 0.00085
+ linear_end: 0.0120
+ num_timesteps_cond: 1
+ log_every_t: 200
+ timesteps: 1000
+ first_stage_key: "jpg"
+ cond_stage_key: "txt"
+ control_key: "hint"
+ image_size: 64
+ channels: 4
+ cond_stage_trainable: false
+ conditioning_key: crossattn
+ monitor: val/loss_simple_ema
+ scale_factor: 0.18215
+ use_ema: False
+ only_mid_control: False
+
+ control_stage_config:
+ target: cldm.cldm.ControlNet
+ params:
+ image_size: 32 # unused
+ in_channels: 4
+ hint_channels: 3
+ model_channels: 320
+ attention_resolutions: [ 4, 2, 1 ]
+ num_res_blocks: 2
+ channel_mult: [ 1, 2, 4, 4 ]
+ num_heads: 8
+ use_spatial_transformer: True
+ transformer_depth: 1
+ context_dim: 768
+ use_checkpoint: True
+ legacy: False
+
+ unet_config:
+ target: cldm.cldm.ControlledUnetModel
+ params:
+ image_size: 32 # unused
+ in_channels: 4
+ out_channels: 4
+ model_channels: 320
+ attention_resolutions: [ 4, 2, 1 ]
+ num_res_blocks: 2
+ channel_mult: [ 1, 2, 4, 4 ]
+ num_heads: 8
+ use_spatial_transformer: True
+ transformer_depth: 1
+ context_dim: 768
+ use_checkpoint: True
+ legacy: False
+
+ first_stage_config:
+ target: ldm.models.autoencoder.AutoencoderKL
+ params:
+ embed_dim: 4
+ monitor: val/rec_loss
+ ddconfig:
+ double_z: true
+ z_channels: 4
+ resolution: 256
+ in_channels: 3
+ out_ch: 3
+ ch: 128
+ ch_mult:
+ - 1
+ - 2
+ - 4
+ - 4
+ num_res_blocks: 2
+ attn_resolutions: []
+ dropout: 0.0
+ lossconfig:
+ target: torch.nn.Identity
+
+ cond_stage_config:
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
diff --git a/extensions/sd-webui-controlnet/models/cldm_v21.yaml b/extensions/sd-webui-controlnet/models/cldm_v21.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fc65193647e476e108fce5977f11250d55919106
--- /dev/null
+++ b/extensions/sd-webui-controlnet/models/cldm_v21.yaml
@@ -0,0 +1,85 @@
+model:
+ target: cldm.cldm.ControlLDM
+ params:
+ linear_start: 0.00085
+ linear_end: 0.0120
+ num_timesteps_cond: 1
+ log_every_t: 200
+ timesteps: 1000
+ first_stage_key: "jpg"
+ cond_stage_key: "txt"
+ control_key: "hint"
+ image_size: 64
+ channels: 4
+ cond_stage_trainable: false
+ conditioning_key: crossattn
+ monitor: val/loss_simple_ema
+ scale_factor: 0.18215
+ use_ema: False
+ only_mid_control: False
+
+ control_stage_config:
+ target: cldm.cldm.ControlNet
+ params:
+ use_checkpoint: True
+ image_size: 32 # unused
+ in_channels: 4
+ hint_channels: 3
+ model_channels: 320
+ attention_resolutions: [ 4, 2, 1 ]
+ num_res_blocks: 2
+ channel_mult: [ 1, 2, 4, 4 ]
+ num_head_channels: 64 # need to fix for flash-attn
+ use_spatial_transformer: True
+ use_linear_in_transformer: True
+ transformer_depth: 1
+ context_dim: 1024
+ legacy: False
+
+ unet_config:
+ target: cldm.cldm.ControlledUnetModel
+ params:
+ use_checkpoint: True
+ image_size: 32 # unused
+ in_channels: 4
+ out_channels: 4
+ model_channels: 320
+ attention_resolutions: [ 4, 2, 1 ]
+ num_res_blocks: 2
+ channel_mult: [ 1, 2, 4, 4 ]
+ num_head_channels: 64 # need to fix for flash-attn
+ use_spatial_transformer: True
+ use_linear_in_transformer: True
+ transformer_depth: 1
+ context_dim: 1024
+ legacy: False
+
+ first_stage_config:
+ target: ldm.models.autoencoder.AutoencoderKL
+ params:
+ embed_dim: 4
+ monitor: val/rec_loss
+ ddconfig:
+ #attn_type: "vanilla-xformers"
+ double_z: true
+ z_channels: 4
+ resolution: 256
+ in_channels: 3
+ out_ch: 3
+ ch: 128
+ ch_mult:
+ - 1
+ - 2
+ - 4
+ - 4
+ num_res_blocks: 2
+ attn_resolutions: []
+ dropout: 0.0
+ lossconfig:
+ target: torch.nn.Identity
+
+ cond_stage_config:
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+ params:
+ freeze: True
+ layer: "penultimate"
diff --git a/extensions/sd-webui-controlnet/models/image_adapter_v14.yaml b/extensions/sd-webui-controlnet/models/image_adapter_v14.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5
--- /dev/null
+++ b/extensions/sd-webui-controlnet/models/image_adapter_v14.yaml
@@ -0,0 +1,9 @@
+model:
+ target: tencentarc.t21_adapter
+ params:
+ channels: [320, 640, 1280, 1280]
+ nums_rb: 2
+ ksize: 1
+ sk: true
+ cin: 192
+ use_conv: false
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/models/sketch_adapter_v14.yaml b/extensions/sd-webui-controlnet/models/sketch_adapter_v14.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..686c5f172bf941ffaaee58b912245d6ffb36f4d3
--- /dev/null
+++ b/extensions/sd-webui-controlnet/models/sketch_adapter_v14.yaml
@@ -0,0 +1,9 @@
+model:
+ target: tencentarc.t21_adapter
+ params:
+ channels: [320, 640, 1280, 1280]
+ nums_rb: 2
+ ksize: 1
+ sk: true
+ cin: 64
+ use_conv: false
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/models/t2iadapter_color_sd14v1.yaml b/extensions/sd-webui-controlnet/models/t2iadapter_color_sd14v1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6780dd94ca6abfe58b4e3dcce1206b902fc3d540
--- /dev/null
+++ b/extensions/sd-webui-controlnet/models/t2iadapter_color_sd14v1.yaml
@@ -0,0 +1,6 @@
+model:
+ target: scripts.adapter.Adapter_light
+ params:
+ channels: [320, 640, 1280, 1280]
+ nums_rb: 4
+ cin: 192
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/models/t2iadapter_keypose_sd14v1.yaml b/extensions/sd-webui-controlnet/models/t2iadapter_keypose_sd14v1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..439d33cc53a349c9b8c1a0091cbd3643359216d5
--- /dev/null
+++ b/extensions/sd-webui-controlnet/models/t2iadapter_keypose_sd14v1.yaml
@@ -0,0 +1,9 @@
+model:
+ target: tencentarc.t21_adapter
+ params:
+ channels: [320, 640, 1280, 1280]
+ nums_rb: 2
+ ksize: 1
+ sk: true
+ cin: 192
+ use_conv: false
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/models/t2iadapter_style_sd14v1.yaml b/extensions/sd-webui-controlnet/models/t2iadapter_style_sd14v1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1f634fbe7e46b9e4057298af395e0a28ac1516cf
--- /dev/null
+++ b/extensions/sd-webui-controlnet/models/t2iadapter_style_sd14v1.yaml
@@ -0,0 +1,8 @@
+model:
+ target: scripts.adapter.StyleAdapter
+ params:
+ width: 1024
+ context_dim: 768
+ num_head: 8
+ n_layes: 3
+ num_token: 8
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/preload.py b/extensions/sd-webui-controlnet/preload.py
new file mode 100644
index 0000000000000000000000000000000000000000..868801e1c81028eb30114439fab1d421ce9de45d
--- /dev/null
+++ b/extensions/sd-webui-controlnet/preload.py
@@ -0,0 +1,3 @@
+def preload(parser):
+ parser.add_argument("--controlnet-dir", type=str, help="Path to directory with ControlNet models", default=None)
+ parser.add_argument("--no-half-controlnet", action='store_true', help="do not switch the ControlNet models to 16-bit floats (only needed without --no-half)", default=None)
diff --git a/extensions/sd-webui-controlnet/requirements.txt b/extensions/sd-webui-controlnet/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7567a8990a313e3c39564f208c4e4f7ff3a04116
--- /dev/null
+++ b/extensions/sd-webui-controlnet/requirements.txt
@@ -0,0 +1 @@
+svglib
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/samples/an-gen.png b/extensions/sd-webui-controlnet/samples/an-gen.png
new file mode 100644
index 0000000000000000000000000000000000000000..128292ec8e536e80e10a79e19b8b8dd234b3dd5f
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/an-gen.png differ
diff --git a/extensions/sd-webui-controlnet/samples/an-pose.png b/extensions/sd-webui-controlnet/samples/an-pose.png
new file mode 100644
index 0000000000000000000000000000000000000000..83b92e38fa105876be558aaf47ea16dc483ee8c1
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/an-pose.png differ
diff --git a/extensions/sd-webui-controlnet/samples/an-source.jpg b/extensions/sd-webui-controlnet/samples/an-source.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..01e2bddb52386de3b9e15890da1bc9c9ad2dfdaa
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/an-source.jpg differ
diff --git a/extensions/sd-webui-controlnet/samples/bal-gen.png b/extensions/sd-webui-controlnet/samples/bal-gen.png
new file mode 100644
index 0000000000000000000000000000000000000000..a3ac24285b27df4e2775fda6ff8405323efa99fd
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/bal-gen.png differ
diff --git a/extensions/sd-webui-controlnet/samples/bal-source.png b/extensions/sd-webui-controlnet/samples/bal-source.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f77950fb260156378e82a95354fab15176ccd0b
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/bal-source.png differ
diff --git a/extensions/sd-webui-controlnet/samples/cat_out-2.png b/extensions/sd-webui-controlnet/samples/cat_out-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b0e5509a68c741e6bd8222178ff1cef3e331696a
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/cat_out-2.png differ
diff --git a/extensions/sd-webui-controlnet/samples/cat_sk-2.png b/extensions/sd-webui-controlnet/samples/cat_sk-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..1cf583585aa874fb98a7a2167313c3e510a13c65
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/cat_sk-2.png differ
diff --git a/extensions/sd-webui-controlnet/samples/dog_out-2.png b/extensions/sd-webui-controlnet/samples/dog_out-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..a6781d6b1f44e742c944a1a89348d93c897105a0
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/dog_out-2.png differ
diff --git a/extensions/sd-webui-controlnet/samples/dog_rel.jpg b/extensions/sd-webui-controlnet/samples/dog_rel.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..78a6d812a6c9b0d10ef325e19fdd40b53d728569
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/dog_rel.jpg differ
diff --git a/extensions/sd-webui-controlnet/samples/dog_rel.png b/extensions/sd-webui-controlnet/samples/dog_rel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a67da581a1086fffdc03f289104e98d6bd123698
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/dog_rel.png differ
diff --git a/extensions/sd-webui-controlnet/samples/dog_sk-2.png b/extensions/sd-webui-controlnet/samples/dog_sk-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b272f79dabf656daf6f43cf8d74d419801aa0526
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/dog_sk-2.png differ
diff --git a/extensions/sd-webui-controlnet/samples/evt_gen.png b/extensions/sd-webui-controlnet/samples/evt_gen.png
new file mode 100644
index 0000000000000000000000000000000000000000..5d0dbf1c62b5b7cb0b76140ea890389105fc5789
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/evt_gen.png differ
diff --git a/extensions/sd-webui-controlnet/samples/evt_hed.png b/extensions/sd-webui-controlnet/samples/evt_hed.png
new file mode 100644
index 0000000000000000000000000000000000000000..fa7feb782fd4938f22afa31cdd55d9cb738113b1
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/evt_hed.png differ
diff --git a/extensions/sd-webui-controlnet/samples/evt_source.jpg b/extensions/sd-webui-controlnet/samples/evt_source.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0a21210a73535e56649948be7a92dcf9da15b47f
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/evt_source.jpg differ
diff --git a/extensions/sd-webui-controlnet/samples/fs_input.png b/extensions/sd-webui-controlnet/samples/fs_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ec7b353dd0d7f4612884d4ada75c5f08d7fbea0
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/fs_input.png differ
diff --git a/extensions/sd-webui-controlnet/samples/fs_output.png b/extensions/sd-webui-controlnet/samples/fs_output.png
new file mode 100644
index 0000000000000000000000000000000000000000..44717b870c27379e8d44ed12caee7c896c095b44
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/fs_output.png differ
diff --git a/extensions/sd-webui-controlnet/samples/kp_a-2.png b/extensions/sd-webui-controlnet/samples/kp_a-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..12a77f8e581201dfb46282da3724e4d99677c712
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/kp_a-2.png differ
diff --git a/extensions/sd-webui-controlnet/samples/kp_a2-2.png b/extensions/sd-webui-controlnet/samples/kp_a2-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..8db29aed95122542fb01dc786f3226a62d6e8e9e
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/kp_a2-2.png differ
diff --git a/extensions/sd-webui-controlnet/samples/kp_o-2.png b/extensions/sd-webui-controlnet/samples/kp_o-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2dbaa80589467b7644578fbcba3bd5df861bcaaa
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/kp_o-2.png differ
diff --git a/extensions/sd-webui-controlnet/samples/kp_o2-2.png b/extensions/sd-webui-controlnet/samples/kp_o2-2.png
new file mode 100644
index 0000000000000000000000000000000000000000..2c44f3ddc7f1bcca8a7df95bd0f8bea1d8f5a616
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/kp_o2-2.png differ
diff --git a/extensions/sd-webui-controlnet/samples/mahiro-out.png b/extensions/sd-webui-controlnet/samples/mahiro-out.png
new file mode 100644
index 0000000000000000000000000000000000000000..d1eb02503a54e300b60fe42cb147fbd9f0da2f8b
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/mahiro-out.png differ
diff --git a/extensions/sd-webui-controlnet/samples/mahiro_canny.png b/extensions/sd-webui-controlnet/samples/mahiro_canny.png
new file mode 100644
index 0000000000000000000000000000000000000000..318f4fae461389d67c9552d372b4aa0cc5997efa
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/mahiro_canny.png differ
diff --git a/extensions/sd-webui-controlnet/samples/mahiro_input.png b/extensions/sd-webui-controlnet/samples/mahiro_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ee95dde70fc94916b1506cb276604e5d12aea02
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/mahiro_input.png differ
diff --git a/extensions/sd-webui-controlnet/samples/nm-gen.png b/extensions/sd-webui-controlnet/samples/nm-gen.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce7321fe85e44caf95acef442a892ec1731f106e
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/nm-gen.png differ
diff --git a/extensions/sd-webui-controlnet/samples/nm-out.png b/extensions/sd-webui-controlnet/samples/nm-out.png
new file mode 100644
index 0000000000000000000000000000000000000000..75bb68c07eccba205ec4245be478e199a0f2b442
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/nm-out.png differ
diff --git a/extensions/sd-webui-controlnet/samples/nm-src.png b/extensions/sd-webui-controlnet/samples/nm-src.png
new file mode 100644
index 0000000000000000000000000000000000000000..125f13aa39f8b70c1aac987efd8ceb3c0147ea8e
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/nm-src.png differ
diff --git a/extensions/sd-webui-controlnet/samples/sk-b-dep.png b/extensions/sd-webui-controlnet/samples/sk-b-dep.png
new file mode 100644
index 0000000000000000000000000000000000000000..1896956386c1416cdf17b5be6e24995a122527aa
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/sk-b-dep.png differ
diff --git a/extensions/sd-webui-controlnet/samples/sk-b-out.png b/extensions/sd-webui-controlnet/samples/sk-b-out.png
new file mode 100644
index 0000000000000000000000000000000000000000..6bf4670c01c5e22e27fba2223ce51cf3931aa5de
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/sk-b-out.png differ
diff --git a/extensions/sd-webui-controlnet/samples/sk-b-src.png b/extensions/sd-webui-controlnet/samples/sk-b-src.png
new file mode 100644
index 0000000000000000000000000000000000000000..1bece79872480e2f5d60777688fbfe42e6215647
Binary files /dev/null and b/extensions/sd-webui-controlnet/samples/sk-b-src.png differ
diff --git a/extensions/sd-webui-controlnet/scripts/adapter.py b/extensions/sd-webui-controlnet/scripts/adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7809e8e28108359af345ecbb85c79c824a63a16
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/adapter.py
@@ -0,0 +1,397 @@
+
+
+import torch
+import torch.nn as nn
+import importlib
+from collections import OrderedDict
+
+from omegaconf import OmegaConf
+from copy import deepcopy
+from modules import devices, lowvram, shared, scripts
+from ldm.modules.diffusionmodules.util import timestep_embedding
+from ldm.modules.diffusionmodules.openaimodel import UNetModel
+
+
+class TorchHijackForUnet:
+ """
+ This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match;
+ this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64
+ """
+
+ def __getattr__(self, item):
+ if item == 'cat':
+ return self.cat
+
+ if hasattr(torch, item):
+ return getattr(torch, item)
+
+ raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item))
+
+ def cat(self, tensors, *args, **kwargs):
+ if len(tensors) == 2:
+ a, b = tensors
+ if a.shape[-2:] != b.shape[-2:]:
+ a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest")
+
+ tensors = (a, b)
+
+ return torch.cat(tensors, *args, **kwargs)
+
+
+th = TorchHijackForUnet()
+
+
+def align(hint, size):
+ b, c, h1, w1 = hint.shape
+ h, w = size
+ if h != h1 or w != w1:
+ hint = th.nn.functional.interpolate(hint, size=size, mode="nearest")
+ return hint
+
+
+def get_node_name(name, parent_name):
+ if len(name) <= len(parent_name):
+ return False, ''
+ p = name[:len(parent_name)]
+ if p != parent_name:
+ return False, ''
+ return True, name[len(parent_name):]
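+# For example (illustrative):
+#   get_node_name('control_model.input_blocks.0.0.weight', 'control_')
+#   -> (True, 'model.input_blocks.0.0.weight')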
+
+
+def get_obj_from_str(string, reload=False):
+ module, cls = string.rsplit(".", 1)
+ if reload:
+ module_imp = importlib.import_module(module)
+ importlib.reload(module_imp)
+ return getattr(importlib.import_module(module, package=None), cls)
+
+
+class PlugableAdapter(nn.Module):
+ def __init__(self, state_dict, config_path, lowvram=False, base_model=None) -> None:
+ super().__init__()
+ config = OmegaConf.load(config_path)
+ model = Adapter
+ try:
+ self.target = config.model.target
+ model = get_obj_from_str(config.model.target)
+ except ImportError:
+ pass
+
+ self.control_model = model(**config.model.params)
+ self.control_model.load_state_dict(state_dict)
+ self.lowvram = lowvram
+ self.control = None
+ self.hint_cond = None
+
+ if not self.lowvram:
+ self.control_model.to(devices.get_device_for("controlnet"))
+
+ def reset(self):
+ self.control = None
+ self.hint_cond = None
+
+ def forward(self, hint=None, *args, **kwargs):
+ if self.control is not None:
+ return deepcopy(self.control)
+
+ self.hint_cond = hint
+ hint_in = hint
+ if hasattr(self.control_model, 'conv_in') and self.control_model.conv_in.in_channels == 64:
+ hint_in = hint_in[0].unsqueeze(0).unsqueeze(0)
+ else:
+ hint_in = hint_in.unsqueeze(0)
+
+ self.control = self.control_model(hint_in)
+ return deepcopy(self.control)
+
+
+def conv_nd(dims, *args, **kwargs):
+ """
+ Create a 1D, 2D, or 3D convolution module.
+ """
+ if dims == 1:
+ return nn.Conv1d(*args, **kwargs)
+ elif dims == 2:
+ return nn.Conv2d(*args, **kwargs)
+ elif dims == 3:
+ return nn.Conv3d(*args, **kwargs)
+ raise ValueError(f"unsupported dimensions: {dims}")
+
+def avg_pool_nd(dims, *args, **kwargs):
+ """
+ Create a 1D, 2D, or 3D average pooling module.
+ """
+ if dims == 1:
+ return nn.AvgPool1d(*args, **kwargs)
+ elif dims == 2:
+ return nn.AvgPool2d(*args, **kwargs)
+ elif dims == 3:
+ return nn.AvgPool3d(*args, **kwargs)
+ raise ValueError(f"unsupported dimensions: {dims}")
+
+
+class Downsample(nn.Module):
+ """
+ A downsampling layer with an optional convolution.
+ :param channels: channels in the inputs and outputs.
+ :param use_conv: a bool determining if a convolution is applied.
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+ downsampling occurs in the inner-two dimensions.
+ """
+
+ def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
+ super().__init__()
+ self.channels = channels
+ self.out_channels = out_channels or channels
+ self.use_conv = use_conv
+ self.dims = dims
+ stride = 2 if dims != 3 else (1, 2, 2)
+ if use_conv:
+ self.op = conv_nd(
+ dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
+ )
+ else:
+ assert self.channels == self.out_channels
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+
+ def forward(self, x):
+ assert x.shape[1] == self.channels
+ return self.op(x)
+
+
+class ResnetBlock(nn.Module):
+ def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
+ super().__init__()
+ ps = ksize//2
+ if in_c != out_c or sk==False:
+ self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps)
+ else:
+ # print('n_in')
+ self.in_conv = None
+ self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
+ self.act = nn.ReLU()
+ self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps)
+ if sk==False:
+ self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps)
+ else:
+ # print('n_sk')
+ self.skep = None
+
+ self.down = down
+ if self.down == True:
+ self.down_opt = Downsample(in_c, use_conv=use_conv)
+
+ def forward(self, x):
+ if self.down == True:
+ x = self.down_opt(x)
+        if self.in_conv is not None: # edit
+            h = self.in_conv(x)
+        else:
+            h = x
+
+        h = self.block1(h)
+ h = self.act(h)
+ h = self.block2(h)
+ if self.skep is not None:
+ return h + self.skep(x)
+ else:
+ return h + x
+
+
+# NOTE: this second definition shadows the ResnetBlock above and is the one used at runtime.
+class ResnetBlock(nn.Module):
+ def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
+ super().__init__()
+ ps = ksize//2
+ if in_c != out_c or sk==False:
+ self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps)
+ else:
+ # print('n_in')
+ self.in_conv = None
+ self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
+ self.act = nn.ReLU()
+ self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps)
+ if sk==False:
+ self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps)
+ else:
+ self.skep = None
+
+ self.down = down
+ if self.down == True:
+ self.down_opt = Downsample(in_c, use_conv=use_conv)
+
+ def forward(self, x):
+ if self.down == True:
+ x = self.down_opt(x)
+ if self.in_conv is not None: # edit
+ x = self.in_conv(x)
+
+ h = self.block1(x)
+ h = self.act(h)
+ h = self.block2(h)
+ if self.skep is not None:
+ return h + self.skep(x)
+ else:
+ return h + x
+
+
+class Adapter(nn.Module):
+ def __init__(self, channels=[320, 640, 1280, 1280], nums_rb=3, cin=64, ksize=3, sk=False, use_conv=True):
+ super(Adapter, self).__init__()
+ self.unshuffle = nn.PixelUnshuffle(8)
+ self.channels = channels
+ self.nums_rb = nums_rb
+ self.body = []
+ for i in range(len(channels)):
+ for j in range(nums_rb):
+ if (i!=0) and (j==0):
+ self.body.append(ResnetBlock(channels[i-1], channels[i], down=True, ksize=ksize, sk=sk, use_conv=use_conv))
+ else:
+ self.body.append(ResnetBlock(channels[i], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv))
+ self.body = nn.ModuleList(self.body)
+ self.conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1)
+
+ def forward(self, x):
+ # unshuffle
+ x = self.unshuffle(x)
+ # extract features
+ features = []
+ x = self.conv_in(x)
+ for i in range(len(self.channels)):
+ for j in range(self.nums_rb):
+                idx = i * self.nums_rb + j
+ x = self.body[idx](x)
+ features.append(x)
+
+ return features
+
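+# Note (illustrative): Adapter.forward returns one feature map per stage, at
+# strides 8, 16, 32 and 64 relative to the hint image; with the default
+# channels=[320, 640, 1280, 1280] these match the SD U-Net encoder widths.
+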
+class LayerNorm(nn.LayerNorm):
+ """Subclass torch's LayerNorm to handle fp16."""
+
+ def forward(self, x: torch.Tensor):
+ orig_type = x.dtype
+ ret = super().forward(x.type(torch.float32))
+ return ret.type(orig_type)
+
+
+class QuickGELU(nn.Module):
+
+ def forward(self, x: torch.Tensor):
+ return x * torch.sigmoid(1.702 * x)
+
+
+class ResidualAttentionBlock(nn.Module):
+
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
+ super().__init__()
+
+ self.attn = nn.MultiheadAttention(d_model, n_head)
+ self.ln_1 = LayerNorm(d_model)
+ self.mlp = nn.Sequential(
+ OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()),
+ ("c_proj", nn.Linear(d_model * 4, d_model))]))
+ self.ln_2 = LayerNorm(d_model)
+ self.attn_mask = attn_mask
+
+ def attention(self, x: torch.Tensor):
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
+
+ def forward(self, x: torch.Tensor):
+ x = x + self.attention(self.ln_1(x))
+ x = x + self.mlp(self.ln_2(x))
+ return x
+
+
+class StyleAdapter(nn.Module):
+
+ def __init__(self, width=1024, context_dim=768, num_head=8, n_layes=3, num_token=4):
+ super().__init__()
+
+ scale = width ** -0.5
+ self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(width, num_head) for _ in range(n_layes)])
+ self.num_token = num_token
+ self.style_embedding = nn.Parameter(torch.randn(1, num_token, width) * scale)
+ self.ln_post = LayerNorm(width)
+ self.ln_pre = LayerNorm(width)
+ self.proj = nn.Parameter(scale * torch.randn(width, context_dim))
+
+ def forward(self, x):
+ # x shape [N, HW+1, C]
+ style_embedding = self.style_embedding + torch.zeros(
+ (x.shape[0], self.num_token, self.style_embedding.shape[-1]), device=x.device)
+
+ x = torch.cat([x, style_embedding], dim=1)
+ x = self.ln_pre(x)
+ x = x.permute(1, 0, 2) # NLD -> LND
+ x = self.transformer_layes(x)
+ x = x.permute(1, 0, 2) # LND -> NLD
+
+ x = self.ln_post(x[:, -self.num_token:, :])
+ x = x @ self.proj
+
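+        # shape note (illustrative): x is now [N, num_token, context_dim], i.e.
+        # a few learned style tokens projected into the cross-attention context space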
+ return x
+
+
+class ResnetBlock_light(nn.Module):
+ def __init__(self, in_c):
+ super().__init__()
+ self.block1 = nn.Conv2d(in_c, in_c, 3, 1, 1)
+ self.act = nn.ReLU()
+ self.block2 = nn.Conv2d(in_c, in_c, 3, 1, 1)
+
+ def forward(self, x):
+ h = self.block1(x)
+ h = self.act(h)
+ h = self.block2(h)
+
+ return h + x
+
+
+class extractor(nn.Module):
+ def __init__(self, in_c, inter_c, out_c, nums_rb, down=False):
+ super().__init__()
+ self.in_conv = nn.Conv2d(in_c, inter_c, 1, 1, 0)
+ self.body = []
+ for _ in range(nums_rb):
+ self.body.append(ResnetBlock_light(inter_c))
+ self.body = nn.Sequential(*self.body)
+ self.out_conv = nn.Conv2d(inter_c, out_c, 1, 1, 0)
+ self.down = down
+ if self.down == True:
+ self.down_opt = Downsample(in_c, use_conv=False)
+
+ def forward(self, x):
+ if self.down == True:
+ x = self.down_opt(x)
+ x = self.in_conv(x)
+ x = self.body(x)
+ x = self.out_conv(x)
+
+ return x
+
+
+class Adapter_light(nn.Module):
+ def __init__(self, channels=[320, 640, 1280, 1280], nums_rb=3, cin=64):
+ super(Adapter_light, self).__init__()
+ self.unshuffle = nn.PixelUnshuffle(8)
+ self.channels = channels
+ self.nums_rb = nums_rb
+ self.body = []
+ for i in range(len(channels)):
+ if i == 0:
+ self.body.append(extractor(in_c=cin, inter_c=channels[i]//4, out_c=channels[i], nums_rb=nums_rb, down=False))
+ else:
+ self.body.append(extractor(in_c=channels[i-1], inter_c=channels[i]//4, out_c=channels[i], nums_rb=nums_rb, down=True))
+ self.body = nn.ModuleList(self.body)
+
+ def forward(self, x):
+ # unshuffle
+ x = self.unshuffle(x)
+ # extract features
+ features = []
+ for i in range(len(self.channels)):
+ x = self.body[i](x)
+ features.append(x)
+
+ return features
diff --git a/extensions/sd-webui-controlnet/scripts/api.py b/extensions/sd-webui-controlnet/scripts/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..752fd46331940920b0c03636ecca53ab7a06f339
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/api.py
@@ -0,0 +1,330 @@
+from typing import Union
+
+import numpy as np
+from fastapi import FastAPI, Body
+from PIL import Image
+import copy
+import contextlib
+import pydantic
+import sys
+
+import gradio as gr
+
+from modules import ui
+from modules.api.models import *
+from modules.api import api
+from modules.processing import StableDiffusionProcessingTxt2Img, StableDiffusionProcessingImg2Img
+
+import modules.scripts as scripts
+
+from scripts import external_code
+from scripts.processor import *
+
+def to_base64_nparray(encoding: str):
+ return np.array(api.decode_base64_to_image(encoding)).astype('uint8')
+
+def encode_to_base64(image):
+ if type(image) is str:
+ return image
+ elif type(image) is Image.Image:
+ return api.encode_pil_to_base64(image)
+ elif type(image) is np.ndarray:
+ return encode_np_to_base64(image)
+ else:
+ return ""
+
+def encode_np_to_base64(image):
+ pil = Image.fromarray(image)
+ return api.encode_pil_to_base64(pil)
+
+cn_root_field_prefix = 'controlnet_'
+cn_fields = {
+ "input_image": (str, Field(default="", title='ControlNet Input Image')),
+ "mask": (str, Field(default="", title='ControlNet Input Mask')),
+ "module": (str, Field(default="none", title='Controlnet Module')),
+ "model": (str, Field(default="None", title='Controlnet Model')),
+ "weight": (float, Field(default=1.0, title='Controlnet Weight')),
+ "resize_mode": (Union[int, str], Field(default="Scale to Fit (Inner Fit)", title='Controlnet Resize Mode')),
+ "lowvram": (bool, Field(default=False, title='Controlnet Low VRAM')),
+ "processor_res": (int, Field(default=64, title='Controlnet Processor Res')),
+ "threshold_a": (float, Field(default=64, title='Controlnet Threshold a')),
+ "threshold_b": (float, Field(default=64, title='Controlnet Threshold b')),
+ "guidance": (float, Field(default=1.0, title='ControlNet Guidance Strength')),
+ "guidance_start": (float, Field(0.0, title='ControlNet Guidance Start')),
+ "guidance_end": (float, Field(1.0, title='ControlNet Guidance End')),
+ "guessmode": (bool, Field(default=True, title="Guess Mode")),
+}
+
+def get_deprecated_cn_field(field_name: str, field):
+ field_type, field = field
+ field = copy.copy(field)
+ field.default = None
+ field.extra['_deprecated'] = True
+ if field_name in ('input_image', 'mask'):
+ field_type = List[field_type]
+ return f'{cn_root_field_prefix}{field_name}', (field_type, field)
+
+def get_deprecated_field_default(field_name: str):
+ if field_name in ('input_image', 'mask'):
+ return []
+ return cn_fields[field_name][-1].default
+
+ControlNetUnitRequest = pydantic.create_model('ControlNetUnitRequest', **cn_fields)
+
+def create_controlnet_request_model(p_api_class):
+ class RequestModel(p_api_class):
+ class Config(p_api_class.__config__):
+ @staticmethod
+ def schema_extra(schema: dict, _):
+ props = {}
+ for k, v in schema.get('properties', {}).items():
+ if not v.get('_deprecated', False):
+ props[k] = v
+ if v.get('docs_default', None) is not None:
+ v['default'] = v['docs_default']
+ if props:
+ schema['properties'] = props
+
+ additional_fields = {
+ 'controlnet_units': (List[ControlNetUnitRequest], Field(default=[], docs_default=[ControlNetUnitRequest()], description="ControlNet Processing Units")),
+ **dict(get_deprecated_cn_field(k, v) for k, v in cn_fields.items())
+ }
+
+ return pydantic.create_model(
+ f'ControlNet{p_api_class.__name__}',
+ __base__=RequestModel,
+ **additional_fields)
+
+ControlNetTxt2ImgRequest = create_controlnet_request_model(StableDiffusionTxt2ImgProcessingAPI)
+ControlNetImg2ImgRequest = create_controlnet_request_model(StableDiffusionImg2ImgProcessingAPI)
+
+class ApiHijack(api.Api):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.add_api_route("/controlnet/txt2img", self.controlnet_txt2img, methods=["POST"], response_model=TextToImageResponse)
+ self.add_api_route("/controlnet/img2img", self.controlnet_img2img, methods=["POST"], response_model=ImageToImageResponse)
+
+ def controlnet_txt2img(self, txt2img_request: ControlNetTxt2ImgRequest):
+ return self.controlnet_any2img(
+ any2img_request=txt2img_request,
+ original_callback=ApiHijack.text2imgapi,
+ p_class=StableDiffusionProcessingTxt2Img,
+ script_runner=scripts.scripts_txt2img,
+ is_img2img=False,
+ )
+
+ def controlnet_img2img(self, img2img_request: ControlNetImg2ImgRequest):
+ return self.controlnet_any2img(
+ any2img_request=img2img_request,
+ original_callback=ApiHijack.img2imgapi,
+ p_class=StableDiffusionProcessingImg2Img,
+ script_runner=scripts.scripts_img2img,
+ is_img2img=True,
+ )
+
+ def controlnet_any2img(self, any2img_request, original_callback, p_class, script_runner, is_img2img):
+ any2img_request = nest_deprecated_cn_fields(any2img_request)
+ script_runner = create_cn_script_runner(script_runner, any2img_request.controlnet_units, is_img2img)
+ delattr(any2img_request, 'controlnet_units')
+ with self.queue_lock:
+ self_copy = copy.copy(self)
+ self_copy.queue_lock = contextlib.nullcontext()
+ with OverrideInit(p_class, scripts=script_runner):
+ return original_callback(self_copy, any2img_request)
+
+api.Api = ApiHijack
+
+class OverrideInit:
+ def __init__(self, cls, **kwargs):
+ self.cls = cls
+ self.kwargs = kwargs
+ self.original_init = None
+
+ def __enter__(self):
+ def init_hijack(p, *args, **kwargs):
+ self.original_init(p, *args, **kwargs)
+ for k, v in self.kwargs.items():
+ setattr(p, k, v)
+
+ self.original_init = self.cls.__init__
+ self.cls.__init__ = init_hijack
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self.cls.__init__ = self.original_init
+
+def nest_deprecated_cn_fields(any2img_request):
+ deprecated_cn_fields = {k: v for k, v in vars(any2img_request).items()
+ if k.startswith(cn_root_field_prefix) and k != 'controlnet_units'}
+
+ any2img_request = copy.copy(any2img_request)
+ for k in deprecated_cn_fields.keys():
+ delattr(any2img_request, k)
+
+ if all(v is None for v in deprecated_cn_fields.values()):
+ return any2img_request
+
+ warn_deprecated_cn_params()
+ deprecated_cn_fields = {k[len(cn_root_field_prefix):]: v for k, v in deprecated_cn_fields.items()}
+ for k, v in deprecated_cn_fields.items():
+ if v is None:
+ deprecated_cn_fields[k] = get_deprecated_field_default(k)
+
+ for k in ('input_image', 'mask'):
+ deprecated_cn_fields[k] = deprecated_cn_fields[k][0] if deprecated_cn_fields[k] else ""
+
+ any2img_request.controlnet_units.insert(0, ControlNetUnitRequest(**deprecated_cn_fields))
+ return any2img_request
+
+def create_cn_script_runner(script_runner: scripts.ScriptRunner, control_unit_requests: List[ControlNetUnitRequest], is_img2img: bool):
+ if not script_runner.scripts:
+ script_runner.initialize_scripts(False)
+ ui.create_ui()
+
+ cn_script = external_code.find_cn_script(script_runner)
+ cn_script_runner = copy.copy(script_runner)
+ cn_script_runner.alwayson_scripts = [cn_script]
+ cn_script_args = [None] * cn_script.args_from
+ cn_units = [to_api_cn_unit(control_unit_request) for control_unit_request in control_unit_requests]
+ external_code.update_cn_script_in_place(
+ script_runner=cn_script_runner,
+ script_args=cn_script_args,
+ cn_units=cn_units,
+ is_img2img=is_img2img,
+ )
+
+ def make_script_runner_f_hijack(fixed_original_f):
+ def script_runner_f_hijack(p, *args, **kwargs):
+ original_script_args = p.script_args
+ try:
+ p.script_args = cn_script_args
+ fixed_original_f(p, *args, **kwargs)
+ finally:
+ p.script_args = original_script_args
+
+ return script_runner_f_hijack
+
+ for k in ('process', 'process_batch', 'postprocess', 'postprocess_batch', 'postprocess_image'):
+ original_f = getattr(cn_script_runner, k, None)
+ if original_f is None:
+ continue
+
+ setattr(cn_script_runner, k, make_script_runner_f_hijack(original_f))
+
+ return cn_script_runner
+
+def to_api_cn_unit(unit_request: ControlNetUnitRequest) -> external_code.ControlNetUnit:
+ input_image = to_base64_nparray(unit_request.input_image) if unit_request.input_image else None
+ mask = to_base64_nparray(unit_request.mask) if unit_request.mask else None
+ if input_image is not None and mask is not None:
+ input_image = (input_image, mask)
+
+ if unit_request.guidance < 1.0:
+ unit_request.guidance_end = unit_request.guidance
+
+ return external_code.ControlNetUnit(
+ module=unit_request.module,
+ model=unit_request.model,
+ weight=unit_request.weight,
+ image=input_image,
+ resize_mode=unit_request.resize_mode,
+ low_vram=unit_request.lowvram,
+ processor_res=unit_request.processor_res,
+ threshold_a=unit_request.threshold_a,
+ threshold_b=unit_request.threshold_b,
+ guidance_start=unit_request.guidance_start,
+ guidance_end=unit_request.guidance_end,
+ guess_mode=unit_request.guessmode,
+ )
+
+def warn_deprecated_cn_params():
+ warning_prefix = '[ControlNet] warning: '
+ print(f"{warning_prefix}using deprecated '{cn_root_field_prefix}*' request params", file=sys.stderr)
+ print(f"{warning_prefix}consider using the 'control_units' request param instead", file=sys.stderr)
+
+def controlnet_api(_: gr.Blocks, app: FastAPI):
+ @app.get("/controlnet/model_list")
+ async def model_list():
+ up_to_date_model_list = external_code.get_models(update=True)
+ print(up_to_date_model_list)
+ return {"model_list": up_to_date_model_list}
+
+ @app.post("/controlnet/detect")
+ async def detect(
+ controlnet_module: str = Body("None", title='Controlnet Module'),
+ controlnet_input_images: List[str] = Body([], title='Controlnet Input Images'),
+ controlnet_processor_res: int = Body(512, title='Controlnet Processor Resolution'),
+ controlnet_threshold_a: float = Body(64, title='Controlnet Threshold a'),
+ controlnet_threshold_b: float = Body(64, title='Controlnet Threshold b')
+ ):
+
+ available_modules = [
+ "canny",
+ "depth",
+ "depth_leres",
+ "fake_scribble",
+ "hed",
+ "mlsd",
+ "normal_map",
+ "openpose",
+ "segmentation",
+ "binary",
+ "color"
+ ]
+
+ if controlnet_module not in available_modules:
+ return {"images": [], "info": "Module not available"}
+ if len(controlnet_input_images) == 0:
+ return {"images": [], "info": "No image selected"}
+
+ print(f"Detecting {str(len(controlnet_input_images))} images with the {controlnet_module} module.")
+
+ results = []
+
+ for input_image in controlnet_input_images:
+ img = to_base64_nparray(input_image)
+
+ if controlnet_module == "canny":
+ results.append(canny(img, controlnet_processor_res, controlnet_threshold_a, controlnet_threshold_b)[0])
+ elif controlnet_module == "hed":
+ results.append(hed(img, controlnet_processor_res)[0])
+ elif controlnet_module == "mlsd":
+ results.append(mlsd(img, controlnet_processor_res, controlnet_threshold_a, controlnet_threshold_b)[0])
+ elif controlnet_module == "depth":
+ results.append(midas(img, controlnet_processor_res, np.pi * 2.0)[0])
+ elif controlnet_module == "normal_map":
+ results.append(midas_normal(img, controlnet_processor_res, np.pi * 2.0, controlnet_threshold_a)[0])
+ elif controlnet_module == "depth_leres":
+ results.append(leres(img, controlnet_processor_res, np.pi * 2.0, controlnet_threshold_a, controlnet_threshold_b)[0])
+ elif controlnet_module == "openpose":
+ results.append(openpose(img, controlnet_processor_res, False)[0])
+ elif controlnet_module == "fake_scribble":
+ results.append(fake_scribble(img, controlnet_processor_res)[0])
+ elif controlnet_module == "segmentation":
+ results.append(uniformer(img, controlnet_processor_res)[0])
+ elif controlnet_module == "binary":
+ results.append(binary(img, controlnet_processor_res, controlnet_threshold_a)[0])
+ elif controlnet_module == "color":
+ results.append(color(img, controlnet_processor_res)[0])
+
+ if controlnet_module == "hed":
+ unload_hed()
+ elif controlnet_module == "mlsd":
+ unload_mlsd()
+ elif controlnet_module == "depth" or controlnet_module == "normal_map":
+ unload_midas()
+ elif controlnet_module == "depth_leres":
+ unload_leres()
+ elif controlnet_module == "openpose":
+ unload_openpose()
+ elif controlnet_module == "segmentation":
+ unload_uniformer()
+
+ results64 = list(map(encode_to_base64, results))
+ return {"images": results64, "info": "Success"}
+
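+# Rough client-side sketch for the detect endpoint above (host and port assumed to be the
+# webui defaults); the JSON keys mirror the Body(...) parameters and the response mirrors the
+# returned dict:
+#
+#     import requests
+#     r = requests.post("http://127.0.0.1:7860/controlnet/detect", json={
+#         "controlnet_module": "canny",
+#         "controlnet_input_images": [b64_image],
+#         "controlnet_processor_res": 512,
+#         "controlnet_threshold_a": 100,
+#         "controlnet_threshold_b": 200,
+#     })
+#     detected_maps = r.json()["images"]  # base64-encoded annotator results
+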
+try:
+ import modules.script_callbacks as script_callbacks
+
+ script_callbacks.on_app_started(controlnet_api)
+except:
+ pass
diff --git a/extensions/sd-webui-controlnet/scripts/cldm.py b/extensions/sd-webui-controlnet/scripts/cldm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8619f41d01142f16d93cb222d0df781144e87896
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/cldm.py
@@ -0,0 +1,389 @@
+import torch
+import torch.nn as nn
+from omegaconf import OmegaConf
+from modules import devices, lowvram, shared, scripts
+
+cond_cast_unet = getattr(devices, 'cond_cast_unet', lambda x: x)
+
+from ldm.util import exists
+from ldm.modules.attention import SpatialTransformer
+from ldm.modules.diffusionmodules.util import conv_nd, linear, zero_module, timestep_embedding
+from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
+
+
+class TorchHijackForUnet:
+ """
+ This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match;
+ this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64
+ """
+
+ def __getattr__(self, item):
+ if item == 'cat':
+ return self.cat
+
+ if hasattr(torch, item):
+ return getattr(torch, item)
+
+ raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item))
+
+ def cat(self, tensors, *args, **kwargs):
+ if len(tensors) == 2:
+ a, b = tensors
+ if a.shape[-2:] != b.shape[-2:]:
+ a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest")
+
+ tensors = (a, b)
+
+ return torch.cat(tensors, *args, **kwargs)
+
+
+th = TorchHijackForUnet()
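+
+# Note on the shape-tolerant `cat` above: if the two tensors' spatial sizes differ (as can
+# happen on U-Net skip connections when the latent size does not divide evenly at every level),
+# the first tensor is nearest-resized to the second tensor's height and width before torch.cat,
+# which is what allows image sizes that are multiples of 8 rather than 64.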
+
+
+def align(hint, size):
+ b, c, h1, w1 = hint.shape
+ h, w = size
+ if h != h1 or w != w1:
+ hint = th.nn.functional.interpolate(hint, size=size, mode="nearest")
+ return hint
+
+
+def get_node_name(name, parent_name):
+ if len(name) <= len(parent_name):
+ return False, ''
+ p = name[:len(parent_name)]
+ if p != parent_name:
+ return False, ''
+ return True, name[len(parent_name):]
+
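+# For example, get_node_name("control_model.input_blocks.0", "control_") returns
+# (True, "model.input_blocks.0"), while a name without the prefix returns (False, '').
+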
+
+class PlugableControlModel(nn.Module):
+ def __init__(self, state_dict, config_path, lowvram=False, base_model=None) -> None:
+ super().__init__()
+ config = OmegaConf.load(config_path)
+ self.control_model = ControlNet(**config.model.params.control_stage_config.params)
+
+ if any([k.startswith("control_model.") for k, v in state_dict.items()]):
+
+ is_diff_model = 'difference' in state_dict
+ transfer_ctrl_opt = shared.opts.data.get("control_net_control_transfer", False) and \
+ any([k.startswith("model.diffusion_model.") for k, v in state_dict.items()])
+
+ if (is_diff_model or transfer_ctrl_opt) and base_model is not None:
+ # apply transfer control - https://github.com/lllyasviel/ControlNet/blob/main/tool_transfer_control.py
+
+ unet_state_dict = base_model.state_dict()
+ unet_state_dict_keys = unet_state_dict.keys()
+ final_state_dict = {}
+ counter = 0
+ for key in state_dict.keys():
+ if not key.startswith("control_model."):
+ continue
+
+ p = state_dict[key]
+ is_control, node_name = get_node_name(key, 'control_')
+ key_name = node_name.replace("model.", "") if is_control else key
+
+ if key_name in unet_state_dict_keys:
+ if is_diff_model:
+ # transfer control by applying the difference computed in advance
+ p_new = p + unet_state_dict[key_name].clone().cpu()
+ else:
+ # transfer control by calculating offsets: delta = p + current_unet_encoder - frozen_unet_encoder
+ p_new = p + unet_state_dict[key_name].clone().cpu() - state_dict["model.diffusion_model."+key_name]
+ counter += 1
+ else:
+ p_new = p
+ final_state_dict[key] = p_new
+
+ print(f'Offset cloned: {counter} values')
+ state_dict = final_state_dict
+
+ state_dict = {k.replace("control_model.", ""): v for k, v in state_dict.items() if k.startswith("control_model.")}
+ else:
+ # assume the checkpoint was already prepared by the user
+ pass
+
+ self.control_model.load_state_dict(state_dict)
+ if not lowvram:
+ self.control_model.to(devices.get_device_for("controlnet"))
+
+ def reset(self):
+ pass
+
+ def forward(self, *args, **kwargs):
+ return self.control_model(*args, **kwargs)
+
+
+class ControlNet(nn.Module):
+ def __init__(
+ self,
+ image_size,
+ in_channels,
+ model_channels,
+ hint_channels,
+ num_res_blocks,
+ attention_resolutions,
+ dropout=0,
+ channel_mult=(1, 2, 4, 8),
+ conv_resample=True,
+ dims=2,
+ use_checkpoint=False,
+ use_fp16=False,
+ num_heads=-1,
+ num_head_channels=-1,
+ num_heads_upsample=-1,
+ use_scale_shift_norm=False,
+ resblock_updown=False,
+ use_new_attention_order=False,
+ use_spatial_transformer=False, # custom transformer support
+ transformer_depth=1, # custom transformer support
+ context_dim=None, # custom transformer support
+ # custom support for prediction of discrete ids into codebook of first stage vq model
+ n_embed=None,
+ legacy=True,
+ disable_self_attentions=None,
+ num_attention_blocks=None,
+ disable_middle_self_attn=False,
+ use_linear_in_transformer=False,
+ ):
+ use_fp16 = getattr(devices, 'dtype_unet', devices.dtype) == th.float16 and not getattr(shared.cmd_opts, "no_half_controlnet", False)
+
+ super().__init__()
+ if use_spatial_transformer:
+ assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
+
+ if context_dim is not None:
+ assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
+ from omegaconf.listconfig import ListConfig
+ if type(context_dim) == ListConfig:
+ context_dim = list(context_dim)
+
+ if num_heads_upsample == -1:
+ num_heads_upsample = num_heads
+
+ if num_heads == -1:
+ assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
+
+ if num_head_channels == -1:
+ assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
+
+ self.dims = dims
+ self.image_size = image_size
+ self.in_channels = in_channels
+ self.model_channels = model_channels
+ if isinstance(num_res_blocks, int):
+ self.num_res_blocks = len(channel_mult) * [num_res_blocks]
+ else:
+ if len(num_res_blocks) != len(channel_mult):
+ raise ValueError("provide num_res_blocks either as an int (globally constant) or "
+ "as a list/tuple (per-level) with the same length as channel_mult")
+ self.num_res_blocks = num_res_blocks
+ if disable_self_attentions is not None:
+ # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
+ assert len(disable_self_attentions) == len(channel_mult)
+ if num_attention_blocks is not None:
+ assert len(num_attention_blocks) == len(self.num_res_blocks)
+ assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(
+ len(num_attention_blocks))))
+ print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
+ f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
+ f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
+ f"attention will still not be set.")
+
+ self.attention_resolutions = attention_resolutions
+ self.dropout = dropout
+ self.channel_mult = channel_mult
+ self.conv_resample = conv_resample
+ self.use_checkpoint = use_checkpoint
+ self.dtype = th.float16 if use_fp16 else th.float32
+ self.num_heads = num_heads
+ self.num_head_channels = num_head_channels
+ self.num_heads_upsample = num_heads_upsample
+ self.predict_codebook_ids = n_embed is not None
+
+ time_embed_dim = model_channels * 4
+ self.time_embed = nn.Sequential(
+ linear(model_channels, time_embed_dim),
+ nn.SiLU(),
+ linear(time_embed_dim, time_embed_dim),
+ )
+
+ self.input_blocks = nn.ModuleList(
+ [
+ TimestepEmbedSequential(
+ conv_nd(dims, in_channels, model_channels, 3, padding=1)
+ )
+ ]
+ )
+ self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
+
+ self.input_hint_block = TimestepEmbedSequential(
+ conv_nd(dims, hint_channels, 16, 3, padding=1),
+ nn.SiLU(),
+ conv_nd(dims, 16, 16, 3, padding=1),
+ nn.SiLU(),
+ conv_nd(dims, 16, 32, 3, padding=1, stride=2),
+ nn.SiLU(),
+ conv_nd(dims, 32, 32, 3, padding=1),
+ nn.SiLU(),
+ conv_nd(dims, 32, 96, 3, padding=1, stride=2),
+ nn.SiLU(),
+ conv_nd(dims, 96, 96, 3, padding=1),
+ nn.SiLU(),
+ conv_nd(dims, 96, 256, 3, padding=1, stride=2),
+ nn.SiLU(),
+ zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
+ )
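+ # The three stride-2 convolutions above downsample the pixel-space hint by a factor of 8 so it
+ # matches the latent resolution, and the final zero-initialized conv is the standard ControlNet
+ # "zero conv", so the hint initially contributes nothing to the model output.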
+
+ self._feature_size = model_channels
+ input_block_chans = [model_channels]
+ ch = model_channels
+ ds = 1
+ for level, mult in enumerate(channel_mult):
+ for nr in range(self.num_res_blocks[level]):
+ layers = [
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=mult * model_channels,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ )
+ ]
+ ch = mult * model_channels
+ if ds in attention_resolutions:
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+ if exists(disable_self_attentions):
+ disabled_sa = disable_self_attentions[level]
+ else:
+ disabled_sa = False
+
+ if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
+ layers.append(
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads,
+ num_head_channels=dim_head,
+ use_new_attention_order=use_new_attention_order,
+ ) if not use_spatial_transformer else SpatialTransformer(
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+ disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
+ use_checkpoint=use_checkpoint
+ )
+ )
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
+ self.zero_convs.append(self.make_zero_conv(ch))
+ self._feature_size += ch
+ input_block_chans.append(ch)
+ if level != len(channel_mult) - 1:
+ out_ch = ch
+ self.input_blocks.append(
+ TimestepEmbedSequential(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ out_channels=out_ch,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ down=True,
+ )
+ if resblock_updown
+ else Downsample(
+ ch, conv_resample, dims=dims, out_channels=out_ch
+ )
+ )
+ )
+ ch = out_ch
+ input_block_chans.append(ch)
+ self.zero_convs.append(self.make_zero_conv(ch))
+ ds *= 2
+ self._feature_size += ch
+
+ if num_head_channels == -1:
+ dim_head = ch // num_heads
+ else:
+ num_heads = ch // num_head_channels
+ dim_head = num_head_channels
+ if legacy:
+ #num_heads = 1
+ dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
+ self.middle_block = TimestepEmbedSequential(
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ ),
+ AttentionBlock(
+ ch,
+ use_checkpoint=use_checkpoint,
+ num_heads=num_heads,
+ num_head_channels=dim_head,
+ use_new_attention_order=use_new_attention_order,
+ # always uses a self-attn
+ ) if not use_spatial_transformer else SpatialTransformer(
+ ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
+ disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
+ use_checkpoint=use_checkpoint
+ ),
+ ResBlock(
+ ch,
+ time_embed_dim,
+ dropout,
+ dims=dims,
+ use_checkpoint=use_checkpoint,
+ use_scale_shift_norm=use_scale_shift_norm,
+ ),
+ )
+ self.middle_block_out = self.make_zero_conv(ch)
+ self._feature_size += ch
+
+ def make_zero_conv(self, channels):
+ return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
+
+ def align(self, hint, h, w):
+ c, h1, w1 = hint.shape
+ if h != h1 or w != w1:
+ hint = align(hint.unsqueeze(0), (h, w))
+ return hint.squeeze(0)
+ return hint
+
+ def forward(self, x, hint, timesteps, context, **kwargs):
+ t_emb = cond_cast_unet(timestep_embedding(timesteps, self.model_channels, repeat_only=False))
+ emb = self.time_embed(t_emb)
+
+ guided_hint = self.input_hint_block(cond_cast_unet(hint), emb, context)
+ outs = []
+
+ h1, w1 = x.shape[-2:]
+ guided_hint = self.align(guided_hint, h1, w1)
+
+ h = x.type(self.dtype)
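+ # The guided hint is injected only once, after the first input block; every block's output
+ # then passes through its zero conv and is collected as a per-resolution residual.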
+ for module, zero_conv in zip(self.input_blocks, self.zero_convs):
+ if guided_hint is not None:
+ h = module(h, emb, context)
+ h += guided_hint
+ guided_hint = None
+ else:
+ h = module(h, emb, context)
+ outs.append(zero_conv(h, emb, context))
+
+ h = self.middle_block(h, emb, context)
+ outs.append(self.middle_block_out(h, emb, context))
+
+ return outs
diff --git a/extensions/sd-webui-controlnet/scripts/controlnet.py b/extensions/sd-webui-controlnet/scripts/controlnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6eb54a7355b4c2c9d4e0bbb70d59afcd40589787
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/controlnet.py
@@ -0,0 +1,915 @@
+import gc
+import os
+import stat
+from collections import OrderedDict
+from enum import Enum
+from typing import Union
+
+import torch
+
+import modules.scripts as scripts
+from modules import shared, devices, script_callbacks, processing, masking, images
+import gradio as gr
+import numpy as np
+
+from einops import rearrange
+from scripts.cldm import PlugableControlModel
+from scripts.processor import *
+from scripts.adapter import PlugableAdapter
+from scripts.utils import load_state_dict
+from scripts.hook import ControlParams, UnetHook
+from modules import sd_models
+from modules.paths import models_path
+from modules.processing import StableDiffusionProcessingImg2Img
+from modules.images import save_image
+from PIL import Image
+from torchvision.transforms import Resize, InterpolationMode, CenterCrop, Compose
+
+gradio_compat = True
+try:
+ from distutils.version import LooseVersion
+ from importlib_metadata import version
+ if LooseVersion(version("gradio")) < LooseVersion("3.10"):
+ gradio_compat = False
+except ImportError:
+ pass
+
+# svg support
+svgsupport = False
+try:
+ import io
+ import base64
+ from svglib.svglib import svg2rlg
+ from reportlab.graphics import renderPM
+ svgsupport = True
+except ImportError:
+ pass
+
+CN_MODEL_EXTS = [".pt", ".pth", ".ckpt", ".safetensors"]
+cn_models = OrderedDict() # "my_model [abcd1234]" -> C:/path/to/model.safetensors
+cn_models_names = {} # "my_model" -> "my_model [abcd1234]"
+cn_models_dir = os.path.join(models_path, "ControlNet")
+cn_models_dir_old = os.path.join(scripts.basedir(), "models")
+
+default_conf = os.path.join("models", "cldm_v15.yaml")
+default_conf_adapter = os.path.join("models", "sketch_adapter_v14.yaml")
+cn_detectedmap_dir = os.path.join("detected_maps")
+default_detectedmap_dir = cn_detectedmap_dir
+script_dir = scripts.basedir()
+
+os.makedirs(cn_models_dir, exist_ok=True)
+os.makedirs(cn_detectedmap_dir, exist_ok=True)
+
+refresh_symbol = '\U0001f504' # 🔄
+switch_values_symbol = '\U000021C5' # ⇅
+camera_symbol = '\U0001F4F7' # 📷
+reverse_symbol = '\U000021C4' # ⇄
+tossup_symbol = '\u2934'
+
+webcam_enabled = False
+webcam_mirrored = False
+
+PARAM_COUNT = 15
+
+
+class ToolButton(gr.Button, gr.components.FormComponent):
+ """Small button with single emoji as text, fits inside gradio forms"""
+
+ def __init__(self, **kwargs):
+ super().__init__(variant="tool", **kwargs)
+
+ def get_block_name(self):
+ return "button"
+
+
+def traverse_all_files(curr_path, model_list):
+ f_list = [(os.path.join(curr_path, entry.name), entry.stat())
+ for entry in os.scandir(curr_path)]
+ for f_info in f_list:
+ fname, fstat = f_info
+ if os.path.splitext(fname)[1] in CN_MODEL_EXTS:
+ model_list.append(f_info)
+ elif stat.S_ISDIR(fstat.st_mode):
+ model_list = traverse_all_files(fname, model_list)
+ return model_list
+
+
+def get_all_models(sort_by, filter_by, path):
+ res = OrderedDict()
+ fileinfos = traverse_all_files(path, [])
+ filter_by = filter_by.strip(" ")
+ if len(filter_by) != 0:
+ fileinfos = [x for x in fileinfos if filter_by.lower()
+ in os.path.basename(x[0]).lower()]
+ if sort_by == "name":
+ fileinfos = sorted(fileinfos, key=lambda x: os.path.basename(x[0]))
+ elif sort_by == "date":
+ fileinfos = sorted(fileinfos, key=lambda x: -x[1].st_mtime)
+ elif sort_by == "path name":
+ fileinfos = sorted(fileinfos)
+
+ for finfo in fileinfos:
+ filename = finfo[0]
+ name = os.path.splitext(os.path.basename(filename))[0]
+ # Prevent a hypothetical "None.pt" from being listed.
+ if name != "None":
+ res[name + f" [{sd_models.model_hash(filename)}]"] = filename
+
+ return res
+
+
+def find_closest_lora_model_name(search: str):
+ if not search:
+ return None
+ if search in cn_models:
+ return search
+ search = search.lower()
+ if search in cn_models_names:
+ return cn_models_names.get(search)
+ applicable = [name for name in cn_models_names.keys()
+ if search in name.lower()]
+ if not applicable:
+ return None
+ applicable = sorted(applicable, key=lambda name: len(name))
+ return cn_models_names[applicable[0]]
+
+
+def swap_img2img_pipeline(p: processing.StableDiffusionProcessingImg2Img):
+ p.__class__ = processing.StableDiffusionProcessingTxt2Img
+ dummy = processing.StableDiffusionProcessingTxt2Img()
+ for k,v in dummy.__dict__.items():
+ if hasattr(p, k):
+ continue
+ setattr(p, k, v)
+
+
+def update_cn_models():
+ cn_models.clear()
+ ext_dirs = (shared.opts.data.get("control_net_models_path", None), getattr(shared.cmd_opts, 'controlnet_dir', None))
+ extra_lora_paths = (extra_lora_path for extra_lora_path in ext_dirs
+ if extra_lora_path is not None and os.path.exists(extra_lora_path))
+ paths = [cn_models_dir, cn_models_dir_old, *extra_lora_paths]
+
+ for path in paths:
+ sort_by = shared.opts.data.get(
+ "control_net_models_sort_models_by", "name")
+ filter_by = shared.opts.data.get("control_net_models_name_filter", "")
+ found = get_all_models(sort_by, filter_by, path)
+ cn_models.update({**found, **cn_models})
+
+ # insert "None" at the beginning of `cn_models` in-place
+ cn_models_copy = OrderedDict(cn_models)
+ cn_models.clear()
+ cn_models.update({**{"None": None}, **cn_models_copy})
+
+ cn_models_names.clear()
+ for name_and_hash, filename in cn_models.items():
+ if filename is None:
+ continue
+ name = os.path.splitext(os.path.basename(filename))[0].lower()
+ cn_models_names[name] = name_and_hash
+
+
+update_cn_models()
+
+
+class ResizeMode(Enum):
+ RESIZE = "Just Resize"
+ INNER_FIT = "Scale to Fit (Inner Fit)"
+ OUTER_FIT = "Envelope (Outer Fit)"
+
+
+def resize_mode_from_value(value: Union[str, int, ResizeMode]) -> ResizeMode:
+ if isinstance(value, str):
+ return ResizeMode(value)
+ elif isinstance(value, int):
+ return [e for e in ResizeMode][value]
+ else:
+ return value
+
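+# For example, resize_mode_from_value(1) yields ResizeMode.INNER_FIT and
+# resize_mode_from_value("Just Resize") yields ResizeMode.RESIZE.
+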
+class Script(scripts.Script):
+ model_cache = OrderedDict()
+
+ def __init__(self) -> None:
+ super().__init__()
+ self.latest_network = None
+ self.preprocessor = {
+ "none": lambda x, *args, **kwargs: (x, True),
+ "canny": canny,
+ "depth": midas,
+ "depth_leres": leres,
+ "hed": hed,
+ "mlsd": mlsd,
+ "normal_map": midas_normal,
+ "openpose": openpose,
+ "openpose_hand": openpose_hand,
+ "clip_vision": clip,
+ "color": color,
+ "pidinet": pidinet,
+ "scribble": simple_scribble,
+ "fake_scribble": fake_scribble,
+ "segmentation": uniformer,
+ "binary": binary,
+ }
+ self.unloadable = {
+ "hed": unload_hed,
+ "fake_scribble": unload_hed,
+ "mlsd": unload_mlsd,
+ "clip": unload_clip,
+ "depth": unload_midas,
+ "depth_leres": unload_leres,
+ "normal_map": unload_midas,
+ "pidinet": unload_pidinet,
+ "openpose": unload_openpose,
+ "openpose_hand": unload_openpose,
+ "segmentation": unload_uniformer,
+ }
+ self.input_image = None
+ self.latest_model_hash = ""
+ self.txt2img_w_slider = gr.Slider()
+ self.txt2img_h_slider = gr.Slider()
+ self.img2img_w_slider = gr.Slider()
+ self.img2img_h_slider = gr.Slider()
+
+ def title(self):
+ return "ControlNet"
+
+ def show(self, is_img2img):
+ # if is_img2img:
+ # return False
+ return scripts.AlwaysVisible
+
+ def after_component(self, component, **kwargs):
+ if component.elem_id == "txt2img_width":
+ self.txt2img_w_slider = component
+ return self.txt2img_w_slider
+ if component.elem_id == "txt2img_height":
+ self.txt2img_h_slider = component
+ return self.txt2img_h_slider
+ if component.elem_id == "img2img_width":
+ self.img2img_w_slider = component
+ return self.img2img_w_slider
+ if component.elem_id == "img2img_height":
+ self.img2img_h_slider = component
+ return self.img2img_h_slider
+
+ def get_threshold_block(self, proc):
+ pass
+
+ def uigroup(self, is_img2img):
+ ctrls = ()
+ infotext_fields = []
+ with gr.Row():
+ input_image = gr.Image(source='upload', mirror_webcam=False, type='numpy', tool='sketch')
+ generated_image = gr.Image(label="Annotator result", visible=False)
+
+ with gr.Row():
+ gr.HTML(value='<p>Invert colors if your image has white background.<br>Change your brush width to make it thinner if you want to draw something.</p>')
+ webcam_enable = ToolButton(value=camera_symbol)
+ webcam_mirror = ToolButton(value=reverse_symbol)
+ send_dimen_button = ToolButton(value=tossup_symbol)
+
+ with gr.Row():
+ enabled = gr.Checkbox(label='Enable', value=False)
+ scribble_mode = gr.Checkbox(label='Invert Input Color', value=False)
+ rgbbgr_mode = gr.Checkbox(label='RGB to BGR', value=False)
+ lowvram = gr.Checkbox(label='Low VRAM', value=False)
+ guess_mode = gr.Checkbox(label='Guess Mode', value=False)
+
+ ctrls += (enabled,)
+ # infotext_fields.append((enabled, "ControlNet Enabled"))
+
+ def send_dimensions(image):
+ def closesteight(num):
+ rem = num % 8
+ if rem <= 4:
+ return round(num - rem)
+ else:
+ return round(num + (8 - rem))
+ if(image):
+ interm = np.asarray(image.get('image'))
+ return closesteight(interm.shape[1]), closesteight(interm.shape[0])
+ else:
+ return gr.Slider.update(), gr.Slider.update()
+
+ def webcam_toggle():
+ global webcam_enabled
+ webcam_enabled = not webcam_enabled
+ return {"value": None, "source": "webcam" if webcam_enabled else "upload", "__type__": "update"}
+
+ def webcam_mirror_toggle():
+ global webcam_mirrored
+ webcam_mirrored = not webcam_mirrored
+ return {"mirror_webcam": webcam_mirrored, "__type__": "update"}
+
+ webcam_enable.click(fn=webcam_toggle, inputs=None, outputs=input_image)
+ webcam_mirror.click(fn=webcam_mirror_toggle, inputs=None, outputs=input_image)
+
+ def refresh_all_models(*inputs):
+ update_cn_models()
+
+ dd = inputs[0]
+ selected = dd if dd in cn_models else "None"
+ return gr.Dropdown.update(value=selected, choices=list(cn_models.keys()))
+
+ with gr.Row():
+ module = gr.Dropdown(list(self.preprocessor.keys()), label=f"Preprocessor", value="none")
+ model = gr.Dropdown(list(cn_models.keys()), label=f"Model", value="None")
+ refresh_models = ToolButton(value=refresh_symbol)
+ refresh_models.click(refresh_all_models, model, model)
+ # ctrls += (refresh_models, )
+ with gr.Row():
+ weight = gr.Slider(label=f"Weight", value=1.0, minimum=0.0, maximum=2.0, step=.05)
+ guidance_start = gr.Slider(label="Guidance Start (T)", value=0.0, minimum=0.0, maximum=1.0, interactive=True)
+ guidance_end = gr.Slider(label="Guidance End (T)", value=1.0, minimum=0.0, maximum=1.0, interactive=True)
+
+ ctrls += (module, model, weight,)
+ # model_dropdowns.append(model)
+ def build_sliders(module):
+ if module == "canny":
+ return [
+ gr.update(label="Annotator resolution", value=512, minimum=64, maximum=2048, step=1, interactive=True),
+ gr.update(label="Canny low threshold", minimum=1, maximum=255, value=100, step=1, interactive=True),
+ gr.update(label="Canny high threshold", minimum=1, maximum=255, value=200, step=1, interactive=True),
+ gr.update(visible=True)
+ ]
+ elif module == "mlsd": #Hough
+ return [
+ gr.update(label="Hough Resolution", minimum=64, maximum=2048, value=512, step=1, interactive=True),
+ gr.update(label="Hough value threshold (MLSD)", minimum=0.01, maximum=2.0, value=0.1, step=0.01, interactive=True),
+ gr.update(label="Hough distance threshold (MLSD)", minimum=0.01, maximum=20.0, value=0.1, step=0.01, interactive=True),
+ gr.update(visible=True)
+ ]
+ elif module in ["hed", "fake_scribble"]:
+ return [
+ gr.update(label="HED Resolution", minimum=64, maximum=2048, value=512, step=1, interactive=True),
+ gr.update(label="Threshold A", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(label="Threshold B", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(visible=True)
+ ]
+ elif module in ["openpose", "openpose_hand", "segmentation"]:
+ return [
+ gr.update(label="Annotator Resolution", minimum=64, maximum=2048, value=512, step=1, interactive=True),
+ gr.update(label="Threshold A", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(label="Threshold B", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(visible=True)
+ ]
+ elif module == "depth":
+ return [
+ gr.update(label="Midas Resolution", minimum=64, maximum=2048, value=384, step=1, interactive=True),
+ gr.update(label="Threshold A", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(label="Threshold B", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(visible=True)
+ ]
+ elif module in ["depth_leres", "depth_leres_boost"]:
+ return [
+ gr.update(label="LeReS Resolution", minimum=64, maximum=2048, value=512, step=1, interactive=True),
+ gr.update(label="Remove Near %", value=0, minimum=0, maximum=100, step=0.1, interactive=True),
+ gr.update(label="Remove Background %", value=0, minimum=0, maximum=100, step=0.1, interactive=True),
+ gr.update(visible=True)
+ ]
+ elif module == "normal_map":
+ return [
+ gr.update(label="Normal Resolution", minimum=64, maximum=2048, value=512, step=1, interactive=True),
+ gr.update(label="Normal background threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.01, interactive=True),
+ gr.update(label="Threshold B", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(visible=True)
+ ]
+ elif module == "binary":
+ return [
+ gr.update(label="Annotator resolution", value=512, minimum=64, maximum=2048, step=1, interactive=True),
+ gr.update(label="Binary threshold", minimum=0, maximum=255, value=0, step=1, interactive=True),
+ gr.update(label="Threshold B", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(visible=True)
+ ]
+ elif module == "none":
+ return [
+ gr.update(label="Normal Resolution", value=64, minimum=64, maximum=2048, interactive=False),
+ gr.update(label="Threshold A", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(label="Threshold B", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(visible=False)
+ ]
+ else:
+ return [
+ gr.update(label="Annotator resolution", value=512, minimum=64, maximum=2048, step=1, interactive=True),
+ gr.update(label="Threshold A", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(label="Threshold B", value=64, minimum=64, maximum=1024, interactive=False),
+ gr.update(visible=True)
+ ]
+
+ # advanced options
+ advanced = gr.Column(visible=False)
+ with advanced:
+ processor_res = gr.Slider(label="Annotator resolution", value=64, minimum=64, maximum=2048, interactive=False)
+ threshold_a = gr.Slider(label="Threshold A", value=64, minimum=64, maximum=1024, interactive=False)
+ threshold_b = gr.Slider(label="Threshold B", value=64, minimum=64, maximum=1024, interactive=False)
+
+ if gradio_compat:
+ module.change(build_sliders, inputs=[module], outputs=[processor_res, threshold_a, threshold_b, advanced])
+
+ # infotext_fields.extend((module, model, weight))
+
+ def create_canvas(h, w):
+ return np.zeros(shape=(h, w, 3), dtype=np.uint8) + 255
+
+ def svgPreprocess(inputs):
+ if (inputs):
+ if (inputs['image'].startswith("data:image/svg+xml;base64,") and svgsupport):
+ svg_data = base64.b64decode(inputs['image'].replace('data:image/svg+xml;base64,',''))
+ drawing = svg2rlg(io.BytesIO(svg_data))
+ png_data = renderPM.drawToString(drawing, fmt='PNG')
+ encoded_string = base64.b64encode(png_data)
+ base64_str = str(encoded_string, "utf-8")
+ base64_str = "data:image/png;base64,"+ base64_str
+ inputs['image'] = base64_str
+ return input_image.orgpreprocess(inputs)
+ return None
+
+ resize_mode = gr.Radio(choices=[e.value for e in ResizeMode], value=ResizeMode.INNER_FIT.value, label="Resize Mode")
+ with gr.Row():
+ with gr.Column():
+ canvas_width = gr.Slider(label="Canvas Width", minimum=256, maximum=1024, value=512, step=64)
+ canvas_height = gr.Slider(label="Canvas Height", minimum=256, maximum=1024, value=512, step=64)
+
+ if gradio_compat:
+ canvas_swap_res = ToolButton(value=switch_values_symbol)
+ canvas_swap_res.click(lambda w, h: (h, w), inputs=[canvas_width, canvas_height], outputs=[canvas_width, canvas_height])
+
+ create_button = gr.Button(value="Create blank canvas")
+ create_button.click(fn=create_canvas, inputs=[canvas_height, canvas_width], outputs=[input_image])
+
+ def run_annotator(image, module, pres, pthr_a, pthr_b):
+ img = HWC3(image['image'])
+ if not ((image['mask'][:, :, 0]==0).all() or (image['mask'][:, :, 0]==255).all()):
+ img = HWC3(image['mask'][:, :, 0])
+ preprocessor = self.preprocessor[module]
+ result = None
+ if pres > 64:
+ result, is_image = preprocessor(img, res=pres, thr_a=pthr_a, thr_b=pthr_b)
+ else:
+ result, is_image = preprocessor(img)
+
+ if is_image:
+ return gr.update(value=result, visible=True, interactive=False)
+
+ with gr.Row():
+ annotator_button = gr.Button(value="Preview annotator result")
+ annotator_button_hide = gr.Button(value="Hide annotator result")
+
+ annotator_button.click(fn=run_annotator, inputs=[input_image, module, processor_res, threshold_a, threshold_b], outputs=[generated_image])
+ annotator_button_hide.click(fn=lambda: gr.update(visible=False), inputs=None, outputs=[generated_image])
+
+ if is_img2img:
+ send_dimen_button.click(fn=send_dimensions, inputs=[input_image], outputs=[self.img2img_w_slider, self.img2img_h_slider])
+ else:
+ send_dimen_button.click(fn=send_dimensions, inputs=[input_image], outputs=[self.txt2img_w_slider, self.txt2img_h_slider])
+
+ ctrls += (input_image, scribble_mode, resize_mode, rgbbgr_mode)
+ ctrls += (lowvram,)
+ ctrls += (processor_res, threshold_a, threshold_b, guidance_start, guidance_end, guess_mode)
+
+ input_image.orgpreprocess=input_image.preprocess
+ input_image.preprocess=svgPreprocess
+
+ return ctrls
+
+
+ def ui(self, is_img2img):
+ """this function should create gradio UI elements. See https://gradio.app/docs/#components
+ The return value should be an array of all components that are used in processing.
+ Values of those returned components will be passed to run() and process() functions.
+ """
+ self.infotext_fields = []
+ ctrls_group = (
+ gr.State(is_img2img),
+ gr.State(True), # is_ui
+ )
+ max_models = shared.opts.data.get("control_net_max_models_num", 1)
+ with gr.Group():
+ with gr.Accordion("ControlNet", open = False, elem_id="controlnet"):
+ if max_models > 1:
+ with gr.Tabs():
+ for i in range(max_models):
+ with gr.Tab(f"Control Model - {i}"):
+ ctrls = self.uigroup(is_img2img)
+ self.register_modules(f"ControlNet-{i}", ctrls)
+ ctrls_group += ctrls
+ else:
+ with gr.Column():
+ ctrls = self.uigroup(is_img2img)
+ self.register_modules(f"ControlNet", ctrls)
+ ctrls_group += ctrls
+
+ return ctrls_group
+
+ def register_modules(self, tabname, params):
+ enabled, module, model, weight = params[:4]
+ guidance_start, guidance_end, guess_mode = params[-3:]
+
+ self.infotext_fields.extend([
+ (enabled, f"{tabname} Enabled"),
+ (module, f"{tabname} Preprocessor"),
+ (model, f"{tabname} Model"),
+ (weight, f"{tabname} Weight"),
+ (guidance_start, f"{tabname} Guidance Start"),
+ (guidance_end, f"{tabname} Guidance End"),
+ ])
+
+ def clear_control_model_cache(self):
+ Script.model_cache.clear()
+ gc.collect()
+ devices.torch_gc()
+
+ def load_control_model(self, p, unet, model, lowvram):
+ if model in Script.model_cache:
+ print(f"Loading model from cache: {model}")
+ return Script.model_cache[model]
+
+ # Remove model from cache to clear space before building another model
+ if len(Script.model_cache) > 0 and len(Script.model_cache) >= shared.opts.data.get("control_net_model_cache_size", 2):
+ Script.model_cache.popitem(last=False)
+ gc.collect()
+ devices.torch_gc()
+
+ model_net = self.build_control_model(p, unet, model, lowvram)
+
+ if shared.opts.data.get("control_net_model_cache_size", 2) > 0:
+ Script.model_cache[model] = model_net
+
+ return model_net
+
+ def build_control_model(self, p, unet, model, lowvram):
+
+ model_path = cn_models.get(model, None)
+ if model_path is None:
+ raise RuntimeError(f"model not found: {model}")
+
+ # trim '"' at start/end
+ if model_path.startswith("\"") and model_path.endswith("\""):
+ model_path = model_path[1:-1]
+
+ if not os.path.exists(model_path):
+ raise ValueError(f"file not found: {model_path}")
+
+ print(f"Loading model: {model}")
+ state_dict = load_state_dict(model_path)
+ network_module = PlugableControlModel
+ network_config = shared.opts.data.get("control_net_model_config", default_conf)
+ if not os.path.isabs(network_config):
+ network_config = os.path.join(script_dir, network_config)
+
+ if any([k.startswith("body.") or k == 'style_embedding' for k, v in state_dict.items()]):
+ # adapter model
+ network_module = PlugableAdapter
+ network_config = shared.opts.data.get("control_net_model_adapter_config", default_conf_adapter)
+ if not os.path.isabs(network_config):
+ network_config = os.path.join(script_dir, network_config)
+
+ override_config = os.path.splitext(model_path)[0] + ".yaml"
+ if os.path.exists(override_config):
+ network_config = override_config
+
+ network = network_module(
+ state_dict=state_dict,
+ config_path=network_config,
+ lowvram=lowvram,
+ base_model=unet,
+ )
+ network.to(p.sd_model.device, dtype=p.sd_model.dtype)
+ print(f"ControlNet model {model} loaded.")
+ return network
+
+ @staticmethod
+ def get_remote_call(p, attribute, default=None, idx=0, strict=False, force=False):
+ if not force and not shared.opts.data.get("control_net_allow_script_control", False):
+ return default
+
+ def get_element(obj, idx, strict=False):
+ if not isinstance(obj, list):
+ return obj if not strict or idx == 0 else None
+ elif idx < len(obj):
+ return obj[idx]
+ else:
+ return None
+
+ attribute_value = get_element(getattr(p, attribute, None), idx, strict)
+ default_value = get_element(default, idx)
+ return attribute_value if attribute_value is not None else default_value
+
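+ # Rough sketch of the remote-control hook above: with the "control_net_allow_script_control"
+ # option enabled, another script can drive ControlNet by setting attributes on the processing
+ # object before this script runs, e.g.
+ #
+ #     p.control_net_enabled = True
+ #     p.control_net_module = "canny"
+ #     p.control_net_model = model_name   # one of the keys in cn_models ("<name> [<hash>]")
+ #     p.control_net_input_image = hint_image  # e.g. an HxWx3 uint8 array
+ #
+ # Each attribute may also be a list, indexed per ControlNet unit (the `idx` argument above).
+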
+ def parse_remote_call(self, p, params, idx):
+ if params is None:
+ params = [None] * PARAM_COUNT
+
+ enabled, module, model, weight, image, scribble_mode, \
+ resize_mode, rgbbgr_mode, lowvram, pres, pthr_a, pthr_b, guidance_start, guidance_end, guess_mode = params
+
+ selector = self.get_remote_call
+
+ enabled = selector(p, "control_net_enabled", enabled, idx, strict=True)
+ module = selector(p, "control_net_module", module, idx)
+ model = selector(p, "control_net_model", model, idx)
+ weight = selector(p, "control_net_weight", weight, idx)
+ image = selector(p, "control_net_image", image, idx)
+ scribble_mode = selector(p, "control_net_scribble_mode", scribble_mode, idx)
+ resize_mode = selector(p, "control_net_resize_mode", resize_mode, idx)
+ rgbbgr_mode = selector(p, "control_net_rgbbgr_mode", rgbbgr_mode, idx)
+ lowvram = selector(p, "control_net_lowvram", lowvram, idx)
+ pres = selector(p, "control_net_pres", pres, idx)
+ pthr_a = selector(p, "control_net_pthr_a", pthr_a, idx)
+ pthr_b = selector(p, "control_net_pthr_b", pthr_b, idx)
+ guidance_strength = selector(p, "control_net_guidance_strength", 1.0, idx)
+ guidance_start = selector(p, "control_net_guidance_start", guidance_start, idx)
+ guidance_end = selector(p, "control_net_guidance_end", guidance_end, idx)
+ guess_mode = selector(p, "control_net_guess_mode", guess_mode, idx)
+ if guidance_strength < 1.0:
+ # for backward compatibility
+ guidance_end = guidance_strength
+
+ input_image = selector(p, "control_net_input_image", None, idx)
+
+ return (enabled, module, model, weight, image, scribble_mode, \
+ resize_mode, rgbbgr_mode, lowvram, pres, pthr_a, pthr_b, guidance_start, guidance_end, guess_mode), input_image
+
+ def detectmap_proc(self, detected_map, module, rgbbgr_mode, resize_mode, h, w):
+ detected_map = HWC3(detected_map)
+ if module == "normal_map" or rgbbgr_mode:
+ control = torch.from_numpy(detected_map[:, :, ::-1].copy()).float().to(devices.get_device_for("controlnet")) / 255.0
+ else:
+ control = torch.from_numpy(detected_map.copy()).float().to(devices.get_device_for("controlnet")) / 255.0
+
+ control = rearrange(control, 'h w c -> c h w')
+ detected_map = rearrange(torch.from_numpy(detected_map), 'h w c -> c h w')
+
+ if resize_mode == ResizeMode.INNER_FIT:
+ transform = Compose([
+ Resize(h if h < w else w, interpolation=InterpolationMode.BICUBIC),
+ CenterCrop(size=(h, w))
+ ])
+ control = transform(control)
+ detected_map = transform(detected_map)
+ else:
+ control = Resize((h,w), interpolation=InterpolationMode.BICUBIC)(control)
+ detected_map = Resize((h,w), interpolation=InterpolationMode.BICUBIC)(detected_map)
+
+ # for log use
+ detected_map = rearrange(detected_map, 'c h w -> h w c').numpy().astype(np.uint8)
+ return control, detected_map
+
+ def process(self, p, is_img2img=False, is_ui=False, *args):
+ """
+ This function is called before processing begins for AlwaysVisible scripts.
+ You can modify the processing object (p) here, inject hooks, etc.
+ args contains all values returned by components from ui()
+ """
+ unet = p.sd_model.model.diffusion_model
+ if self.latest_network is not None:
+ # always restore (~0.05s)
+ self.latest_network.restore(unet)
+
+ control_groups = []
+ params_group = [args[i:i + PARAM_COUNT] for i in range(0, len(args), PARAM_COUNT)]
+ if len(params_group) == 0:
+ # fill a null group
+ params, _ = self.parse_remote_call(p, None, 0)
+ if params[0]: # enabled
+ params_group.append(params)
+
+ for idx, params in enumerate(params_group):
+ params, _ = self.parse_remote_call(p, params, idx)
+ enabled, module, model, weight, image, scribble_mode, \
+ resize_mode, rgbbgr_mode, lowvram, pres, pthr_a, pthr_b, guidance_start, guidance_end, guess_mode = params
+
+ if not enabled:
+ continue
+ control_groups.append((module, model, params))
+ if len(params_group) != 1:
+ prefix = f"ControlNet-{idx}"
+ else:
+ prefix = "ControlNet"
+ p.extra_generation_params.update({
+ f"{prefix} Enabled": True,
+ f"{prefix} Module": module,
+ f"{prefix} Model": model,
+ f"{prefix} Weight": weight,
+ f"{prefix} Guidance Start": guidance_start,
+ f"{prefix} Guidance End": guidance_end,
+ })
+
+ if len(params_group) == 0:
+ self.latest_network = None
+ return
+
+ detected_maps = []
+ forward_params = []
+ hook_lowvram = False
+
+ # cache stuff
+ if self.latest_model_hash != p.sd_model.sd_model_hash:
+ self.clear_control_model_cache()
+
+ # unload unused preproc
+ module_list = [mod[0] for mod in control_groups]
+ for key in self.unloadable:
+ if key not in module_list:
+ self.unloadable.get(key, lambda: None)()
+
+ self.latest_model_hash = p.sd_model.sd_model_hash
+ for idx, contents in enumerate(control_groups):
+ module, model, params = contents
+ _, input_image = self.parse_remote_call(p, params, idx)
+ enabled, module, model, weight, image, scribble_mode, \
+ resize_mode, rgbbgr_mode, lowvram, pres, pthr_a, pthr_b, guidance_start, guidance_end, guess_mode = params
+
+ resize_mode = resize_mode_from_value(resize_mode)
+
+ if lowvram:
+ hook_lowvram = True
+
+ model_net = self.load_control_model(p, unet, model, lowvram)
+ model_net.reset()
+
+ is_img2img_batch_tab = is_img2img and img2img_tab_tracker.submit_img2img_tab == 'img2img_batch_tab'
+ if is_img2img_batch_tab and hasattr(p, "image_control") and p.image_control is not None:
+ input_image = HWC3(np.asarray(p.image_control))
+ elif input_image is not None:
+ input_image = HWC3(np.asarray(input_image))
+ elif image is not None:
+ input_image = HWC3(image['image'])
+ if not ((image['mask'][:, :, 0]==0).all() or (image['mask'][:, :, 0]==255).all()):
+ print("using mask as input")
+ input_image = HWC3(image['mask'][:, :, 0])
+ scribble_mode = True
+ else:
+ # use img2img init_image as default
+ input_image = getattr(p, "init_images", [None])[0]
+ if input_image is None:
+ raise ValueError('controlnet is enabled but no input image is given')
+ input_image = HWC3(np.asarray(input_image))
+
+ if issubclass(type(p), StableDiffusionProcessingImg2Img) and p.inpaint_full_res == True and p.image_mask is not None:
+ input_image = Image.fromarray(input_image)
+ mask = p.image_mask.convert('L')
+ crop_region = masking.get_crop_region(np.array(mask), p.inpaint_full_res_padding)
+ crop_region = masking.expand_crop_region(crop_region, p.width, p.height, mask.width, mask.height)
+
+ input_image = input_image.crop(crop_region)
+ input_image = images.resize_image(2, input_image, p.width, p.height)
+ input_image = HWC3(np.asarray(input_image))
+
+ if scribble_mode:
+ detected_map = np.zeros_like(input_image, dtype=np.uint8)
+ detected_map[np.min(input_image, axis=2) < 127] = 255
+ input_image = detected_map
+
+ print(f"Loading preprocessor: {module}")
+ preprocessor = self.preprocessor[module]
+ h, w, bsz = p.height, p.width, p.batch_size
+ if pres > 64:
+ detected_map, is_image = preprocessor(input_image, res=pres, thr_a=pthr_a, thr_b=pthr_b)
+ else:
+ detected_map, is_image = preprocessor(input_image)
+
+ if is_image:
+ control, detected_map = self.detectmap_proc(detected_map, module, rgbbgr_mode, resize_mode, h, w)
+ detected_maps.append((detected_map, module))
+ else:
+ control = detected_map
+
+ forward_param = ControlParams(
+ control_model=model_net,
+ hint_cond=control,
+ guess_mode=guess_mode,
+ weight=weight,
+ guidance_stopped=False,
+ start_guidance_percent=guidance_start,
+ stop_guidance_percent=guidance_end,
+ advanced_weighting=None,
+ is_adapter=isinstance(model_net, PlugableAdapter),
+ is_extra_cond=getattr(model_net, "target", "") == "scripts.adapter.StyleAdapter"
+ )
+ forward_params.append(forward_param)
+
+ del model_net
+
+ self.latest_network = UnetHook(lowvram=hook_lowvram)
+ self.latest_network.hook(unet)
+ self.latest_network.notify(forward_params, p.sampler_name in ["DDIM", "PLMS"])
+ self.detected_map = detected_maps
+
+ if len(control_groups) > 0 and shared.opts.data.get("control_net_skip_img2img_processing") and hasattr(p, "init_images"):
+ swap_img2img_pipeline(p)
+
+ def postprocess(self, p, processed, is_img2img=False, is_ui=False, *args):
+ if shared.opts.data.get("control_net_detectmap_autosaving", False) and self.latest_network is not None:
+ for detect_map, module in self.detected_map:
+ detectmap_dir = os.path.join(shared.opts.data.get("control_net_detectedmap_dir", False), module)
+ if not os.path.isabs(detectmap_dir):
+ detectmap_dir = os.path.join(p.outpath_samples, detectmap_dir)
+ if module != "none":
+ os.makedirs(detectmap_dir, exist_ok=True)
+ img = Image.fromarray(detect_map)
+ save_image(img, detectmap_dir, module)
+
+ is_img2img_batch_tab = is_ui and is_img2img and img2img_tab_tracker.submit_img2img_tab == 'img2img_batch_tab'
+ no_detectmap_opt = shared.opts.data.get("control_net_no_detectmap", False)
+ if self.latest_network is None or no_detectmap_opt or is_img2img_batch_tab:
+ return
+
+ if hasattr(self, "detected_map") and self.detected_map is not None:
+ for detect_map, module in self.detected_map:
+ if module in ["canny", "mlsd", "scribble", "fake_scribble", "pidinet", "binary"]:
+ detect_map = 255-detect_map
+ processed.images.extend([Image.fromarray(detect_map)])
+
+ self.input_image = None
+ self.latest_network.restore(p.sd_model.model.diffusion_model)
+ self.latest_network = None
+
+ gc.collect()
+ devices.torch_gc()
+
+def update_script_args(p, value, arg_idx):
+ for s in scripts.scripts_txt2img.alwayson_scripts:
+ if isinstance(s, Script):
+ args = list(p.script_args)
+ # print(f"Changed arg {arg_idx} from {args[s.args_from + arg_idx - 1]} to {value}")
+ args[s.args_from + arg_idx] = value
+ p.script_args = tuple(args)
+ break
+
+
+def on_ui_settings():
+ section = ('control_net', "ControlNet")
+ shared.opts.add_option("control_net_model_config", shared.OptionInfo(
+ default_conf, "Config file for Control Net models", section=section))
+ shared.opts.add_option("control_net_model_adapter_config", shared.OptionInfo(
+ default_conf_adapter, "Config file for Adapter models", section=section))
+ shared.opts.add_option("control_net_detectedmap_dir", shared.OptionInfo(
+ default_detectedmap_dir, "Directory for detected maps auto saving", section=section))
+ shared.opts.add_option("control_net_models_path", shared.OptionInfo(
+ "", "Extra path to scan for ControlNet models (e.g. training output directory)", section=section))
+ shared.opts.add_option("control_net_max_models_num", shared.OptionInfo(
+ 1, "Multi ControlNet: Max models amount (requires restart)", gr.Slider, {"minimum": 1, "maximum": 10, "step": 1}, section=section))
+ shared.opts.add_option("control_net_model_cache_size", shared.OptionInfo(
+ 1, "Model cache size (requires restart)", gr.Slider, {"minimum": 1, "maximum": 5, "step": 1}, section=section))
+ shared.opts.add_option("control_net_control_transfer", shared.OptionInfo(
+ False, "Apply transfer control when loading models", gr.Checkbox, {"interactive": True}, section=section))
+ shared.opts.add_option("control_net_no_detectmap", shared.OptionInfo(
+ False, "Do not append detectmap to output", gr.Checkbox, {"interactive": True}, section=section))
+ shared.opts.add_option("control_net_detectmap_autosaving", shared.OptionInfo(
+ False, "Allow detectmap auto saving", gr.Checkbox, {"interactive": True}, section=section))
+ shared.opts.add_option("control_net_only_midctrl_hires", shared.OptionInfo(
+ True, "Use mid-control on highres pass (second pass)", gr.Checkbox, {"interactive": True}, section=section))
+ shared.opts.add_option("control_net_allow_script_control", shared.OptionInfo(
+ False, "Allow other script to control this extension", gr.Checkbox, {"interactive": True}, section=section))
+ shared.opts.add_option("control_net_skip_img2img_processing", shared.OptionInfo(
+ False, "Skip img2img processing when using img2img initial image", gr.Checkbox, {"interactive": True}, section=section))
+ shared.opts.add_option("control_net_monocular_depth_optim", shared.OptionInfo(
+ False, "Enable optimized monocular depth estimation", gr.Checkbox, {"interactive": True}, section=section))
+ shared.opts.add_option("control_net_only_mid_control", shared.OptionInfo(
+ False, "Only use mid-control when inference", gr.Checkbox, {"interactive": True}, section=section))
+ shared.opts.add_option("control_net_cfg_based_guidance", shared.OptionInfo(
+ False, "Enable CFG-Based guidance", gr.Checkbox, {"interactive": True}, section=section))
+ # shared.opts.add_option("control_net_advanced_weighting", shared.OptionInfo(
+ # False, "Enable advanced weight tuning", gr.Checkbox, {"interactive": False}, section=section))
+
+
+class Img2ImgTabTracker:
+ def __init__(self):
+ self.img2img_tabs = set()
+ self.active_img2img_tab = 'img2img_img2img_tab'
+ self.submit_img2img_tab = None
+
+ def save_submit_img2img_tab(self):
+ self.submit_img2img_tab = self.active_img2img_tab
+
+ def set_active_img2img_tab(self, tab):
+ self.active_img2img_tab = tab.elem_id
+
+ def on_after_component_callback(self, component, **_kwargs):
+ if type(component) is gr.State:
+ return
+
+ if type(component) is gr.Button and component.elem_id == 'img2img_generate':
+ component.click(fn=self.save_submit_img2img_tab, inputs=[], outputs=[])
+ return
+
+ tab = getattr(component, 'parent', None)
+ is_tab = type(tab) is gr.Tab and getattr(tab, 'elem_id', None) is not None
+ is_img2img_tab = is_tab and getattr(tab, 'parent', None) is not None and getattr(tab.parent, 'elem_id', None) == 'mode_img2img'
+ if is_img2img_tab and tab.elem_id not in self.img2img_tabs:
+ tab.select(fn=self.set_active_img2img_tab, inputs=gr.State(tab), outputs=[])
+ self.img2img_tabs.add(tab.elem_id)
+ return
+
+
+img2img_tab_tracker = Img2ImgTabTracker()
+script_callbacks.on_ui_settings(on_ui_settings)
+script_callbacks.on_after_component(img2img_tab_tracker.on_after_component_callback)
diff --git a/extensions/sd-webui-controlnet/scripts/external_code.py b/extensions/sd-webui-controlnet/scripts/external_code.py
new file mode 100644
index 0000000000000000000000000000000000000000..b433da50c46ecfe2001590aa8182e3cd47374ebc
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/external_code.py
@@ -0,0 +1,232 @@
+from typing import List, Any, Optional, Union, Tuple, Dict
+from modules import scripts, processing, shared
+from scripts.controlnet import ResizeMode, update_cn_models, cn_models_names, PARAM_COUNT
+import numpy as np
+
+
+"""
+Resize modes for ControlNet input images.
+"""
+ResizeMode = ResizeMode
+
+
+class ControlNetUnit:
+ """
+ Represents an entire ControlNet processing unit.
+ """
+
+ def __init__(
+ self,
+ enabled: bool=True,
+ module: Optional[str]=None,
+ model: Optional[str]=None,
+ weight: float=1.0,
+ image: Optional[Union[Dict[str, np.ndarray], Tuple[np.ndarray, np.ndarray], np.ndarray]]=None,
+ invert_image: bool=False,
+ resize_mode: Union[ResizeMode, int, str]=ResizeMode.INNER_FIT,
+ rgbbgr_mode: bool=False,
+ low_vram: bool=False,
+ processor_res: int=64,
+ threshold_a: float=64,
+ threshold_b: float=64,
+ guidance_start: float=0.0,
+ guidance_end: float=1.0,
+ guess_mode: bool=True,
+ ):
+ if image is not None:
+ if isinstance(image, tuple):
+ image = {'image': image[0], 'mask': image[1]}
+ elif isinstance(image, np.ndarray):
+ image = {'image': image, 'mask': np.zeros_like(image, dtype=np.uint8)}
+
+ while len(image['mask'].shape) < 3:
+ image['mask'] = image['mask'][..., np.newaxis]
+
+ self.enabled = enabled
+ self.module = module
+ self.model = model
+ self.weight = weight
+ self.image = image
+ self.invert_image = invert_image
+ self.resize_mode = resize_mode
+ self.rgbbgr_mode = rgbbgr_mode
+ self.low_vram = low_vram
+ self.processor_res = processor_res
+ self.threshold_a = threshold_a
+ self.threshold_b = threshold_b
+ self.guidance_start = guidance_start
+ self.guidance_end = guidance_end
+ self.guess_mode = guess_mode
+
+
+def get_all_units_in_processing(p: processing.StableDiffusionProcessing) -> List[ControlNetUnit]:
+ """
+ Fetch ControlNet processing units from a StableDiffusionProcessing.
+ """
+
+ return get_all_units(p.scripts, p.script_args)
+
+
+def get_all_units(script_runner: scripts.ScriptRunner, script_args: List[Any]) -> List[ControlNetUnit]:
+ """
+ Fetch ControlNet processing units from an existing script runner.
+ Use this function to fetch units from the list of all scripts arguments.
+ """
+
+ cn_script = find_cn_script(script_runner)
+ if cn_script:
+ return get_all_units_from(script_args[cn_script.args_from:cn_script.args_to])
+
+ return []
+
+
+def get_all_units_from(script_args: List[Any], strip_positional_args=True) -> List[ControlNetUnit]:
+ """
+ Fetch ControlNet processing units from ControlNet script arguments.
+ Use `external_code.get_all_units` to fetch units from the list of all scripts arguments.
+
+ Keyword arguments:
+ strip_positional_args -- Whether `script_args` still begins with the script's positional arguments (is_img2img, is_ui), which are stripped before parsing units. (default True)
+ """
+
+ if strip_positional_args:
+ script_args = script_args[2:]
+
+ res = []
+ for i in range(len(script_args) // PARAM_COUNT):
+ res.append(get_single_unit_from(script_args, i))
+
+ return res
+
+
+def get_single_unit_from(script_args: List[Any], index: int=0) -> ControlNetUnit:
+ """
+ Fetch a single ControlNet processing unit from ControlNet script arguments.
+ The list must not contain script positional arguments. It must only consist of flattened processing unit parameters.
+ """
+
+ index_from = index * PARAM_COUNT
+ index_to = index_from + PARAM_COUNT
+ return ControlNetUnit(*script_args[index_from:index_to])
+
+
+def update_cn_script_in_processing(
+ p: processing.StableDiffusionProcessing,
+ cn_units: List[ControlNetUnit],
+ is_img2img: Optional[bool] = None,
+ is_ui: Optional[bool] = None
+):
+ """
+ Update the arguments of the ControlNet script in `p.script_args` in place, reading from `cn_units`.
+ `cn_units` and its elements are not modified. You can call this function repeatedly, as many times as you want.
+
+ Does not update `p.script_args` if any of the following is true:
+ - ControlNet is not present in `p.scripts`
+ - `p.script_args` is not filled with script arguments for scripts that are processed before ControlNet
+
+ Keyword arguments:
+ is_img2img -- whether to run the script as img2img. In general, this should be set to the appropriate value depending on the `StableDiffusionProcessing` subclass used for generating. If set to None, do not change existing value. (default None)
+ is_ui -- whether to run the script as if from the gradio interface. If set to None, do not change existing value. (default None)
+ """
+
+ cn_units_type = type(cn_units) if type(cn_units) in (list, tuple) else list
+ script_args = list(p.script_args)
+ update_cn_script_in_place(p.scripts, script_args, cn_units, is_img2img, is_ui)
+ p.script_args = cn_units_type(script_args)
+
+
+def update_cn_script_in_place(
+ script_runner: scripts.ScriptRunner,
+ script_args: List[Any],
+ cn_units: List[ControlNetUnit],
+ is_img2img: Optional[bool] = None,
+ is_ui: Optional[bool] = None,
+):
+ """
+ Update the arguments of the ControlNet script in `script_args` in place, reading from `cn_units`.
+ `cn_units` and its elements are not modified. You can call this function repeatedly, as many times as you want.
+
+ Does not update `script_args` if any of the following is true:
+ - ControlNet is not present in `script_runner`
+ - `script_args` is not filled with script arguments for scripts that are processed before ControlNet
+
+ Keyword arguments:
+ is_img2img -- whether to run the script as img2img. In general, this should be set to the appropriate value depending on the `StableDiffusionProcessing` subclass used for generating. If set to None, do not change existing value. (default None)
+ is_ui -- whether to run the script as if from the gradio interface. If set to None, do not change existing value. (default None)
+ """
+
+ cn_script = find_cn_script(script_runner)
+ if cn_script is None or len(script_args) < cn_script.args_from:
+ return
+
+ cn_script_has_args = len(script_args[cn_script.args_from:cn_script.args_to]) > 0
+ if is_img2img is None:
+ is_img2img = script_args[cn_script.args_from] if cn_script_has_args else False
+ if is_ui is None:
+ is_ui = script_args[cn_script.args_from + 1] if cn_script_has_args else False
+
+ # pad with disabled units up to the configured maximum number of models, in case the script expects that many arguments.
+ max_models = shared.opts.data.get("control_net_max_models_num", 1)
+ cn_units = cn_units + [ControlNetUnit(enabled=False)] * max(max_models - len(cn_units), 0)
+
+ flattened_cn_args: List[Any] = [is_img2img, is_ui]
+ for unit in cn_units:
+ flattened_cn_args.extend((
+ unit.enabled,
+ unit.module if unit.module is not None else "none",
+ unit.model if unit.model is not None else "None",
+ unit.weight,
+ unit.image,
+ unit.invert_image,
+ unit.resize_mode,
+ unit.rgbbgr_mode,
+ unit.low_vram,
+ unit.processor_res,
+ unit.threshold_a,
+ unit.threshold_b,
+ unit.guidance_start,
+ unit.guidance_end,
+ unit.guess_mode))
+
+ cn_script_args_diff = 0
+ for script in script_runner.alwayson_scripts:
+ if script is cn_script:
+ cn_script_args_diff = len(flattened_cn_args) - (cn_script.args_to - cn_script.args_from)
+ script_args[script.args_from:script.args_to] = flattened_cn_args
+ script.args_to = script.args_from + len(flattened_cn_args)
+ else:
+ script.args_from += cn_script_args_diff
+ script.args_to += cn_script_args_diff
+
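+# Note on the bookkeeping above: replacing the ControlNet slice may change its
+# length, so cn_script_args_diff (new length minus old length) is added to the
+# args_from/args_to of every alwayson script that comes after ControlNet. For
+# example, if ControlNet previously owned indices 5..20 (15 values) and the new
+# flattened list holds 30 values, later scripts shift forward by 15 positions.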
+
+def get_models(update: bool=False) -> List[str]:
+ """
+ Fetch the list of available models.
+ Each value is a valid candidate of `ControlNetUnit.model`.
+
+ Keyword arguments:
+ update -- Whether to refresh the list from disk. (default False)
+ """
+
+ if update:
+ update_cn_models()
+
+ return list(cn_models_names.values())
+
+
+def find_cn_script(script_runner: scripts.ScriptRunner) -> Optional[scripts.Script]:
+ """
+ Find the ControlNet script in `script_runner`. Returns `None` if `script_runner` does not contain a ControlNet script.
+ """
+
+ for script in script_runner.alwayson_scripts:
+ if is_cn_script(script):
+ return script
+
+
+def is_cn_script(script: scripts.Script) -> bool:
+ """
+ Determine whether `script` is a ControlNet script.
+ """
+
+ return script.title().lower() == 'controlnet'
diff --git a/extensions/sd-webui-controlnet/scripts/hook.py b/extensions/sd-webui-controlnet/scripts/hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..725adb0f7fcbfd296897476ce8b5aa1c78523f27
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/hook.py
@@ -0,0 +1,251 @@
+
+import torch
+import torch.nn as nn
+from modules import devices, lowvram, shared, scripts
+
+cond_cast_unet = getattr(devices, 'cond_cast_unet', lambda x: x)
+
+from ldm.modules.diffusionmodules.util import timestep_embedding
+from ldm.modules.diffusionmodules.openaimodel import UNetModel
+
+
+class TorchHijackForUnet:
+ """
+ This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match;
+ this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64
+ """
+
+ def __getattr__(self, item):
+ if item == 'cat':
+ return self.cat
+
+ if hasattr(torch, item):
+ return getattr(torch, item)
+
+ raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item))
+
+ def cat(self, tensors, *args, **kwargs):
+ if len(tensors) == 2:
+ a, b = tensors
+ if a.shape[-2:] != b.shape[-2:]:
+ a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest")
+
+ tensors = (a, b)
+
+ return torch.cat(tensors, *args, **kwargs)
+
+
+th = TorchHijackForUnet()
+
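+# Illustrative example of the resizing cat: `a` is interpolated to `b`'s
+# spatial size before concatenation, where plain torch.cat would raise.
+#
+#   a = torch.randn(1, 4, 32, 33)
+#   b = torch.randn(1, 4, 32, 32)
+#   th.cat((a, b), dim=1).shape   # torch.Size([1, 8, 32, 32])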
+
+class ControlParams:
+ def __init__(
+ self,
+ control_model,
+ hint_cond,
+ guess_mode,
+ weight,
+ guidance_stopped,
+ start_guidance_percent,
+ stop_guidance_percent,
+ advanced_weighting,
+ is_adapter,
+ is_extra_cond
+ ):
+ self.control_model = control_model
+ self.hint_cond = hint_cond
+ self.guess_mode = guess_mode
+ self.weight = weight
+ self.guidance_stopped = guidance_stopped
+ self.start_guidance_percent = start_guidance_percent
+ self.stop_guidance_percent = stop_guidance_percent
+ self.advanced_weighting = advanced_weighting
+ self.is_adapter = is_adapter
+ self.is_extra_cond = is_extra_cond
+
+
+class UnetHook(nn.Module):
+ def __init__(self, lowvram=False) -> None:
+ super().__init__()
+ self.lowvram = lowvram
+ self.batch_cond_available = True
+ self.only_mid_control = shared.opts.data.get("control_net_only_mid_control", False)
+
+ def hook(self, model):
+ outer = self
+
+ def guidance_schedule_handler(x):
+ for param in self.control_params:
+ current_sampling_percent = (x.sampling_step / x.total_sampling_steps)
+ param.guidance_stopped = current_sampling_percent < param.start_guidance_percent or current_sampling_percent > param.stop_guidance_percent
+
+ def cfg_based_adder(base, x, require_autocast, is_adapter=False):
+ if isinstance(x, float):
+ return base + x
+
+ if require_autocast:
+ zeros = torch.zeros_like(base)
+ zeros[:, :x.shape[1], ...] = x
+ x = zeros
+
+ # assume the input format is [cond, uncond] and that both halves have the same shape
+ # see https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/0cc0ee1bcb4c24a8c9715f66cede06601bfc00c8/modules/sd_samplers_kdiffusion.py#L114
+ if base.shape[0] % 2 == 0 and (self.guess_mode or shared.opts.data.get("control_net_cfg_based_guidance", False)):
+ if self.is_vanilla_samplers:
+ uncond, cond = base.chunk(2)
+ if x.shape[0] % 2 == 0:
+ _, x_cond = x.chunk(2)
+ return torch.cat([uncond, cond + x_cond], dim=0)
+ if is_adapter:
+ return torch.cat([uncond, cond + x], dim=0)
+ else:
+ cond, uncond = base.chunk(2)
+ if x.shape[0] % 2 == 0:
+ x_cond, _ = x.chunk(2)
+ return torch.cat([cond + x_cond, uncond], dim=0)
+ if is_adapter:
+ return torch.cat([cond + x, uncond], dim=0)
+
+ return base + x
+
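+ # Note: cfg_based_adder assumes the batch layout is [cond, uncond] for
+ # k-diffusion samplers and [uncond, cond] for the vanilla samplers, and in
+ # guess/CFG-based-guidance mode it adds the control residual only to the
+ # cond half, e.g. torch.cat([cond + x_cond, uncond], dim=0).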
+ def forward(self, x, timesteps=None, context=None, **kwargs):
+ total_control = [0.0] * 13
+ total_adapter = [0.0] * 4
+ total_extra_cond = torch.zeros([0, context.shape[-1]]).to(devices.get_device_for("controlnet"))
+ only_mid_control = outer.only_mid_control
+ require_inpaint_hijack = False
+
+ # handle external cond first
+ for param in outer.control_params:
+ if param.guidance_stopped or not param.is_extra_cond:
+ continue
+ if outer.lowvram:
+ param.control_model.to(devices.get_device_for("controlnet"))
+ control = param.control_model(x=x, hint=param.hint_cond, timesteps=timesteps, context=context)
+ total_extra_cond = torch.cat([total_extra_cond, control.clone().squeeze(0) * param.weight])
+
+ # check if it's non-batch-cond mode (lowvram, edit model etc)
+ if context.shape[0] % 2 != 0 and outer.batch_cond_available:
+ outer.batch_cond_available = False
+ if len(total_extra_cond) > 0 or outer.guess_mode or shared.opts.data.get("control_net_cfg_based_guidance", False):
+ print("Warning: StyleAdapter and cfg/guess mode may not works due to non-batch-cond inference")
+
+ # concat styleadapter to cond, pad uncond to same length
+ if len(total_extra_cond) > 0 and outer.batch_cond_available:
+ total_extra_cond = torch.repeat_interleave(total_extra_cond.unsqueeze(0), context.shape[0] // 2, dim=0)
+ if outer.is_vanilla_samplers:
+ uncond, cond = context.chunk(2)
+ cond = torch.cat([cond, total_extra_cond], dim=1)
+ uncond = torch.cat([uncond, uncond[:, -total_extra_cond.shape[1]:, :]], dim=1)
+ context = torch.cat([uncond, cond], dim=0)
+ else:
+ cond, uncond = context.chunk(2)
+ cond = torch.cat([cond, total_extra_cond], dim=1)
+ uncond = torch.cat([uncond, uncond[:, -total_extra_cond.shape[1]:, :]], dim=1)
+ context = torch.cat([cond, uncond], dim=0)
+
+ # handle unet injection stuff
+ for param in outer.control_params:
+ if param.guidance_stopped or param.is_extra_cond:
+ continue
+ if outer.lowvram:
+ param.control_model.to(devices.get_device_for("controlnet"))
+
+ # hires stuff
+ # note that this method may not work if hr_scale < 1.1
+ if abs(x.shape[-1] - param.hint_cond.shape[-1] // 8) > 8:
+ only_mid_control = shared.opts.data.get("control_net_only_midctrl_hires", True)
+ # If you want to completely disable control net, uncomment this.
+ # return self._original_forward(x, timesteps=timesteps, context=context, **kwargs)
+
+ # inpaint model workaround
+ x_in = x
+ control_model = param.control_model.control_model
+ if not param.is_adapter and x.shape[1] != control_model.input_blocks[0][0].in_channels and x.shape[1] == 9:
+ # inpaint_model: 4 data + 4 downscaled image + 1 mask
+ x_in = x[:, :4, ...]
+ require_inpaint_hijack = True
+
+ assert param.hint_cond is not None, "ControlNet is enabled but no input image is given"
+ control = param.control_model(x=x_in, hint=param.hint_cond, timesteps=timesteps, context=context)
+ control_scales = ([param.weight] * 13)
+
+ if outer.lowvram:
+ param.control_model.to("cpu")
+ if param.guess_mode:
+ if param.is_adapter:
+ # see https://github.com/Mikubill/sd-webui-controlnet/issues/269
+ control_scales = [param.weight * s for s in (0.25, 0.62, 0.825, 1.0)]
+ else:
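+ # with 13 control outputs this decays the weight geometrically with depth:
+ # scale[i] = weight * 0.825 ** (12 - i), i.e. roughly 0.10x at the shallowest
+ # level (i = 0) rising to 1.0x at the deepest (i = 12)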
+ control_scales = [param.weight * (0.825 ** float(12 - i)) for i in range(13)]
+ if param.advanced_weighting is not None:
+ control_scales = param.advanced_weighting
+
+ control = [c * scale for c, scale in zip(control, control_scales)]
+ for idx, item in enumerate(control):
+ target = total_adapter if param.is_adapter else total_control
+ target[idx] += item
+
+ control = total_control
+ assert timesteps is not None, ValueError(f"insufficient timestep: {timesteps}")
+ hs = []
+ with th.no_grad():
+ t_emb = cond_cast_unet(timestep_embedding(timesteps, self.model_channels, repeat_only=False))
+ emb = self.time_embed(t_emb)
+ h = x.type(self.dtype)
+ for i, module in enumerate(self.input_blocks):
+ h = module(h, emb, context)
+
+ # t2i-adapter, same as openaimodel.py:744
+ if ((i+1)%3 == 0) and len(total_adapter):
+ h = cfg_based_adder(h, total_adapter.pop(0), require_inpaint_hijack, is_adapter=True)
+
+ hs.append(h)
+ h = self.middle_block(h, emb, context)
+
+ control_in = control.pop()
+ h = cfg_based_adder(h, control_in, require_inpaint_hijack)
+
+ for i, module in enumerate(self.output_blocks):
+ if only_mid_control:
+ hs_input = hs.pop()
+ h = th.cat([h, hs_input], dim=1)
+ else:
+ hs_input, control_input = hs.pop(), control.pop()
+ h = th.cat([h, cfg_based_adder(hs_input, control_input, require_inpaint_hijack)], dim=1)
+ h = module(h, emb, context)
+
+ h = h.type(x.dtype)
+ return self.out(h)
+
+ def forward2(*args, **kwargs):
+ # webui will handle other components
+ try:
+ if shared.cmd_opts.lowvram:
+ lowvram.send_everything_to_cpu()
+
+ return forward(*args, **kwargs)
+ finally:
+ if self.lowvram:
+ [param.control_model.to("cpu") for param in self.control_params]
+
+ model._original_forward = model.forward
+ model.forward = forward2.__get__(model, UNetModel)
+ scripts.script_callbacks.on_cfg_denoiser(guidance_schedule_handler)
+
+ def notify(self, params, is_vanilla_samplers): # params: list[ControlParams]
+ self.is_vanilla_samplers = is_vanilla_samplers
+ self.control_params = params
+ self.guess_mode = any([param.guess_mode for param in params])
+
+ def restore(self, model):
+ scripts.script_callbacks.remove_current_script_callbacks()
+ if hasattr(self, "control_params"):
+ del self.control_params
+
+ if not hasattr(model, "_original_forward"):
+ # no such handle, ignore
+ return
+
+ model.forward = model._original_forward
+ del model._original_forward
diff --git a/extensions/sd-webui-controlnet/scripts/movie2movie.py b/extensions/sd-webui-controlnet/scripts/movie2movie.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d132a1d4a3a2daf96684dd4d4c04474194d9505
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/movie2movie.py
@@ -0,0 +1,117 @@
+import copy
+import os
+import shutil
+
+import cv2
+import gradio as gr
+import modules.scripts as scripts
+
+from modules import images
+from modules.processing import process_images
+from modules.shared import opts
+from PIL import Image
+
+
+def get_all_frames(video_path):
+ if video_path is None:
+ return None
+ cap = cv2.VideoCapture(video_path)
+ frame_list = []
+ if not cap.isOpened():
+ return
+ while True:
+ ret, frame = cap.read()
+ if ret:
+ frame_list.append(frame)
+ else:
+ cap.release()
+ return frame_list
+
+def get_min_frame_num(video_list):
+ min_frame_num = -1
+ for video in video_list:
+ if video is None:
+ continue
+ else:
+ frame_num = len(video)
+ print(frame_num)
+ if min_frame_num < 0:
+ min_frame_num = frame_num
+ elif frame_num < min_frame_num:
+ min_frame_num = frame_num
+ return min_frame_num
+
+def save_gif(path, image_list, name, duration):
+ tmp_dir = path + "/tmp/"
+ if os.path.isdir(tmp_dir):
+ shutil.rmtree(tmp_dir)
+ os.mkdir(tmp_dir)
+ for i, image in enumerate(image_list):
+ images.save_image(image, tmp_dir, f"output_{i}")
+
+ os.makedirs(path + "/controlnet-m2m", exist_ok=True)
+
+ image_list[0].save(path + f"/controlnet-m2m/{name}.gif", save_all=True, append_images=image_list[1:], optimize=False, duration=duration, loop=0)
+
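+# Illustrative example ("input.mp4" and "outputs" are hypothetical paths):
+# turn video frames into PIL images and write them as a 20 fps GIF (50 ms/frame).
+#
+#   frames = [Image.fromarray(cv2.cvtColor(f, cv2.COLOR_BGR2RGB))
+#             for f in get_all_frames("input.mp4")]
+#   save_gif("outputs", frames, "animation", duration=50)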
+
+class Script(scripts.Script):
+
+ def title(self):
+ return "controlnet m2m"
+
+ def show(self, is_img2img):
+ return True
+
+ def ui(self, is_img2img):
+ # How the script is displayed in the UI. See https://gradio.app/docs/#components
+ # for the different UI components you can use and how to create them.
+ # Most UI components can return a value, such as a boolean for a checkbox.
+ # The returned values are passed to the run method as parameters.
+
+ ctrls_group = ()
+ max_models = opts.data.get("control_net_max_models_num", 1)
+
+ with gr.Group():
+ with gr.Accordion("ControlNet-M2M", open = False):
+ with gr.Tabs():
+ for i in range(max_models):
+ with gr.Tab(f"ControlNet-{i}", open=False):
+ ctrls_group += (gr.Video(format='mp4', source='upload', elem_id = f"video_{i}"), )
+
+ duration = gr.Slider(label="Duration", value=50.0, minimum=10.0, maximum=200.0, step=10, interactive=True)
+ ctrls_group += (duration,)
+
+ return ctrls_group
+
+ def run(self, p, *args):
+ # This is where the additional processing is implemented. The parameters include
+ # self, the model object "p" (a StableDiffusionProcessing class, see
+ # processing.py), and the parameters returned by the ui method.
+ # Custom functions can be defined here, and additional libraries can be imported
+ # to be used in processing. The return value should be a Processed object, which is
+ # what is returned by the process_images method.
+ video_num = opts.data.get("control_net_max_models_num", 1)
+ video_list = [get_all_frames(video) for video in args[:video_num]]
+ duration, = args[video_num:]
+
+ frame_num = get_min_frame_num(video_list)
+ if frame_num > 0:
+ output_image_list = []
+ for frame in range(frame_num):
+ copy_p = copy.copy(p)
+ copy_p.control_net_input_image = []
+ for video in video_list:
+ if video is None:
+ continue
+ copy_p.control_net_input_image.append(video[frame])
+ proc = process_images(copy_p)
+ img = proc.images[0]
+ output_image_list.append(img)
+ copy_p.close()
+ # TODO: Generate new name for each movie2movie output
+ save_gif(p.outpath_samples, output_image_list, "animation", duration)
+ proc.images = [p.outpath_samples + "/controlnet-m2m/animation.gif"]
+
+ else:
+ proc = process_images(p)
+
+ return proc
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/scripts/processor.py b/extensions/sd-webui-controlnet/scripts/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..52d35da34caa9739714f07cd58a887f2623e7335
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/processor.py
@@ -0,0 +1,228 @@
+
+import numpy as np
+from annotator.util import resize_image, HWC3
+
+
+model_canny = None
+
+
+def canny(img, res=512, thr_a=100, thr_b=200, **kwargs):
+ l, h = thr_a, thr_b
+ img = resize_image(HWC3(img), res)
+ global model_canny
+ if model_canny is None:
+ from annotator.canny import apply_canny
+ model_canny = apply_canny
+ result = model_canny(img, l, h)
+ return result, True
+
+def simple_scribble(img, res=512, **kwargs):
+ img = resize_image(HWC3(img), res)
+ result = np.zeros_like(img, dtype=np.uint8)
+ result[np.min(img, axis=2) < 127] = 255
+ return result, True
+
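+# Each preprocessor lazily imports and caches its annotator on first call and
+# returns a (result, flag) tuple; the flag is False only for clip, whose output
+# is an embedding rather than an image. Illustrative example ("pose.png" is a
+# hypothetical file):
+#
+#   import cv2
+#   img = cv2.cvtColor(cv2.imread("pose.png"), cv2.COLOR_BGR2RGB)
+#   edges, is_image = canny(img, res=512, thr_a=100, thr_b=200)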
+
+model_hed = None
+
+
+def hed(img, res=512, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global model_hed
+ if model_hed is None:
+ from annotator.hed import apply_hed
+ model_hed = apply_hed
+ result = model_hed(img)
+ return result, True
+
+def unload_hed():
+ global model_hed
+ if model_hed is not None:
+ from annotator.hed import unload_hed_model
+ unload_hed_model()
+
+def fake_scribble(img, res=512, **kwargs):
+ result, _ = hed(img, res)
+ import cv2
+ from annotator.hed import nms
+ result = nms(result, 127, 3.0)
+ result = cv2.GaussianBlur(result, (0, 0), 3.0)
+ result[result > 10] = 255
+ result[result < 255] = 0
+ return result, True
+
+
+model_mlsd = None
+
+
+def mlsd(img, res=512, thr_a=0.1, thr_b=0.1, **kwargs):
+ thr_v, thr_d = thr_a, thr_b
+ img = resize_image(HWC3(img), res)
+ global model_mlsd
+ if model_mlsd is None:
+ from annotator.mlsd import apply_mlsd
+ model_mlsd = apply_mlsd
+ result = model_mlsd(img, thr_v, thr_d)
+ return result, True
+
+def unload_mlsd():
+ global model_mlsd
+ if model_mlsd is not None:
+ from annotator.mlsd import unload_mlsd_model
+ unload_mlsd_model()
+
+
+model_midas = None
+
+
+def midas(img, res=512, a=np.pi * 2.0, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global model_midas
+ if model_midas is None:
+ from annotator.midas import apply_midas
+ model_midas = apply_midas
+ results, _ = model_midas(img, a)
+ return results, True
+
+def midas_normal(img, res=512, a=np.pi * 2.0, thr_a=0.4, **kwargs): # bg_th -> thr_a
+ bg_th = thr_a
+ img = resize_image(HWC3(img), res)
+ global model_midas
+ if model_midas is None:
+ from annotator.midas import apply_midas
+ model_midas = apply_midas
+ _, results = model_midas(img, a, bg_th)
+ return results, True
+
+def unload_midas():
+ global model_midas
+ if model_midas is not None:
+ from annotator.midas import unload_midas_model
+ unload_midas_model()
+
+model_leres = None
+
+def leres(img, res=512, a=np.pi * 2.0, thr_a=0, thr_b=0, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global model_leres
+ if model_leres is None:
+ from annotator.leres import apply_leres
+ model_leres = apply_leres
+ results = model_leres(img, thr_a, thr_b)
+ return results, True
+
+def unload_leres():
+ global model_leres
+ if model_leres is not None:
+ from annotator.leres import unload_leres_model
+ unload_leres_model()
+
+model_openpose = None
+
+
+def openpose(img, res=512, has_hand=False, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global model_openpose
+ if model_openpose is None:
+ from annotator.openpose import apply_openpose
+ model_openpose = apply_openpose
+ result, _ = model_openpose(img, has_hand)
+ return result, True
+
+def openpose_hand(img, res=512, has_hand=True, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global model_openpose
+ if model_openpose is None:
+ from annotator.openpose import apply_openpose
+ model_openpose = apply_openpose
+ result, _ = model_openpose(img, has_hand)
+ return result, True
+
+def unload_openpose():
+ global model_openpose
+ if model_openpose is not None:
+ from annotator.openpose import unload_openpose_model
+ unload_openpose_model()
+
+
+model_uniformer = None
+
+
+def uniformer(img, res=512, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global model_uniformer
+ if model_uniformer is None:
+ from annotator.uniformer import apply_uniformer
+ model_uniformer = apply_uniformer
+ result = model_uniformer(img)
+ return result, True
+
+def unload_uniformer():
+ global model_uniformer
+ if model_uniformer is not None:
+ from annotator.uniformer import unload_uniformer_model
+ unload_uniformer_model()
+
+
+model_pidinet = None
+
+
+def pidinet(img, res=512, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global model_pidinet
+ if model_pidinet is None:
+ from annotator.pidinet import apply_pidinet
+ model_pidinet = apply_pidinet
+ result = model_pidinet(img)
+ return result, True
+
+def unload_pidinet():
+ global model_pidinet
+ if model_pidinet is not None:
+ from annotator.pidinet import unload_pid_model
+ unload_pid_model()
+
+
+clip_encoder = None
+
+
+def clip(img, res=512, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global clip_encoder
+ if clip_encoder is None:
+ from annotator.clip import apply_clip
+ clip_encoder = apply_clip
+ result = clip_encoder(img).squeeze(0)
+ return result, False
+
+
+def unload_clip():
+ global clip_encoder
+ if clip_encoder is not None:
+ from annotator.clip import unload_clip_model
+ unload_clip_model()
+
+
+model_color = None
+
+
+def color(img, res=512, **kwargs):
+ global model_color
+ if model_color is None:
+ from annotator.color import apply_color
+ model_color = apply_color
+ result = model_color(img, res=res)
+ return result, True
+
+
+model_binary = None
+
+
+def binary(img, res=512, thr_a=0, **kwargs):
+ img = resize_image(HWC3(img), res)
+ global model_binary
+ if model_binary is None:
+ from annotator.binary import apply_binary
+ model_binary = apply_binary
+ result = model_binary(img, thr_a)
+ return result, True
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/scripts/utils.py b/extensions/sd-webui-controlnet/scripts/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..915e638caece30dece3428ddb2c871f8c7199563
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/utils.py
@@ -0,0 +1,19 @@
+import torch
+import os
+
+
+def load_state_dict(ckpt_path, location='cpu'):
+ _, extension = os.path.splitext(ckpt_path)
+ if extension.lower() == ".safetensors":
+ import safetensors.torch
+ state_dict = safetensors.torch.load_file(ckpt_path, device=location)
+ else:
+ state_dict = get_state_dict(torch.load(
+ ckpt_path, map_location=torch.device(location)))
+ state_dict = get_state_dict(state_dict)
+ print(f'Loaded state_dict from [{ckpt_path}]')
+ return state_dict
+
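+# Illustrative example (the checkpoint path is hypothetical):
+#
+#   sd = load_state_dict("models/ControlNet/control_sd15_canny.pth", location="cpu")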
+
+def get_state_dict(d):
+ return d.get('state_dict', d)
\ No newline at end of file
diff --git a/extensions/sd-webui-controlnet/scripts/xyz_grid_support.py b/extensions/sd-webui-controlnet/scripts/xyz_grid_support.py
new file mode 100644
index 0000000000000000000000000000000000000000..68693b6b4dc5c6417816e3345409420010a70c8a
--- /dev/null
+++ b/extensions/sd-webui-controlnet/scripts/xyz_grid_support.py
@@ -0,0 +1,467 @@
+import re
+import numpy as np
+
+from modules import scripts, shared
+
+try:
+ from scripts import controlnet
+except ImportError:
+ import_error = True
+else:
+ import_error = False
+
+DEBUG_MODE = False
+
+
+def debug_info(func):
+ def debug_info_(*args, **kwargs):
+ if DEBUG_MODE:
+ print(f"Debug info: {func.__name__}, {args}")
+ return func(*args, **kwargs)
+ return debug_info_
+
+
+def find_dict(dict_list, keyword, search_key="name", stop=False):
+ result = next((d for d in dict_list if d[search_key] == keyword), None)
+ if result or not stop:
+ return result
+ else:
+ raise ValueError(f"Dictionary with value '{keyword}' in key '{search_key}' not found.")
+
+
+def flatten_list(lst):
+ result = []
+ for element in lst:
+ if isinstance(element, list):
+ result.extend(flatten_list(element))
+ else:
+ result.append(element)
+ return result
+
+
+def is_all_included(target_list, check_list, allow_blank=False, stop=False):
+ for element in flatten_list(target_list):
+ if allow_blank and str(element) in ["None", ""]:
+ continue
+ elif element not in check_list:
+ if not stop:
+ return False
+ else:
+ raise ValueError(f"'{element}' is not included in check list.")
+ return True
+
+
+class ListParser():
+ """This class restores a broken list caused by the following process
+ in the xyz_grid module.
+ -> valslist = [x.strip() for x in chain.from_iterable(
+ csv.reader(StringIO(vals)))]
+ It also performs type conversion,
+ adjusts the number of elements in the list, and other operations.
+
+ This class directly modifies the received list.
+ """
+ numeric_pattern = {
+ int: {
+ "range": r"\s*([+-]?\s*\d+)\s*-\s*([+-]?\s*\d+)(?:\s*\(([+-]\d+)\s*\))?\s*",
+ "count": r"\s*([+-]?\s*\d+)\s*-\s*([+-]?\s*\d+)(?:\s*\[(\d+)\s*\])?\s*"
+ },
+ float: {
+ "range": r"\s*([+-]?\s*\d+(?:\.\d*)?)\s*-\s*([+-]?\s*\d+(?:\.\d*)?)(?:\s*\(([+-]\d+(?:\.\d*)?)\s*\))?\s*",
+ "count": r"\s*([+-]?\s*\d+(?:\.\d*)?)\s*-\s*([+-]?\s*\d+(?:\.\d*)?)(?:\s*\[(\d+(?:\.\d*)?)\s*\])?\s*"
+ }
+ }
+
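+ # These presumably mirror xyz_grid's own range syntax, e.g.:
+ #   "1-10 (+2)"   -> values from 1 to 10 in steps of 2 (the "range" form)
+ #   "0.0-1.0 [5]" -> 5 evenly spaced values from 0.0 to 1.0 (the "count" form)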
+ ################################################
+ #
+ # Initialization method from here.
+ #
+ ################################################
+
+ def __init__(self, my_list, converter=None, allow_blank=True, exclude_list=None, run=True):
+ self.my_list = my_list
+ self.converter = converter
+ self.allow_blank = allow_blank
+ self.exclude_list = exclude_list
+ self.re_bracket_start = None
+ self.re_bracket_start_precheck = None
+ self.re_bracket_end = None
+ self.re_bracket_end_precheck = None
+ self.re_range = None
+ self.re_count = None
+ self.compile_regex()
+ if run:
+ self.auto_normalize()
+
+ def compile_regex(self):
+ exclude_pattern = "|".join(self.exclude_list) if self.exclude_list else None
+ if exclude_pattern is None:
+ self.re_bracket_start = re.compile(r"^\[")
+ self.re_bracket_end = re.compile(r"\]$")
+ else:
+ self.re_bracket_start = re.compile(fr"^\[(?!(?:{exclude_pattern})\])")
+ self.re_bracket_end = re.compile(fr"(?<!\[(?:{exclude_pattern}))\]$")
+
+ # In the xyz_grid module, type conversion is normally done as
+ # -> valslist = [opt.type(x) for x in valslist]
+ # Perform type conversion using the function
+ # set to the confirm attribute instead.
+ #
+ def identity(x):
+ return x
+
+ ################################################
+ # The confirm function defined in this module
+ # enables list notation and performs type conversion.
+ #
+ # Example:
+ # any = [any, any, any, ...]
+ # [any] = [any, None, None, ...]
+ # [None, None, any] = [None, None, any]
+ # [,,any] = [None, None, any]
+ # any, [,any,] = [any, any, any, ...], [None, any, None]
+ #
+ # Enabled Only:
+ # any = [any] = [any, None, None, ...]
+ # (any and [any] are considered equivalent)
+ #
+ def confirm(func_or_str):
+ @debug_info
+ def confirm_(p, xs):
+ if callable(func_or_str): # func_or_str is converter
+ ListParser(xs, func_or_str, allow_blank=True)
+ return
+
+ elif isinstance(func_or_str, str): # func_or_str is keyword
+ valid_data = find_dict(validation_data, func_or_str, stop=True)
+ converter = valid_data["type"]
+ exclude_list = valid_data["exclude"]() if valid_data["exclude"] else None
+ check_list = valid_data["check"]()
+
+ ListParser(xs, converter, allow_blank=True, exclude_list=exclude_list)
+ is_all_included(xs, check_list, allow_blank=True, stop=True)
+ return
+
+ else:
+ raise TypeError(f"Argument must be callable or str, not {type(func_or_str).__name__}.")
+
+ return confirm_
+
+ def bool_(string):
+ string = str(string)
+ if string in ["None", ""]:
+ return None
+ elif string.lower() in ["true", "1"]:
+ return True
+ elif string.lower() in ["false", "0"]:
+ return False
+ else:
+ raise ValueError(f"Could not convert string to boolean: {string}")
+
+ def choices_bool():
+ return ["False", "True"]
+
+ def choices_model():
+ controlnet.update_cn_models()
+ return list(controlnet.cn_models_names.values())
+
+ def choices_resize_mode():
+ return ["Envelope (Outer Fit)", "Scale to Fit (Inner Fit)", "Just Resize"]
+
+ def choices_preprocessor():
+ return list(controlnet.Script().preprocessor)
+
+ def make_excluded_list():
+ pattern = re.compile(r"\[(\w+)\]")
+ return [match.group(1) for s in choices_model()
+ for match in pattern.finditer(s)]
+
+ validation_data = [
+ {"name": "model", "type": str, "check": choices_model, "exclude": make_excluded_list},
+ {"name": "resize_mode", "type": str, "check": choices_resize_mode, "exclude": None},
+ {"name": "preprocessor", "type": str, "check": choices_preprocessor, "exclude": None},
+ ]
+
+ extra_axis_options = [
+ AxisOption("[ControlNet] Enabled", identity, apply_field("control_net_enabled"), confirm=confirm(bool_), choices=choices_bool),
+ AxisOption("[ControlNet] Model", identity, apply_field("control_net_model"), confirm=confirm("model"), choices=choices_model, cost=0.9),
+ AxisOption("[ControlNet] Weight", identity, apply_field("control_net_weight"), confirm=confirm(float)),
+ AxisOption("[ControlNet] Guidance Start", identity, apply_field("control_net_guidance_start"), confirm=confirm(float)),
+ AxisOption("[ControlNet] Guidance End", identity, apply_field("control_net_guidance_end"), confirm=confirm(float)),
+ AxisOption("[ControlNet] Resize Mode", identity, apply_field("control_net_resize_mode"), confirm=confirm("resize_mode"), choices=choices_resize_mode),
+ AxisOption("[ControlNet] Preprocessor", identity, apply_field("control_net_module"), confirm=confirm("preprocessor"), choices=choices_preprocessor),
+ AxisOption("[ControlNet] Pre Resolution", identity, apply_field("control_net_pres"), confirm=confirm(int)),
+ AxisOption("[ControlNet] Pre Threshold A", identity, apply_field("control_net_pthr_a"), confirm=confirm(float)),
+ AxisOption("[ControlNet] Pre Threshold B", identity, apply_field("control_net_pthr_b"), confirm=confirm(float)),
+ ]
+
+ xyz_grid.axis_options.extend(extra_axis_options)
+
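+# Illustrative example: in the X/Y/Z plot UI, "[ControlNet] Weight" accepts
+# values such as "0.4, 0.7, 1.0" and "[ControlNet] Enabled" accepts
+# "False, True"; each entry is validated and converted by the confirm() wrapper above.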
+
+def run():
+ xyz_grid = find_module("xyz_grid.py, xy_grid.py")
+ if xyz_grid:
+ add_axis_options(xyz_grid)
+
+
+if not import_error:
+ run()