toto10 committed
Commit e7908a1
1 Parent(s): f5d05f8

af5f9ea25cd84b6c327d58a09e9ee787fc974290ff2d7d5dfe22b54aad11d08d

Files changed (50)
  1. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp +75 -0
  2. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu +145 -0
  3. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/vision.cpp +117 -0
  4. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/deform_conv.py +514 -0
  5. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/losses.py +133 -0
  6. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/mask_ops.py +275 -0
  7. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/nms.py +144 -0
  8. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align.py +74 -0
  9. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align_rotated.py +100 -0
  10. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/rotated_boxes.py +21 -0
  11. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/shape_spec.py +18 -0
  12. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/wrappers.py +162 -0
  13. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/__init__.py +10 -0
  14. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/model_zoo.py +213 -0
  15. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/__init__.py +64 -0
  16. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/anchor_generator.py +386 -0
  17. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/__init__.py +20 -0
  18. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/backbone.py +74 -0
  19. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/build.py +33 -0
  20. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/fpn.py +268 -0
  21. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/mvit.py +448 -0
  22. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/regnet.py +452 -0
  23. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/resnet.py +694 -0
  24. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/swin.py +695 -0
  25. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/utils.py +186 -0
  26. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/vit.py +524 -0
  27. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/box_regression.py +369 -0
  28. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/matcher.py +127 -0
  29. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/__init__.py +16 -0
  30. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/build.py +24 -0
  31. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/dense_detector.py +294 -0
  32. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/fcos.py +328 -0
  33. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/panoptic_fpn.py +269 -0
  34. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/rcnn.py +341 -0
  35. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/retinanet.py +439 -0
  36. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/semantic_seg.py +267 -0
  37. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/mmdet_wrapper.py +273 -0
  38. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/poolers.py +263 -0
  39. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/postprocessing.py +100 -0
  40. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/__init__.py +5 -0
  41. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/build.py +24 -0
  42. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/proposal_utils.py +205 -0
  43. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rpn.py +533 -0
  44. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rrpn.py +209 -0
  45. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/__init__.py +29 -0
  46. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/box_head.py +118 -0
  47. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/cascade_rcnn.py +299 -0
  48. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/fast_rcnn.py +569 -0
  49. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/keypoint_head.py +272 -0
  50. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/mask_head.py +298 -0
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp ADDED
@@ -0,0 +1,75 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ #include "../box_iou_rotated/box_iou_rotated_utils.h"
+ #include "nms_rotated.h"
+
+ namespace detectron2 {
+
+ template <typename scalar_t>
+ at::Tensor nms_rotated_cpu_kernel(
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     const double iou_threshold) {
+   // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
+   // however, the code in this function is much shorter because
+   // we delegate the IoU computation for rotated boxes to
+   // the single_box_iou_rotated function in box_iou_rotated_utils.h
+   AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor");
+   AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor");
+   AT_ASSERTM(
+       dets.scalar_type() == scores.scalar_type(),
+       "dets should have the same type as scores");
+
+   if (dets.numel() == 0) {
+     return at::empty({0}, dets.options().dtype(at::kLong));
+   }
+
+   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+
+   auto ndets = dets.size(0);
+   at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
+   at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
+
+   auto suppressed = suppressed_t.data_ptr<uint8_t>();
+   auto keep = keep_t.data_ptr<int64_t>();
+   auto order = order_t.data_ptr<int64_t>();
+
+   int64_t num_to_keep = 0;
+
+   for (int64_t _i = 0; _i < ndets; _i++) {
+     auto i = order[_i];
+     if (suppressed[i] == 1) {
+       continue;
+     }
+
+     keep[num_to_keep++] = i;
+
+     for (int64_t _j = _i + 1; _j < ndets; _j++) {
+       auto j = order[_j];
+       if (suppressed[j] == 1) {
+         continue;
+       }
+
+       auto ovr = single_box_iou_rotated<scalar_t>(
+           dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>());
+       if (ovr >= iou_threshold) {
+         suppressed[j] = 1;
+       }
+     }
+   }
+   return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
+ }
+
+ at::Tensor nms_rotated_cpu(
+     // input must be contiguous
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     const double iou_threshold) {
+   auto result = at::empty({0}, dets.options());
+
+   AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
+     result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
+   });
+   return result;
+ }
+
+ } // namespace detectron2
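
The CPU kernel above is a straightforward greedy NMS over score-sorted boxes. For reference, a minimal pure-Python sketch of the same loop follows; `iou_rotated` is a hypothetical callable standing in for `single_box_iou_rotated`, which is not reimplemented here.

import torch

def nms_rotated_reference(dets, scores, iou_threshold, iou_rotated):
    # Greedy suppression in decreasing score order, mirroring nms_rotated_cpu_kernel.
    order = scores.sort(descending=True).indices.tolist()
    suppressed = [False] * dets.shape[0]
    keep = []
    for pos, i in enumerate(order):
        if suppressed[i]:
            continue
        keep.append(i)
        for j in order[pos + 1:]:
            if not suppressed[j] and iou_rotated(dets[i], dets[j]) >= iou_threshold:
                suppressed[j] = True
    return torch.tensor(keep, dtype=torch.int64)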
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu ADDED
@@ -0,0 +1,145 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <ATen/cuda/CUDAApplyUtils.cuh>
+ #ifdef WITH_CUDA
+ #include "../box_iou_rotated/box_iou_rotated_utils.h"
+ #endif
+ // TODO avoid this when pytorch supports "same directory" hipification
+ #ifdef WITH_HIP
+ #include "box_iou_rotated/box_iou_rotated_utils.h"
+ #endif
+
+ using namespace detectron2;
+
+ namespace {
+ int const threadsPerBlock = sizeof(unsigned long long) * 8;
+ }
+
+ template <typename T>
+ __global__ void nms_rotated_cuda_kernel(
+     const int n_boxes,
+     const double iou_threshold,
+     const T* dev_boxes,
+     unsigned long long* dev_mask) {
+   // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
+
+   const int row_start = blockIdx.y;
+   const int col_start = blockIdx.x;
+
+   // if (row_start > col_start) return;
+
+   const int row_size =
+       min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+   const int col_size =
+       min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+   // Compared to nms_cuda_kernel, where each box is represented with 4 values
+   // (x1, y1, x2, y2), each rotated box is represented with 5 values
+   // (x_center, y_center, width, height, angle_degrees) here.
+   __shared__ T block_boxes[threadsPerBlock * 5];
+   if (threadIdx.x < col_size) {
+     block_boxes[threadIdx.x * 5 + 0] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+     block_boxes[threadIdx.x * 5 + 1] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+     block_boxes[threadIdx.x * 5 + 2] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+     block_boxes[threadIdx.x * 5 + 3] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+     block_boxes[threadIdx.x * 5 + 4] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+   }
+   __syncthreads();
+
+   if (threadIdx.x < row_size) {
+     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+     const T* cur_box = dev_boxes + cur_box_idx * 5;
+     int i = 0;
+     unsigned long long t = 0;
+     int start = 0;
+     if (row_start == col_start) {
+       start = threadIdx.x + 1;
+     }
+     for (i = start; i < col_size; i++) {
+       // Instead of devIoU used by original horizontal nms, here
+       // we use the single_box_iou_rotated function from box_iou_rotated_utils.h
+       if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >
+           iou_threshold) {
+         t |= 1ULL << i;
+       }
+     }
+     const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
+     dev_mask[cur_box_idx * col_blocks + col_start] = t;
+   }
+ }
+
+ namespace detectron2 {
+
+ at::Tensor nms_rotated_cuda(
+     // input must be contiguous
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     double iou_threshold) {
+   // using scalar_t = float;
+   AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
+   AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
+   at::cuda::CUDAGuard device_guard(dets.device());
+
+   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+   auto dets_sorted = dets.index_select(0, order_t);
+
+   auto dets_num = dets.size(0);
+
+   const int col_blocks =
+       at::cuda::ATenCeilDiv(static_cast<int>(dets_num), threadsPerBlock);
+
+   at::Tensor mask =
+       at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+
+   dim3 blocks(col_blocks, col_blocks);
+   dim3 threads(threadsPerBlock);
+   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   AT_DISPATCH_FLOATING_TYPES(
+       dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
+         nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+             dets_num,
+             iou_threshold,
+             dets_sorted.data_ptr<scalar_t>(),
+             (unsigned long long*)mask.data_ptr<int64_t>());
+       });
+
+   at::Tensor mask_cpu = mask.to(at::kCPU);
+   unsigned long long* mask_host =
+       (unsigned long long*)mask_cpu.data_ptr<int64_t>();
+
+   std::vector<unsigned long long> remv(col_blocks);
+   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+   at::Tensor keep =
+       at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
+   int64_t* keep_out = keep.data_ptr<int64_t>();
+
+   int num_to_keep = 0;
+   for (int i = 0; i < dets_num; i++) {
+     int nblock = i / threadsPerBlock;
+     int inblock = i % threadsPerBlock;
+
+     if (!(remv[nblock] & (1ULL << inblock))) {
+       keep_out[num_to_keep++] = i;
+       unsigned long long* p = mask_host + i * col_blocks;
+       for (int j = nblock; j < col_blocks; j++) {
+         remv[j] |= p[j];
+       }
+     }
+   }
+
+   AT_CUDA_CHECK(cudaGetLastError());
+   return order_t.index(
+       {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
+            .to(order_t.device(), keep.scalar_type())});
+ }
+
+ } // namespace detectron2
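
The CUDA path splits the work into 64-box blocks: each thread writes one 64-bit word of a suppression matrix, and the final keep list is computed on the host by sweeping that matrix in score order. Below is a small Python sketch of the host-side reduction (the `remv`/`mask_host` loop above); the function name and the flat-list mask layout are illustrative only.

def reduce_suppression_mask(mask_host, dets_num, threads_per_block=64):
    # mask_host: flat list of 64-bit words, `col_blocks` words per box;
    # bit j of word b in row i is set when box i suppresses box (b * 64 + j).
    col_blocks = (dets_num + threads_per_block - 1) // threads_per_block
    remv = [0] * col_blocks  # suppression bits accumulated from already-kept boxes
    keep = []
    for i in range(dets_num):  # boxes are already sorted by decreasing score
        nblock, inblock = divmod(i, threads_per_block)
        if not (remv[nblock] >> inblock) & 1:
            keep.append(i)
            row = mask_host[i * col_blocks:(i + 1) * col_blocks]
            for j in range(nblock, col_blocks):
                remv[j] |= row[j]
    return keep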
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/vision.cpp ADDED
@@ -0,0 +1,117 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+
+ #include <torch/extension.h>
+ #include "ROIAlignRotated/ROIAlignRotated.h"
+ #include "box_iou_rotated/box_iou_rotated.h"
+ #include "cocoeval/cocoeval.h"
+ #include "deformable/deform_conv.h"
+ #include "nms_rotated/nms_rotated.h"
+
+ namespace detectron2 {
+
+ #if defined(WITH_CUDA) || defined(WITH_HIP)
+ extern int get_cudart_version();
+ #endif
+
+ std::string get_cuda_version() {
+ #if defined(WITH_CUDA) || defined(WITH_HIP)
+   std::ostringstream oss;
+
+ #if defined(WITH_CUDA)
+   oss << "CUDA ";
+ #else
+   oss << "HIP ";
+ #endif
+
+   // copied from
+   // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
+   auto printCudaStyleVersion = [&](int v) {
+     oss << (v / 1000) << "." << (v / 10 % 100);
+     if (v % 10 != 0) {
+       oss << "." << (v % 10);
+     }
+   };
+   printCudaStyleVersion(get_cudart_version());
+   return oss.str();
+ #else // neither CUDA nor HIP
+   return std::string("not available");
+ #endif
+ }
+
+ bool has_cuda() {
+ #if defined(WITH_CUDA)
+   return true;
+ #else
+   return false;
+ #endif
+ }
+
+ // similar to
+ // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
+ std::string get_compiler_version() {
+   std::ostringstream ss;
+ #if defined(__GNUC__)
+ #ifndef __clang__
+
+ #if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8))
+ #error "GCC >= 4.9 is required!"
+ #endif
+
+   { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
+ #endif
+ #endif
+
+ #if defined(__clang_major__)
+   {
+     ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
+        << __clang_patchlevel__;
+   }
+ #endif
+
+ #if defined(_MSC_VER)
+   { ss << "MSVC " << _MSC_FULL_VER; }
+ #endif
+   return ss.str();
+ }
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+   m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
+   m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");
+   m.def("has_cuda", &has_cuda, "has_cuda");
+
+   m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
+   m.def(
+       "deform_conv_backward_input",
+       &deform_conv_backward_input,
+       "deform_conv_backward_input");
+   m.def(
+       "deform_conv_backward_filter",
+       &deform_conv_backward_filter,
+       "deform_conv_backward_filter");
+   m.def(
+       "modulated_deform_conv_forward",
+       &modulated_deform_conv_forward,
+       "modulated_deform_conv_forward");
+   m.def(
+       "modulated_deform_conv_backward",
+       &modulated_deform_conv_backward,
+       "modulated_deform_conv_backward");
+
+   m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
+   m.def(
+       "COCOevalEvaluateImages",
+       &COCOeval::EvaluateImages,
+       "COCOeval::EvaluateImages");
+   pybind11::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
+       .def(pybind11::init<uint64_t, double, double, bool, bool>());
+   pybind11::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation")
+       .def(pybind11::init<>());
+ }
+
+ TORCH_LIBRARY(detectron2, m) {
+   m.def("nms_rotated", &nms_rotated);
+   m.def("box_iou_rotated", &box_iou_rotated);
+   m.def("roi_align_rotated_forward", &ROIAlignRotated_forward);
+   m.def("roi_align_rotated_backward", &ROIAlignRotated_backward);
+ }
+ } // namespace detectron2
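
vision.cpp exposes two surfaces: a pybind11 module carrying build metadata plus the deformable-conv bindings, and a TORCH_LIBRARY registration that makes the rotated-box ops reachable through torch.ops. Assuming the extension has been compiled and is importable as `annotator.oneformer.detectron2._C` (the import path used by deform_conv.py below), usage looks roughly like this:

import torch
from annotator.oneformer.detectron2 import _C  # importing the extension registers the ops

print(_C.has_cuda(), _C.get_cuda_version(), _C.get_compiler_version())

boxes = torch.tensor([[10.0, 10.0, 8.0, 6.0, 0.0],
                      [10.0, 10.0, 8.0, 6.0, 5.0]])  # (x_ctr, y_ctr, w, h, angle)
scores = torch.tensor([0.9, 0.8])
keep = torch.ops.detectron2.nms_rotated(boxes, scores, 0.5)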
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/deform_conv.py ADDED
@@ -0,0 +1,514 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ import math
+ from functools import lru_cache
+ import torch
+ from torch import nn
+ from torch.autograd import Function
+ from torch.autograd.function import once_differentiable
+ from torch.nn.modules.utils import _pair
+ from torchvision.ops import deform_conv2d
+
+ from annotator.oneformer.detectron2.utils.develop import create_dummy_class, create_dummy_func
+
+ from .wrappers import _NewEmptyTensorOp
+
+
+ class _DeformConv(Function):
+     @staticmethod
+     def forward(
+         ctx,
+         input,
+         offset,
+         weight,
+         stride=1,
+         padding=0,
+         dilation=1,
+         groups=1,
+         deformable_groups=1,
+         im2col_step=64,
+     ):
+         if input is not None and input.dim() != 4:
+             raise ValueError(
+                 "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim())
+             )
+         ctx.stride = _pair(stride)
+         ctx.padding = _pair(padding)
+         ctx.dilation = _pair(dilation)
+         ctx.groups = groups
+         ctx.deformable_groups = deformable_groups
+         ctx.im2col_step = im2col_step
+
+         ctx.save_for_backward(input, offset, weight)
+
+         output = input.new_empty(
+             _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)
+         )
+
+         ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones
+
+         if not input.is_cuda:
+             # TODO: let torchvision support full features of our deformconv.
+             if deformable_groups != 1:
+                 raise NotImplementedError(
+                     "Deformable Conv with deformable_groups != 1 is not supported on CPUs!"
+                 )
+             return deform_conv2d(
+                 input, offset, weight, stride=stride, padding=padding, dilation=dilation
+             )
+         else:
+             cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+             assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+             _C.deform_conv_forward(
+                 input,
+                 weight,
+                 offset,
+                 output,
+                 ctx.bufs_[0],
+                 ctx.bufs_[1],
+                 weight.size(3),
+                 weight.size(2),
+                 ctx.stride[1],
+                 ctx.stride[0],
+                 ctx.padding[1],
+                 ctx.padding[0],
+                 ctx.dilation[1],
+                 ctx.dilation[0],
+                 ctx.groups,
+                 ctx.deformable_groups,
+                 cur_im2col_step,
+             )
+         return output
+
+     @staticmethod
+     @once_differentiable
+     def backward(ctx, grad_output):
+         input, offset, weight = ctx.saved_tensors
+
+         grad_input = grad_offset = grad_weight = None
+
+         if not grad_output.is_cuda:
+             raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+         else:
+             cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+             assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+             if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+                 grad_input = torch.zeros_like(input)
+                 grad_offset = torch.zeros_like(offset)
+                 _C.deform_conv_backward_input(
+                     input,
+                     offset,
+                     grad_output,
+                     grad_input,
+                     grad_offset,
+                     weight,
+                     ctx.bufs_[0],
+                     weight.size(3),
+                     weight.size(2),
+                     ctx.stride[1],
+                     ctx.stride[0],
+                     ctx.padding[1],
+                     ctx.padding[0],
+                     ctx.dilation[1],
+                     ctx.dilation[0],
+                     ctx.groups,
+                     ctx.deformable_groups,
+                     cur_im2col_step,
+                 )
+
+             if ctx.needs_input_grad[2]:
+                 grad_weight = torch.zeros_like(weight)
+                 _C.deform_conv_backward_filter(
+                     input,
+                     offset,
+                     grad_output,
+                     grad_weight,
+                     ctx.bufs_[0],
+                     ctx.bufs_[1],
+                     weight.size(3),
+                     weight.size(2),
+                     ctx.stride[1],
+                     ctx.stride[0],
+                     ctx.padding[1],
+                     ctx.padding[0],
+                     ctx.dilation[1],
+                     ctx.dilation[0],
+                     ctx.groups,
+                     ctx.deformable_groups,
+                     1,
+                     cur_im2col_step,
+                 )
+
+         return grad_input, grad_offset, grad_weight, None, None, None, None, None, None
+
+     @staticmethod
+     def _output_size(input, weight, padding, dilation, stride):
+         channels = weight.size(0)
+         output_size = (input.size(0), channels)
+         for d in range(input.dim() - 2):
+             in_size = input.size(d + 2)
+             pad = padding[d]
+             kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
+             stride_ = stride[d]
+             output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
+         if not all(map(lambda s: s > 0, output_size)):
+             raise ValueError(
+                 "convolution input is too small (output would be {})".format(
+                     "x".join(map(str, output_size))
+                 )
+             )
+         return output_size
+
+     @staticmethod
+     @lru_cache(maxsize=128)
+     def _cal_im2col_step(input_size, default_size):
+         """
+         Calculate proper im2col step size, which should be divisible by input_size and not larger
+         than prefer_size. Meanwhile the step size should be as large as possible to be more
+         efficient. So we choose the largest one among all divisors of input_size which are smaller
+         than prefer_size.
+         :param input_size: input batch size .
+         :param default_size: default preferred im2col step size.
+         :return: the largest proper step size.
+         """
+         if input_size <= default_size:
+             return input_size
+         best_step = 1
+         for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)):
+             if input_size % step == 0:
+                 if input_size // step <= default_size:
+                     return input_size // step
+                 best_step = step
+
+         return best_step
+
+
+ class _ModulatedDeformConv(Function):
+     @staticmethod
+     def forward(
+         ctx,
+         input,
+         offset,
+         mask,
+         weight,
+         bias=None,
+         stride=1,
+         padding=0,
+         dilation=1,
+         groups=1,
+         deformable_groups=1,
+     ):
+         ctx.stride = stride
+         ctx.padding = padding
+         ctx.dilation = dilation
+         ctx.groups = groups
+         ctx.deformable_groups = deformable_groups
+         ctx.with_bias = bias is not None
+         if not ctx.with_bias:
+             bias = input.new_empty(1)  # fake tensor
+         if not input.is_cuda:
+             raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+         if (
+             weight.requires_grad
+             or mask.requires_grad
+             or offset.requires_grad
+             or input.requires_grad
+         ):
+             ctx.save_for_backward(input, offset, mask, weight, bias)
+         output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight))
+         ctx._bufs = [input.new_empty(0), input.new_empty(0)]
+         _C.modulated_deform_conv_forward(
+             input,
+             weight,
+             bias,
+             ctx._bufs[0],
+             offset,
+             mask,
+             output,
+             ctx._bufs[1],
+             weight.shape[2],
+             weight.shape[3],
+             ctx.stride,
+             ctx.stride,
+             ctx.padding,
+             ctx.padding,
+             ctx.dilation,
+             ctx.dilation,
+             ctx.groups,
+             ctx.deformable_groups,
+             ctx.with_bias,
+         )
+         return output
+
+     @staticmethod
+     @once_differentiable
+     def backward(ctx, grad_output):
+         if not grad_output.is_cuda:
+             raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+         input, offset, mask, weight, bias = ctx.saved_tensors
+         grad_input = torch.zeros_like(input)
+         grad_offset = torch.zeros_like(offset)
+         grad_mask = torch.zeros_like(mask)
+         grad_weight = torch.zeros_like(weight)
+         grad_bias = torch.zeros_like(bias)
+         _C.modulated_deform_conv_backward(
+             input,
+             weight,
+             bias,
+             ctx._bufs[0],
+             offset,
+             mask,
+             ctx._bufs[1],
+             grad_input,
+             grad_weight,
+             grad_bias,
+             grad_offset,
+             grad_mask,
+             grad_output,
+             weight.shape[2],
+             weight.shape[3],
+             ctx.stride,
+             ctx.stride,
+             ctx.padding,
+             ctx.padding,
+             ctx.dilation,
+             ctx.dilation,
+             ctx.groups,
+             ctx.deformable_groups,
+             ctx.with_bias,
+         )
+         if not ctx.with_bias:
+             grad_bias = None
+
+         return (
+             grad_input,
+             grad_offset,
+             grad_mask,
+             grad_weight,
+             grad_bias,
+             None,
+             None,
+             None,
+             None,
+             None,
+         )
+
+     @staticmethod
+     def _infer_shape(ctx, input, weight):
+         n = input.size(0)
+         channels_out = weight.size(0)
+         height, width = input.shape[2:4]
+         kernel_h, kernel_w = weight.shape[2:4]
+         height_out = (
+             height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)
+         ) // ctx.stride + 1
+         width_out = (
+             width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)
+         ) // ctx.stride + 1
+         return n, channels_out, height_out, width_out
+
+
+ deform_conv = _DeformConv.apply
+ modulated_deform_conv = _ModulatedDeformConv.apply
+
+
+ class DeformConv(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         out_channels,
+         kernel_size,
+         stride=1,
+         padding=0,
+         dilation=1,
+         groups=1,
+         deformable_groups=1,
+         bias=False,
+         norm=None,
+         activation=None,
+     ):
+         """
+         Deformable convolution from :paper:`deformconv`.
+
+         Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+         Args:
+             deformable_groups (int): number of groups used in deformable convolution.
+             norm (nn.Module, optional): a normalization layer
+             activation (callable(Tensor) -> Tensor): a callable activation function
+         """
+         super(DeformConv, self).__init__()
+
+         assert not bias
+         assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format(
+             in_channels, groups
+         )
+         assert (
+             out_channels % groups == 0
+         ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups)
+
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.kernel_size = _pair(kernel_size)
+         self.stride = _pair(stride)
+         self.padding = _pair(padding)
+         self.dilation = _pair(dilation)
+         self.groups = groups
+         self.deformable_groups = deformable_groups
+         self.norm = norm
+         self.activation = activation
+
+         self.weight = nn.Parameter(
+             torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)
+         )
+         self.bias = None
+
+         nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+
+     def forward(self, x, offset):
+         if x.numel() == 0:
+             # When input is empty, we want to return a empty tensor with "correct" shape,
+             # So that the following operations will not panic
+             # if they check for the shape of the tensor.
+             # This computes the height and width of the output tensor
+             output_shape = [
+                 (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                 for i, p, di, k, s in zip(
+                     x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                 )
+             ]
+             output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+             return _NewEmptyTensorOp.apply(x, output_shape)
+
+         x = deform_conv(
+             x,
+             offset,
+             self.weight,
+             self.stride,
+             self.padding,
+             self.dilation,
+             self.groups,
+             self.deformable_groups,
+         )
+         if self.norm is not None:
+             x = self.norm(x)
+         if self.activation is not None:
+             x = self.activation(x)
+         return x
+
+     def extra_repr(self):
+         tmpstr = "in_channels=" + str(self.in_channels)
+         tmpstr += ", out_channels=" + str(self.out_channels)
+         tmpstr += ", kernel_size=" + str(self.kernel_size)
+         tmpstr += ", stride=" + str(self.stride)
+         tmpstr += ", padding=" + str(self.padding)
+         tmpstr += ", dilation=" + str(self.dilation)
+         tmpstr += ", groups=" + str(self.groups)
+         tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+         tmpstr += ", bias=False"
+         return tmpstr
+
+
+ class ModulatedDeformConv(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         out_channels,
+         kernel_size,
+         stride=1,
+         padding=0,
+         dilation=1,
+         groups=1,
+         deformable_groups=1,
+         bias=True,
+         norm=None,
+         activation=None,
+     ):
+         """
+         Modulated deformable convolution from :paper:`deformconv2`.
+
+         Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+         Args:
+             deformable_groups (int): number of groups used in deformable convolution.
+             norm (nn.Module, optional): a normalization layer
+             activation (callable(Tensor) -> Tensor): a callable activation function
+         """
+         super(ModulatedDeformConv, self).__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.kernel_size = _pair(kernel_size)
+         self.stride = stride
+         self.padding = padding
+         self.dilation = dilation
+         self.groups = groups
+         self.deformable_groups = deformable_groups
+         self.with_bias = bias
+         self.norm = norm
+         self.activation = activation
+
+         self.weight = nn.Parameter(
+             torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+         )
+         if bias:
+             self.bias = nn.Parameter(torch.Tensor(out_channels))
+         else:
+             self.bias = None
+
+         nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+         if self.bias is not None:
+             nn.init.constant_(self.bias, 0)
+
+     def forward(self, x, offset, mask):
+         if x.numel() == 0:
+             output_shape = [
+                 (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                 for i, p, di, k, s in zip(
+                     x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                 )
+             ]
+             output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+             return _NewEmptyTensorOp.apply(x, output_shape)
+
+         x = modulated_deform_conv(
+             x,
+             offset,
+             mask,
+             self.weight,
+             self.bias,
+             self.stride,
+             self.padding,
+             self.dilation,
+             self.groups,
+             self.deformable_groups,
+         )
+         if self.norm is not None:
+             x = self.norm(x)
+         if self.activation is not None:
+             x = self.activation(x)
+         return x
+
+     def extra_repr(self):
+         tmpstr = "in_channels=" + str(self.in_channels)
+         tmpstr += ", out_channels=" + str(self.out_channels)
+         tmpstr += ", kernel_size=" + str(self.kernel_size)
+         tmpstr += ", stride=" + str(self.stride)
+         tmpstr += ", padding=" + str(self.padding)
+         tmpstr += ", dilation=" + str(self.dilation)
+         tmpstr += ", groups=" + str(self.groups)
+         tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+         tmpstr += ", bias=" + str(self.with_bias)
+         return tmpstr
+
+
+ try:
+     from annotator.oneformer.detectron2 import _C
+ except ImportError:
+     # TODO: register ops natively so there is no need to import _C.
+     _msg = "detectron2 is not compiled successfully, please build following the instructions!"
+     _args = ("detectron2._C", _msg)
+     DeformConv = create_dummy_class("DeformConv", *_args)
+     ModulatedDeformConv = create_dummy_class("ModulatedDeformConv", *_args)
+     deform_conv = create_dummy_func("deform_conv", *_args)
+     modulated_deform_conv = create_dummy_func("modulated_deform_conv", *_args)
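
A hedged usage sketch for the DeformConv module above: the offset tensor is normally predicted by an ordinary convolution with 2 * deformable_groups * kH * kW output channels, and the custom kernels need the compiled _C extension on a CUDA device (the CPU fallback only covers deformable_groups == 1). The channel and spatial sizes below are arbitrary.

import torch
from torch import nn
from annotator.oneformer.detectron2.layers.deform_conv import DeformConv

deform = DeformConv(64, 64, kernel_size=3, padding=1, deformable_groups=1).cuda()
offset_conv = nn.Conv2d(64, 2 * 1 * 3 * 3, kernel_size=3, padding=1).cuda()

x = torch.randn(2, 64, 32, 32, device="cuda")
offset = offset_conv(x)   # (2, 18, 32, 32): two offset coordinates per kernel tap
out = deform(x, offset)   # (2, 64, 32, 32)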
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/losses.py ADDED
@@ -0,0 +1,133 @@
+ import math
+ import torch
+
+
+ def diou_loss(
+     boxes1: torch.Tensor,
+     boxes2: torch.Tensor,
+     reduction: str = "none",
+     eps: float = 1e-7,
+ ) -> torch.Tensor:
+     """
+     Distance Intersection over Union Loss (Zhaohui Zheng et. al)
+     https://arxiv.org/abs/1911.08287
+     Args:
+         boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
+         reduction: 'none' | 'mean' | 'sum'
+             'none': No reduction will be applied to the output.
+             'mean': The output will be averaged.
+             'sum': The output will be summed.
+         eps (float): small number to prevent division by zero
+     """
+
+     x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+     x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+     # TODO: use torch._assert_async() when pytorch 1.8 support is dropped
+     assert (x2 >= x1).all(), "bad box: x1 larger than x2"
+     assert (y2 >= y1).all(), "bad box: y1 larger than y2"
+
+     # Intersection keypoints
+     xkis1 = torch.max(x1, x1g)
+     ykis1 = torch.max(y1, y1g)
+     xkis2 = torch.min(x2, x2g)
+     ykis2 = torch.min(y2, y2g)
+
+     intsct = torch.zeros_like(x1)
+     mask = (ykis2 > ykis1) & (xkis2 > xkis1)
+     intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
+     union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
+     iou = intsct / union
+
+     # smallest enclosing box
+     xc1 = torch.min(x1, x1g)
+     yc1 = torch.min(y1, y1g)
+     xc2 = torch.max(x2, x2g)
+     yc2 = torch.max(y2, y2g)
+     diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
+
+     # centers of boxes
+     x_p = (x2 + x1) / 2
+     y_p = (y2 + y1) / 2
+     x_g = (x1g + x2g) / 2
+     y_g = (y1g + y2g) / 2
+     distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
+
+     # Eqn. (7)
+     loss = 1 - iou + (distance / diag_len)
+     if reduction == "mean":
+         loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+     elif reduction == "sum":
+         loss = loss.sum()
+
+     return loss
+
+
+ def ciou_loss(
+     boxes1: torch.Tensor,
+     boxes2: torch.Tensor,
+     reduction: str = "none",
+     eps: float = 1e-7,
+ ) -> torch.Tensor:
+     """
+     Complete Intersection over Union Loss (Zhaohui Zheng et. al)
+     https://arxiv.org/abs/1911.08287
+     Args:
+         boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
+         reduction: 'none' | 'mean' | 'sum'
+             'none': No reduction will be applied to the output.
+             'mean': The output will be averaged.
+             'sum': The output will be summed.
+         eps (float): small number to prevent division by zero
+     """
+
+     x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+     x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+     # TODO: use torch._assert_async() when pytorch 1.8 support is dropped
+     assert (x2 >= x1).all(), "bad box: x1 larger than x2"
+     assert (y2 >= y1).all(), "bad box: y1 larger than y2"
+
+     # Intersection keypoints
+     xkis1 = torch.max(x1, x1g)
+     ykis1 = torch.max(y1, y1g)
+     xkis2 = torch.min(x2, x2g)
+     ykis2 = torch.min(y2, y2g)
+
+     intsct = torch.zeros_like(x1)
+     mask = (ykis2 > ykis1) & (xkis2 > xkis1)
+     intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
+     union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
+     iou = intsct / union
+
+     # smallest enclosing box
+     xc1 = torch.min(x1, x1g)
+     yc1 = torch.min(y1, y1g)
+     xc2 = torch.max(x2, x2g)
+     yc2 = torch.max(y2, y2g)
+     diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
+
+     # centers of boxes
+     x_p = (x2 + x1) / 2
+     y_p = (y2 + y1) / 2
+     x_g = (x1g + x2g) / 2
+     y_g = (y1g + y2g) / 2
+     distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
+
+     # width and height of boxes
+     w_pred = x2 - x1
+     h_pred = y2 - y1
+     w_gt = x2g - x1g
+     h_gt = y2g - y1g
+     v = (4 / (math.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2)
+     with torch.no_grad():
+         alpha = v / (1 - iou + v + eps)
+
+     # Eqn. (10)
+     loss = 1 - iou + (distance / diag_len) + alpha * v
+     if reduction == "mean":
+         loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+     elif reduction == "sum":
+         loss = loss.sum()
+
+     return loss
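
A quick sanity check of the two losses on a pair of axis-aligned XYXY boxes (a minimal sketch; the import path follows this file's location in the package):

import torch
from annotator.oneformer.detectron2.layers.losses import ciou_loss, diou_loss

pred = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
gt = torch.tensor([[2.0, 2.0, 12.0, 12.0]])

print(diou_loss(pred, gt, reduction="mean"))  # 1 - IoU + normalized center distance
print(ciou_loss(pred, gt, reduction="mean"))  # adds the aspect-ratio term alpha * v
print(diou_loss(gt, gt, reduction="mean"))    # ~0 for identical boxes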
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/mask_ops.py ADDED
@@ -0,0 +1,275 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ import numpy as np
+ from typing import Tuple
+ import torch
+ from PIL import Image
+ from torch.nn import functional as F
+
+ __all__ = ["paste_masks_in_image"]
+
+
+ BYTES_PER_FLOAT = 4
+ # TODO: This memory limit may be too much or too little. It would be better to
+ # determine it based on available resources.
+ GPU_MEM_LIMIT = 1024**3  # 1 GB memory limit
+
+
+ def _do_paste_mask(masks, boxes, img_h: int, img_w: int, skip_empty: bool = True):
+     """
+     Args:
+         masks: N, 1, H, W
+         boxes: N, 4
+         img_h, img_w (int):
+         skip_empty (bool): only paste masks within the region that
+             tightly bound all boxes, and returns the results this region only.
+             An important optimization for CPU.
+
+     Returns:
+         if skip_empty == False, a mask of shape (N, img_h, img_w)
+         if skip_empty == True, a mask of shape (N, h', w'), and the slice
+             object for the corresponding region.
+     """
+     # On GPU, paste all masks together (up to chunk size)
+     # by using the entire image to sample the masks
+     # Compared to pasting them one by one,
+     # this has more operations but is faster on COCO-scale dataset.
+     device = masks.device
+
+     if skip_empty and not torch.jit.is_scripting():
+         x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to(
+             dtype=torch.int32
+         )
+         x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+         y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+     else:
+         x0_int, y0_int = 0, 0
+         x1_int, y1_int = img_w, img_h
+     x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+     N = masks.shape[0]
+
+     img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
+     img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
+     img_y = (img_y - y0) / (y1 - y0) * 2 - 1
+     img_x = (img_x - x0) / (x1 - x0) * 2 - 1
+     # img_x, img_y have shapes (N, w), (N, h)
+
+     gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
+     gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
+     grid = torch.stack([gx, gy], dim=3)
+
+     if not torch.jit.is_scripting():
+         if not masks.dtype.is_floating_point:
+             masks = masks.float()
+     img_masks = F.grid_sample(masks, grid.to(masks.dtype), align_corners=False)
+
+     if skip_empty and not torch.jit.is_scripting():
+         return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
+     else:
+         return img_masks[:, 0], ()
+
+
+ # Annotate boxes as Tensor (but not Boxes) in order to use scripting
+ @torch.jit.script_if_tracing
+ def paste_masks_in_image(
+     masks: torch.Tensor, boxes: torch.Tensor, image_shape: Tuple[int, int], threshold: float = 0.5
+ ):
+     """
+     Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image.
+     The location, height, and width for pasting each mask is determined by their
+     corresponding bounding boxes in boxes.
+
+     Note:
+         This is a complicated but more accurate implementation. In actual deployment, it is
+         often enough to use a faster but less accurate implementation.
+         See :func:`paste_mask_in_image_old` in this file for an alternative implementation.
+
+     Args:
+         masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
+             detected object instances in the image and Hmask, Wmask are the mask width and mask
+             height of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1].
+         boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4).
+             boxes[i] and masks[i] correspond to the same object instance.
+         image_shape (tuple): height, width
+         threshold (float): A threshold in [0, 1] for converting the (soft) masks to
+             binary masks.
+
+     Returns:
+         img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
+             number of detected object instances and Himage, Wimage are the image width
+             and height. img_masks[i] is a binary mask for object instance i.
+     """
+
+     assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
+     N = len(masks)
+     if N == 0:
+         return masks.new_empty((0,) + image_shape, dtype=torch.uint8)
+     if not isinstance(boxes, torch.Tensor):
+         boxes = boxes.tensor
+     device = boxes.device
+     assert len(boxes) == N, boxes.shape
+
+     img_h, img_w = image_shape
+
+     # The actual implementation split the input into chunks,
+     # and paste them chunk by chunk.
+     if device.type == "cpu" or torch.jit.is_scripting():
+         # CPU is most efficient when they are pasted one by one with skip_empty=True
+         # so that it performs minimal number of operations.
+         num_chunks = N
+     else:
+         # GPU benefits from parallelism for larger chunks, but may have memory issue
+         # int(img_h) because shape may be tensors in tracing
+         num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
+         assert (
+             num_chunks <= N
+         ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it"
+     chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
+
+     img_masks = torch.zeros(
+         N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8
+     )
+     for inds in chunks:
+         masks_chunk, spatial_inds = _do_paste_mask(
+             masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu"
+         )
+
+         if threshold >= 0:
+             masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
+         else:
+             # for visualization and debugging
+             masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
+
+         if torch.jit.is_scripting():  # Scripting does not use the optimized codepath
+             img_masks[inds] = masks_chunk
+         else:
+             img_masks[(inds,) + spatial_inds] = masks_chunk
+     return img_masks
+
+
+ # The below are the original paste function (from Detectron1) which has
+ # larger quantization error.
+ # It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample.
+
+
+ def paste_mask_in_image_old(mask, box, img_h, img_w, threshold):
+     """
+     Paste a single mask in an image.
+     This is a per-box implementation of :func:`paste_masks_in_image`.
+     This function has larger quantization error due to incorrect pixel
+     modeling and is not used any more.
+
+     Args:
+         mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single
+             object instance. Values are in [0, 1].
+         box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners
+             of the object instance.
+         img_h, img_w (int): Image height and width.
+         threshold (float): Mask binarization threshold in [0, 1].
+
+     Returns:
+         im_mask (Tensor):
+             The resized and binarized object mask pasted into the original
+             image plane (a tensor of shape (img_h, img_w)).
+     """
+     # Conversion from continuous box coordinates to discrete pixel coordinates
+     # via truncation (cast to int32). This determines which pixels to paste the
+     # mask onto.
+     box = box.to(dtype=torch.int32)  # Continuous to discrete coordinate conversion
+     # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to
+     # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1
+     # pixels (not x1 - x0 pixels).
+     samples_w = box[2] - box[0] + 1  # Number of pixel samples, *not* geometric width
+     samples_h = box[3] - box[1] + 1  # Number of pixel samples, *not* geometric height
+
+     # Resample the mask from it's original grid to the new samples_w x samples_h grid
+     mask = Image.fromarray(mask.cpu().numpy())
+     mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR)
+     mask = np.array(mask, copy=False)
+
+     if threshold >= 0:
+         mask = np.array(mask > threshold, dtype=np.uint8)
+         mask = torch.from_numpy(mask)
+     else:
+         # for visualization and debugging, we also
+         # allow it to return an unmodified mask
+         mask = torch.from_numpy(mask * 255).to(torch.uint8)
+
+     im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8)
+     x_0 = max(box[0], 0)
+     x_1 = min(box[2] + 1, img_w)
+     y_0 = max(box[1], 0)
+     y_1 = min(box[3] + 1, img_h)
+
+     im_mask[y_0:y_1, x_0:x_1] = mask[
+         (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
+     ]
+     return im_mask
+
+
+ # Our pixel modeling requires extrapolation for any continuous
+ # coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks,
+ # we would like this extrapolation to be an interpolation between boundary values and zero,
+ # instead of using absolute zero or boundary values.
+ # Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this:
+ # masks, scale = pad_masks(masks[:, 0, :, :], 1)
+ # boxes = scale_boxes(boxes.tensor, scale)
+
+
+ def pad_masks(masks, padding):
+     """
+     Args:
+         masks (tensor): A tensor of shape (B, M, M) representing B masks.
+         padding (int): Number of cells to pad on all sides.
+
+     Returns:
+         The padded masks and the scale factor of the padding size / original size.
+     """
+     B = masks.shape[0]
+     M = masks.shape[-1]
+     pad2 = 2 * padding
+     scale = float(M + pad2) / M
+     padded_masks = masks.new_zeros((B, M + pad2, M + pad2))
+     padded_masks[:, padding:-padding, padding:-padding] = masks
+     return padded_masks, scale
+
+
+ def scale_boxes(boxes, scale):
+     """
+     Args:
+         boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4
+             coords representing the corners x0, y0, x1, y1,
+         scale (float): The box scaling factor.
+
+     Returns:
+         Scaled boxes.
+     """
+     w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
+     h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
+     x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
+     y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
+
+     w_half *= scale
+     h_half *= scale
+
+     scaled_boxes = torch.zeros_like(boxes)
+     scaled_boxes[:, 0] = x_c - w_half
+     scaled_boxes[:, 2] = x_c + w_half
+     scaled_boxes[:, 1] = y_c - h_half
+     scaled_boxes[:, 3] = y_c + h_half
+     return scaled_boxes
+
+
+ @torch.jit.script_if_tracing
+ def _paste_masks_tensor_shape(
+     masks: torch.Tensor,
+     boxes: torch.Tensor,
+     image_shape: Tuple[torch.Tensor, torch.Tensor],
+     threshold: float = 0.5,
+ ):
+     """
+     A wrapper of paste_masks_in_image where image_shape is Tensor.
+     During tracing, shapes might be tensors instead of ints. The Tensor->int
+     conversion should be scripted rather than traced.
+     """
+     return paste_masks_in_image(masks, boxes, (int(image_shape[0]), int(image_shape[1])), threshold)
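
A hedged usage sketch for paste_masks_in_image: two 28x28 soft masks are pasted into a 480x640 image plane at their predicted boxes and binarized at 0.5 (the tensor values below are arbitrary).

import torch
from annotator.oneformer.detectron2.layers.mask_ops import paste_masks_in_image

masks = torch.rand(2, 28, 28)                        # per-instance soft masks in [0, 1]
boxes = torch.tensor([[10.0, 20.0, 110.0, 220.0],
                      [300.0, 50.0, 400.0, 150.0]])  # XYXY, one box per mask
img_masks = paste_masks_in_image(masks, boxes, (480, 640), threshold=0.5)
print(img_masks.shape, img_masks.dtype)              # torch.Size([2, 480, 640]) torch.bool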
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/nms.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+
4
+ import torch
5
+ from torchvision.ops import boxes as box_ops
6
+ from torchvision.ops import nms # noqa . for compatibility
7
+
8
+
9
+ def batched_nms(
10
+ boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
11
+ ):
12
+ """
13
+ Same as torchvision.ops.boxes.batched_nms, but with float().
14
+ """
15
+ assert boxes.shape[-1] == 4
16
+ # Note: Torchvision already has a strategy (https://github.com/pytorch/vision/issues/1311)
17
+ # to decide whether to use coordinate trick or for loop to implement batched_nms. So we
18
+ # just call it directly.
19
+ # Fp16 does not have enough range for batched NMS, so adding float().
20
+ return box_ops.batched_nms(boxes.float(), scores, idxs, iou_threshold)
21
+
22
+
23
+ # Note: this function (nms_rotated) might be moved into
24
+ # torchvision/ops/boxes.py in the future
25
+ def nms_rotated(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float):
26
+ """
27
+ Performs non-maximum suppression (NMS) on the rotated boxes according
28
+ to their intersection-over-union (IoU).
29
+
30
+ Rotated NMS iteratively removes lower scoring rotated boxes which have an
31
+ IoU greater than iou_threshold with another (higher scoring) rotated box.
32
+
33
+ Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as
34
+ RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they
35
+ can be representing completely different objects in certain tasks, e.g., OCR.
36
+
37
+ As for the question of whether rotated-NMS should treat them as faraway boxes
38
+ even though their IOU is 1, it depends on the application and/or ground truth annotation.
39
+
40
+ As an extreme example, consider a single character v and the square box around it.
41
+
42
+ If the angle is 0 degree, the object (text) would be read as 'v';
43
+
44
+ If the angle is 90 degrees, the object (text) would become '>';
45
+
46
+ If the angle is 180 degrees, the object (text) would become '^';
47
+
48
+ If the angle is 270/-90 degrees, the object (text) would become '<'
49
+
50
+ All of these cases have IoU of 1 to each other, and rotated NMS that only
51
+ uses IoU as criterion would only keep one of them with the highest score -
52
+ which, practically, still makes sense in most cases because typically
53
+ only one of theses orientations is the correct one. Also, it does not matter
54
+ as much if the box is only used to classify the object (instead of transcribing
55
+ them with a sequential OCR recognition model) later.
56
+
57
+ On the other hand, when we use IoU to filter proposals that are close to the
58
+ ground truth during training, we should definitely take the angle into account if
59
+ we know the ground truth is labeled with the strictly correct orientation (as in,
60
+ upside-down words are annotated with -180 degrees even though they can be covered
61
+ with a 0/90/-90 degree box, etc.)
62
+
63
+ The way the original dataset is annotated also matters. For example, if the dataset
64
+ is a 4-point polygon dataset that does not enforce ordering of vertices/orientation,
65
+ we can estimate a minimum rotated bounding box to this polygon, but there's no way
66
+ we can tell the correct angle with 100% confidence (as shown above, there could be 4 different
67
+ rotated boxes, with angles differed by 90 degrees to each other, covering the exactly
68
+ same region). In that case we have to just use IoU to determine the box
69
+ proximity (as many detection benchmarks (even for text) do) unless there're other
70
+ assumptions we can make (like width is always larger than height, or the object is not
71
+ rotated by more than 90 degrees CCW/CW, etc.)
72
+
73
+ In summary, not considering angles in rotated NMS seems to be a good option for now,
74
+ but we should be aware of its implications.
75
+
76
+ Args:
77
+ boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in
78
+ (x_center, y_center, width, height, angle_degrees) format.
79
+ scores (Tensor[N]): Scores for each one of the rotated boxes
80
+ iou_threshold (float): Discards all overlapping rotated boxes with IoU < iou_threshold
81
+
82
+ Returns:
83
+ keep (Tensor): int64 tensor with the indices of the elements that have been kept
84
+ by Rotated NMS, sorted in decreasing order of scores
85
+ """
86
+ return torch.ops.detectron2.nms_rotated(boxes, scores, iou_threshold)
87
+
88
+
89
+ # Note: this function (batched_nms_rotated) might be moved into
90
+ # torchvision/ops/boxes.py in the future
91
+
92
+
93
+ @torch.jit.script_if_tracing
94
+ def batched_nms_rotated(
95
+ boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
96
+ ):
97
+ """
98
+ Performs non-maximum suppression in a batched fashion.
99
+
100
+ Each index value correspond to a category, and NMS
101
+ will not be applied between elements of different categories.
102
+
103
+ Args:
104
+ boxes (Tensor[N, 5]):
105
+ boxes where NMS will be performed. They
106
+ are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
107
+ scores (Tensor[N]):
108
+ scores for each one of the boxes
109
+ idxs (Tensor[N]):
110
+ indices of the categories for each one of the boxes.
111
+ iou_threshold (float):
112
+ discards all overlapping boxes
113
+ with IoU < iou_threshold
114
+
115
+ Returns:
116
+ Tensor:
117
+ int64 tensor with the indices of the elements that have been kept
118
+ by NMS, sorted in decreasing order of scores
119
+ """
120
+ assert boxes.shape[-1] == 5
121
+
122
+ if boxes.numel() == 0:
123
+ return torch.empty((0,), dtype=torch.int64, device=boxes.device)
124
+ boxes = boxes.float() # fp16 does not have enough range for batched NMS
125
+ # Strategy: in order to perform NMS independently per class,
126
+ # we add an offset to all the boxes. The offset is dependent
127
+ # only on the class idx, and is large enough so that boxes
128
+ # from different classes do not overlap
129
+
130
+ # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate,
131
+ # which won't handle negative coordinates correctly.
132
+ # Here by using min_coordinate we can make sure the negative coordinates are
133
+ # correctly handled.
134
+ max_coordinate = (
135
+ torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2
136
+ ).max()
137
+ min_coordinate = (
138
+ torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2
139
+ ).min()
140
+ offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1)
141
+ boxes_for_nms = boxes.clone() # avoid modifying the original values in boxes
142
+ boxes_for_nms[:, :2] += offsets[:, None]
143
+ keep = nms_rotated(boxes_for_nms, scores, iou_threshold)
144
+ return keep
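
As a quick illustration of the interface above, here is a minimal usage sketch of batched_nms_rotated (assumptions: the detectron2 C++/CUDA extension that provides torch.ops.detectron2.nms_rotated has been built, the module is importable under the repository layout shown in the file path, and the tensor values are made up):

import torch

from annotator.oneformer.detectron2.layers.nms import batched_nms_rotated

# Rotated boxes in (x_center, y_center, width, height, angle_degrees) format.
# The first two boxes overlap heavily and share a category, so one is suppressed;
# the third box belongs to a different category and is never compared against them.
boxes = torch.tensor(
    [
        [50.0, 50.0, 40.0, 20.0, 0.0],
        [51.0, 50.0, 40.0, 20.0, 5.0],
        [150.0, 150.0, 30.0, 30.0, 45.0],
    ]
)
scores = torch.tensor([0.9, 0.8, 0.7])
idxs = torch.tensor([0, 0, 1])  # per-box category indices

keep = batched_nms_rotated(boxes, scores, idxs, iou_threshold=0.5)
# keep -> int64 indices of the surviving boxes, sorted by decreasing score, e.g. tensor([0, 2])
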
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from torch import nn
3
+ from torchvision.ops import roi_align
4
+
5
+
6
+ # NOTE: torchvision's RoIAlign has a different default aligned=False
7
+ class ROIAlign(nn.Module):
8
+ def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True):
9
+ """
10
+ Args:
11
+ output_size (tuple): h, w
12
+ spatial_scale (float): scale the input boxes by this number
13
+ sampling_ratio (int): number of input samples to take for each output
14
+ sample. 0 to take samples densely.
15
+ aligned (bool): if False, use the legacy implementation in
16
+ Detectron. If True, align the results more precisely.
17
+
18
+ Note:
19
+ The meaning of aligned=True:
20
+
21
+ Given a continuous coordinate c, its two neighboring pixel indices (in our
22
+ pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
23
+ c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
24
+ from the underlying signal at continuous coordinates 0.5 and 1.5). But the original
25
+ roi_align (aligned=False) does not subtract the 0.5 when computing neighboring
26
+ pixel indices and therefore it uses pixels with a slightly incorrect alignment
27
+ (relative to our pixel model) when performing bilinear interpolation.
28
+
29
+ With `aligned=True`,
30
+ we first appropriately scale the ROI and then shift it by -0.5
31
+ prior to calling roi_align. This produces the correct neighbors; see
32
+ detectron2/tests/test_roi_align.py for verification.
33
+
34
+ The difference does not affect the model's performance if
35
+ ROIAlign is used together with conv layers.
36
+ """
37
+ super().__init__()
38
+ self.output_size = output_size
39
+ self.spatial_scale = spatial_scale
40
+ self.sampling_ratio = sampling_ratio
41
+ self.aligned = aligned
42
+
43
+ from torchvision import __version__
44
+
45
+ version = tuple(int(x) for x in __version__.split(".")[:2])
46
+ # https://github.com/pytorch/vision/pull/2438
47
+ assert version >= (0, 7), "Require torchvision >= 0.7"
48
+
49
+ def forward(self, input, rois):
50
+ """
51
+ Args:
52
+ input: NCHW images
53
+ rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
54
+ """
55
+ assert rois.dim() == 2 and rois.size(1) == 5
56
+ if input.is_quantized:
57
+ input = input.dequantize()
58
+ return roi_align(
59
+ input,
60
+ rois.to(dtype=input.dtype),
61
+ self.output_size,
62
+ self.spatial_scale,
63
+ self.sampling_ratio,
64
+ self.aligned,
65
+ )
66
+
67
+ def __repr__(self):
68
+ tmpstr = self.__class__.__name__ + "("
69
+ tmpstr += "output_size=" + str(self.output_size)
70
+ tmpstr += ", spatial_scale=" + str(self.spatial_scale)
71
+ tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
72
+ tmpstr += ", aligned=" + str(self.aligned)
73
+ tmpstr += ")"
74
+ return tmpstr
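
For orientation, a minimal sketch of how the ROIAlign wrapper above might be used (assumptions: torchvision >= 0.7 is installed, the import path matches this repository layout, and the feature-map and box values are made up):

import torch

from annotator.oneformer.detectron2.layers.roi_align import ROIAlign

# Map every box to a fixed 7x7 patch; spatial_scale converts box coordinates
# given in input-image space to this stride-16 feature map.
pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=0, aligned=True)

features = torch.randn(2, 256, 50, 50)  # NCHW feature maps for 2 images
rois = torch.tensor(
    [
        [0.0, 16.0, 16.0, 240.0, 240.0],  # (batch_index, x1, y1, x2, y2)
        [1.0, 32.0, 48.0, 400.0, 320.0],
    ]
)
pooled = pooler(features, rois)  # shape (2, 256, 7, 7)
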
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align_rotated.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+ from torch import nn
4
+ from torch.autograd import Function
5
+ from torch.autograd.function import once_differentiable
6
+ from torch.nn.modules.utils import _pair
7
+
8
+
9
+ class _ROIAlignRotated(Function):
10
+ @staticmethod
11
+ def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
12
+ ctx.save_for_backward(roi)
13
+ ctx.output_size = _pair(output_size)
14
+ ctx.spatial_scale = spatial_scale
15
+ ctx.sampling_ratio = sampling_ratio
16
+ ctx.input_shape = input.size()
17
+ output = torch.ops.detectron2.roi_align_rotated_forward(
18
+ input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
19
+ )
20
+ return output
21
+
22
+ @staticmethod
23
+ @once_differentiable
24
+ def backward(ctx, grad_output):
25
+ (rois,) = ctx.saved_tensors
26
+ output_size = ctx.output_size
27
+ spatial_scale = ctx.spatial_scale
28
+ sampling_ratio = ctx.sampling_ratio
29
+ bs, ch, h, w = ctx.input_shape
30
+ grad_input = torch.ops.detectron2.roi_align_rotated_backward(
31
+ grad_output,
32
+ rois,
33
+ spatial_scale,
34
+ output_size[0],
35
+ output_size[1],
36
+ bs,
37
+ ch,
38
+ h,
39
+ w,
40
+ sampling_ratio,
41
+ )
42
+ return grad_input, None, None, None, None, None
43
+
44
+
45
+ roi_align_rotated = _ROIAlignRotated.apply
46
+
47
+
48
+ class ROIAlignRotated(nn.Module):
49
+ def __init__(self, output_size, spatial_scale, sampling_ratio):
50
+ """
51
+ Args:
52
+ output_size (tuple): h, w
53
+ spatial_scale (float): scale the input boxes by this number
54
+ sampling_ratio (int): number of input samples to take for each output
55
+ sample. 0 to take samples densely.
56
+
57
+ Note:
58
+ ROIAlignRotated supports continuous coordinates by default:
59
+ Given a continuous coordinate c, its two neighboring pixel indices (in our
60
+ pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
61
+ c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
62
+ from the underlying signal at continuous coordinates 0.5 and 1.5).
63
+ """
64
+ super(ROIAlignRotated, self).__init__()
65
+ self.output_size = output_size
66
+ self.spatial_scale = spatial_scale
67
+ self.sampling_ratio = sampling_ratio
68
+
69
+ def forward(self, input, rois):
70
+ """
71
+ Args:
72
+ input: NCHW images
73
+ rois: Bx6 boxes. First column is the index into N.
74
+ The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees).
75
+ """
76
+ assert rois.dim() == 2 and rois.size(1) == 6
77
+ orig_dtype = input.dtype
78
+ if orig_dtype == torch.float16:
79
+ input = input.float()
80
+ rois = rois.float()
81
+ output_size = _pair(self.output_size)
82
+
83
+ # Scripting for Autograd is currently unsupported.
84
+ # This is a quick fix without having to rewrite code on the C++ side
85
+ if torch.jit.is_scripting() or torch.jit.is_tracing():
86
+ return torch.ops.detectron2.roi_align_rotated_forward(
87
+ input, rois, self.spatial_scale, output_size[0], output_size[1], self.sampling_ratio
88
+ ).to(dtype=orig_dtype)
89
+
90
+ return roi_align_rotated(
91
+ input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
92
+ ).to(dtype=orig_dtype)
93
+
94
+ def __repr__(self):
95
+ tmpstr = self.__class__.__name__ + "("
96
+ tmpstr += "output_size=" + str(self.output_size)
97
+ tmpstr += ", spatial_scale=" + str(self.spatial_scale)
98
+ tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
99
+ tmpstr += ")"
100
+ return tmpstr
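
The rotated variant expects one extra column per RoI. A minimal sketch (assumptions: the compiled detectron2 op behind torch.ops.detectron2.roi_align_rotated_forward is available, the import path matches this repository layout, and the values are made up):

import torch

from annotator.oneformer.detectron2.layers.roi_align_rotated import ROIAlignRotated

pooler = ROIAlignRotated(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=0)

features = torch.randn(1, 256, 50, 50)
# Each row: (batch_index, x_ctr, y_ctr, width, height, angle_degrees)
rois = torch.tensor([[0.0, 200.0, 200.0, 120.0, 40.0, 30.0]])
pooled = pooler(features, rois)  # shape (1, 256, 7, 7)
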
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/rotated_boxes.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from __future__ import absolute_import, division, print_function, unicode_literals
3
+ import torch
4
+
5
+
6
+ def pairwise_iou_rotated(boxes1, boxes2):
7
+ """
8
+ Return intersection-over-union (Jaccard index) of boxes.
9
+
10
+ Both sets of boxes are expected to be in
11
+ (x_center, y_center, width, height, angle) format.
12
+
13
+ Arguments:
14
+ boxes1 (Tensor[N, 5])
15
+ boxes2 (Tensor[M, 5])
16
+
17
+ Returns:
18
+ iou (Tensor[N, M]): the NxM matrix containing the pairwise
19
+ IoU values for every element in boxes1 and boxes2
20
+ """
21
+ return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2)
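
A small sketch of the IoU matrix this returns (assumptions: the compiled detectron2 op box_iou_rotated is available and the import path matches this repository layout). A box and its 90-degree rotation about the same center intersect in a 4x4 square here, so their IoU is 16 / 48 = 1/3:

import torch

from annotator.oneformer.detectron2.layers.rotated_boxes import pairwise_iou_rotated

boxes1 = torch.tensor([[10.0, 10.0, 8.0, 4.0, 0.0]])
boxes2 = torch.tensor(
    [
        [10.0, 10.0, 8.0, 4.0, 90.0],  # same center and size, rotated by 90 degrees
        [30.0, 30.0, 8.0, 4.0, 0.0],   # far away, no overlap
    ]
)
iou = pairwise_iou_rotated(boxes1, boxes2)  # shape (1, 2), roughly [[0.3333, 0.0]]
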
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/shape_spec.py ADDED
@@ -0,0 +1,18 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+
7
+ @dataclass
8
+ class ShapeSpec:
9
+ """
10
+ A simple structure that contains basic shape specification about a tensor.
11
+ It is often used as the auxiliary inputs/outputs of models,
12
+ to complement the lack of shape inference ability among PyTorch modules.
13
+ """
14
+
15
+ channels: Optional[int] = None
16
+ height: Optional[int] = None
17
+ width: Optional[int] = None
18
+ stride: Optional[int] = None
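
Since ShapeSpec is a plain dataclass, using it requires nothing beyond constructing it; a minimal sketch (the import path is assumed from this repository layout, and the numbers describe a hypothetical stride-16 feature map):

from annotator.oneformer.detectron2.layers.shape_spec import ShapeSpec

spec = ShapeSpec(channels=1024, stride=16)      # e.g., a "res4"-like feature map
print(spec.channels, spec.stride, spec.height)  # 1024 16 None (unset fields stay None)
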
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/wrappers.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ """
3
+ Wrappers around some nn functions, mainly to support empty tensors.
4
+
5
+ Ideally, support for empty tensors should be added directly to PyTorch in those functions.
6
+
7
+ These can be removed once https://github.com/pytorch/pytorch/issues/12013
8
+ is implemented
9
+ """
10
+
11
+ import warnings
12
+ from typing import List, Optional
13
+ import torch
14
+ from torch.nn import functional as F
15
+
16
+ from annotator.oneformer.detectron2.utils.env import TORCH_VERSION
17
+
18
+
19
+ def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor:
20
+ """
21
+ Turn a list of integer scalars or integer Tensor scalars into a vector,
22
+ in a way that's both traceable and scriptable.
23
+
24
+ In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs.
25
+ In scripting or eager, `x` should be a list of int.
26
+ """
27
+ if torch.jit.is_scripting():
28
+ return torch.as_tensor(x, device=device)
29
+ if torch.jit.is_tracing():
30
+ assert all(
31
+ [isinstance(t, torch.Tensor) for t in x]
32
+ ), "Shape should be tensor during tracing!"
33
+ # as_tensor should not be used in tracing because it records a constant
34
+ ret = torch.stack(x)
35
+ if ret.device != device: # avoid recording a hard-coded device if not necessary
36
+ ret = ret.to(device=device)
37
+ return ret
38
+ return torch.as_tensor(x, device=device)
39
+
40
+
41
+ def check_if_dynamo_compiling():
42
+ if TORCH_VERSION >= (1, 14):
43
+ from torch._dynamo import is_compiling
44
+
45
+ return is_compiling()
46
+ else:
47
+ return False
48
+
49
+
50
+ def cat(tensors: List[torch.Tensor], dim: int = 0):
51
+ """
52
+ Efficient version of torch.cat that avoids a copy if there is only a single element in a list
53
+ """
54
+ assert isinstance(tensors, (list, tuple))
55
+ if len(tensors) == 1:
56
+ return tensors[0]
57
+ return torch.cat(tensors, dim)
58
+
59
+
60
+ def empty_input_loss_func_wrapper(loss_func):
61
+ def wrapped_loss_func(input, target, *, reduction="mean", **kwargs):
62
+ """
63
+ Same as `loss_func`, but returns 0 (instead of nan) for empty inputs.
64
+ """
65
+ if target.numel() == 0 and reduction == "mean":
66
+ return input.sum() * 0.0 # connect the gradient
67
+ return loss_func(input, target, reduction=reduction, **kwargs)
68
+
69
+ return wrapped_loss_func
70
+
71
+
72
+ cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy)
73
+
74
+
75
+ class _NewEmptyTensorOp(torch.autograd.Function):
76
+ @staticmethod
77
+ def forward(ctx, x, new_shape):
78
+ ctx.shape = x.shape
79
+ return x.new_empty(new_shape)
80
+
81
+ @staticmethod
82
+ def backward(ctx, grad):
83
+ shape = ctx.shape
84
+ return _NewEmptyTensorOp.apply(grad, shape), None
85
+
86
+
87
+ class Conv2d(torch.nn.Conv2d):
88
+ """
89
+ A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
90
+ """
91
+
92
+ def __init__(self, *args, **kwargs):
93
+ """
94
+ Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
95
+
96
+ Args:
97
+ norm (nn.Module, optional): a normalization layer
98
+ activation (callable(Tensor) -> Tensor): a callable activation function
99
+
100
+ It assumes that norm layer is used before activation.
101
+ """
102
+ norm = kwargs.pop("norm", None)
103
+ activation = kwargs.pop("activation", None)
104
+ super().__init__(*args, **kwargs)
105
+
106
+ self.norm = norm
107
+ self.activation = activation
108
+
109
+ def forward(self, x):
110
+ # torchscript does not support SyncBatchNorm yet
111
+ # https://github.com/pytorch/pytorch/issues/40507
112
+ # and we skip these codes in torchscript since:
113
+ # 1. currently we only support torchscript in evaluation mode
114
+ # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
115
+ # later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
116
+ if not torch.jit.is_scripting():
117
+ # Dynamo doesn't support context managers yet
118
+ is_dynamo_compiling = check_if_dynamo_compiling()
119
+ if not is_dynamo_compiling:
120
+ with warnings.catch_warnings(record=True):
121
+ if x.numel() == 0 and self.training:
122
+ # https://github.com/pytorch/pytorch/issues/12013
123
+ assert not isinstance(
124
+ self.norm, torch.nn.SyncBatchNorm
125
+ ), "SyncBatchNorm does not support empty inputs!"
126
+
127
+ x = F.conv2d(
128
+ x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
129
+ )
130
+ if self.norm is not None:
131
+ x = self.norm(x)
132
+ if self.activation is not None:
133
+ x = self.activation(x)
134
+ return x
135
+
136
+
137
+ ConvTranspose2d = torch.nn.ConvTranspose2d
138
+ BatchNorm2d = torch.nn.BatchNorm2d
139
+ interpolate = F.interpolate
140
+ Linear = torch.nn.Linear
141
+
142
+
143
+ def nonzero_tuple(x):
144
+ """
145
+ An 'as_tuple=True' version of torch.nonzero to support torchscript,
146
+ because of https://github.com/pytorch/pytorch/issues/38718
147
+ """
148
+ if torch.jit.is_scripting():
149
+ if x.dim() == 0:
150
+ return x.unsqueeze(0).nonzero().unbind(1)
151
+ return x.nonzero().unbind(1)
152
+ else:
153
+ return x.nonzero(as_tuple=True)
154
+
155
+
156
+ @torch.jit.script_if_tracing
157
+ def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor:
158
+ """
159
+ Tracing-friendly way to cast a tensor to another tensor's device. The device will be treated
160
+ as a constant during tracing; scripting the casting process as a whole can work around this issue.
161
+ """
162
+ return src.to(dst.device)
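
A minimal sketch of the Conv2d wrapper and the cat helper above (assumptions: the import path matches this repository layout; the norm and activation choices are arbitrary examples):

import torch
import torch.nn.functional as F

from annotator.oneformer.detectron2.layers.wrappers import Conv2d, cat

# Unlike torch.nn.Conv2d, this wrapper also applies the given norm and activation.
conv = Conv2d(
    16, 32, kernel_size=3, padding=1, bias=False,
    norm=torch.nn.BatchNorm2d(32), activation=F.relu,
)

x = torch.randn(4, 16, 32, 32)
y = conv(x)            # shape (4, 32, 32, 32), with BN and ReLU already applied
z = cat([y], dim=0)    # a single-element list is returned as-is, avoiding a copy
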
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ """
3
+ Model Zoo API for Detectron2: a collection of functions to create common model architectures
4
+ listed in `MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md>`_,
5
+ and optionally load their pre-trained weights.
6
+ """
7
+
8
+ from .model_zoo import get, get_config_file, get_checkpoint_url, get_config
9
+
10
+ __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"]
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/model_zoo.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import os
3
+ from typing import Optional
4
+ import pkg_resources
5
+ import torch
6
+
7
+ from annotator.oneformer.detectron2.checkpoint import DetectionCheckpointer
8
+ from annotator.oneformer.detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
9
+ from annotator.oneformer.detectron2.modeling import build_model
10
+
11
+
12
+ class _ModelZooUrls(object):
13
+ """
14
+ Mapping from names to officially released Detectron2 pre-trained models.
15
+ """
16
+
17
+ S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
18
+
19
+ # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl
20
+ CONFIG_PATH_TO_URL_SUFFIX = {
21
+ # COCO Detection with Faster R-CNN
22
+ "COCO-Detection/faster_rcnn_R_50_C4_1x": "137257644/model_final_721ade.pkl",
23
+ "COCO-Detection/faster_rcnn_R_50_DC5_1x": "137847829/model_final_51d356.pkl",
24
+ "COCO-Detection/faster_rcnn_R_50_FPN_1x": "137257794/model_final_b275ba.pkl",
25
+ "COCO-Detection/faster_rcnn_R_50_C4_3x": "137849393/model_final_f97cb7.pkl",
26
+ "COCO-Detection/faster_rcnn_R_50_DC5_3x": "137849425/model_final_68d202.pkl",
27
+ "COCO-Detection/faster_rcnn_R_50_FPN_3x": "137849458/model_final_280758.pkl",
28
+ "COCO-Detection/faster_rcnn_R_101_C4_3x": "138204752/model_final_298dad.pkl",
29
+ "COCO-Detection/faster_rcnn_R_101_DC5_3x": "138204841/model_final_3e0943.pkl",
30
+ "COCO-Detection/faster_rcnn_R_101_FPN_3x": "137851257/model_final_f6e8b1.pkl",
31
+ "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x": "139173657/model_final_68b088.pkl",
32
+ # COCO Detection with RetinaNet
33
+ "COCO-Detection/retinanet_R_50_FPN_1x": "190397773/model_final_bfca0b.pkl",
34
+ "COCO-Detection/retinanet_R_50_FPN_3x": "190397829/model_final_5bd44e.pkl",
35
+ "COCO-Detection/retinanet_R_101_FPN_3x": "190397697/model_final_971ab9.pkl",
36
+ # COCO Detection with RPN and Fast R-CNN
37
+ "COCO-Detection/rpn_R_50_C4_1x": "137258005/model_final_450694.pkl",
38
+ "COCO-Detection/rpn_R_50_FPN_1x": "137258492/model_final_02ce48.pkl",
39
+ "COCO-Detection/fast_rcnn_R_50_FPN_1x": "137635226/model_final_e5f7ce.pkl",
40
+ # COCO Instance Segmentation Baselines with Mask R-CNN
41
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x": "137259246/model_final_9243eb.pkl",
42
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x": "137260150/model_final_4f86c3.pkl",
43
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "137260431/model_final_a54504.pkl",
44
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x": "137849525/model_final_4ce675.pkl",
45
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x": "137849551/model_final_84107b.pkl",
46
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x": "137849600/model_final_f10217.pkl",
47
+ "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x": "138363239/model_final_a2914c.pkl",
48
+ "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x": "138363294/model_final_0464b7.pkl",
49
+ "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x": "138205316/model_final_a3ec72.pkl",
50
+ "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x": "139653917/model_final_2d9806.pkl", # noqa
51
+ # New baselines using Large-Scale Jitter and Longer Training Schedule
52
+ "new_baselines/mask_rcnn_R_50_FPN_100ep_LSJ": "42047764/model_final_bb69de.pkl",
53
+ "new_baselines/mask_rcnn_R_50_FPN_200ep_LSJ": "42047638/model_final_89a8d3.pkl",
54
+ "new_baselines/mask_rcnn_R_50_FPN_400ep_LSJ": "42019571/model_final_14d201.pkl",
55
+ "new_baselines/mask_rcnn_R_101_FPN_100ep_LSJ": "42025812/model_final_4f7b58.pkl",
56
+ "new_baselines/mask_rcnn_R_101_FPN_200ep_LSJ": "42131867/model_final_0bb7ae.pkl",
57
+ "new_baselines/mask_rcnn_R_101_FPN_400ep_LSJ": "42073830/model_final_f96b26.pkl",
58
+ "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_100ep_LSJ": "42047771/model_final_b7fbab.pkl", # noqa
59
+ "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_200ep_LSJ": "42132721/model_final_5d87c1.pkl", # noqa
60
+ "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_400ep_LSJ": "42025447/model_final_f1362d.pkl", # noqa
61
+ "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_100ep_LSJ": "42047784/model_final_6ba57e.pkl", # noqa
62
+ "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_200ep_LSJ": "42047642/model_final_27b9c1.pkl", # noqa
63
+ "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ": "42045954/model_final_ef3a80.pkl", # noqa
64
+ # COCO Person Keypoint Detection Baselines with Keypoint R-CNN
65
+ "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x": "137261548/model_final_04e291.pkl",
66
+ "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x": "137849621/model_final_a6e10b.pkl",
67
+ "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x": "138363331/model_final_997cc7.pkl",
68
+ "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x": "139686956/model_final_5ad38f.pkl",
69
+ # COCO Panoptic Segmentation Baselines with Panoptic FPN
70
+ "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x": "139514544/model_final_dbfeb4.pkl",
71
+ "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x": "139514569/model_final_c10459.pkl",
72
+ "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x": "139514519/model_final_cafdb1.pkl",
73
+ # LVIS Instance Segmentation Baselines with Mask R-CNN
74
+ "LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "144219072/model_final_571f7c.pkl", # noqa
75
+ "LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x": "144219035/model_final_824ab5.pkl", # noqa
76
+ "LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x": "144219108/model_final_5e3439.pkl", # noqa
77
+ # Cityscapes & Pascal VOC Baselines
78
+ "Cityscapes/mask_rcnn_R_50_FPN": "142423278/model_final_af9cf5.pkl",
79
+ "PascalVOC-Detection/faster_rcnn_R_50_C4": "142202221/model_final_b1acc2.pkl",
80
+ # Other Settings
81
+ "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5": "138602867/model_final_65c703.pkl",
82
+ "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5": "144998336/model_final_821d0b.pkl",
83
+ "Misc/cascade_mask_rcnn_R_50_FPN_1x": "138602847/model_final_e9d89b.pkl",
84
+ "Misc/cascade_mask_rcnn_R_50_FPN_3x": "144998488/model_final_480dd8.pkl",
85
+ "Misc/mask_rcnn_R_50_FPN_3x_syncbn": "169527823/model_final_3b3c51.pkl",
86
+ "Misc/mask_rcnn_R_50_FPN_3x_gn": "138602888/model_final_dc5d9e.pkl",
87
+ "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn": "138602908/model_final_01ca85.pkl",
88
+ "Misc/scratch_mask_rcnn_R_50_FPN_9x_gn": "183808979/model_final_da7b4c.pkl",
89
+ "Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn": "184226666/model_final_5ce33e.pkl",
90
+ "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x": "139797668/model_final_be35db.pkl",
91
+ "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv": "18131413/model_0039999_e76410.pkl", # noqa
92
+ # D1 Comparisons
93
+ "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x": "137781054/model_final_7ab50c.pkl", # noqa
94
+ "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x": "137781281/model_final_62ca52.pkl", # noqa
95
+ "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x": "137781195/model_final_cce136.pkl",
96
+ }
97
+
98
+ @staticmethod
99
+ def query(config_path: str) -> Optional[str]:
100
+ """
101
+ Args:
102
+ config_path: relative config filename
103
+ """
104
+ name = config_path.replace(".yaml", "").replace(".py", "")
105
+ if name in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX:
106
+ suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[name]
107
+ return _ModelZooUrls.S3_PREFIX + name + "/" + suffix
108
+ return None
109
+
110
+
111
+ def get_checkpoint_url(config_path):
112
+ """
113
+ Returns the URL to the model trained using the given config
114
+
115
+ Args:
116
+ config_path (str): config file name relative to detectron2's "configs/"
117
+ directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
118
+
119
+ Returns:
120
+ str: a URL to the model
121
+ """
122
+ url = _ModelZooUrls.query(config_path)
123
+ if url is None:
124
+ raise RuntimeError("Pretrained model for {} is not available!".format(config_path))
125
+ return url
126
+
127
+
128
+ def get_config_file(config_path):
129
+ """
130
+ Returns path to a builtin config file.
131
+
132
+ Args:
133
+ config_path (str): config file name relative to detectron2's "configs/"
134
+ directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
135
+
136
+ Returns:
137
+ str: the real path to the config file.
138
+ """
139
+ cfg_file = pkg_resources.resource_filename(
140
+ "detectron2.model_zoo", os.path.join("configs", config_path)
141
+ )
142
+ if not os.path.exists(cfg_file):
143
+ raise RuntimeError("{} not available in Model Zoo!".format(config_path))
144
+ return cfg_file
145
+
146
+
147
+ def get_config(config_path, trained: bool = False):
148
+ """
149
+ Returns a config object for a model in model zoo.
150
+
151
+ Args:
152
+ config_path (str): config file name relative to detectron2's "configs/"
153
+ directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
154
+ trained (bool): If True, will set ``MODEL.WEIGHTS`` to trained model zoo weights.
155
+ If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used
156
+ instead; this will typically (though not always) initialize a subset of weights using
157
+ an ImageNet pre-trained model, while randomly initializing the other weights.
158
+
159
+ Returns:
160
+ CfgNode or omegaconf.DictConfig: a config object
161
+ """
162
+ cfg_file = get_config_file(config_path)
163
+ if cfg_file.endswith(".yaml"):
164
+ cfg = get_cfg()
165
+ cfg.merge_from_file(cfg_file)
166
+ if trained:
167
+ cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path)
168
+ return cfg
169
+ elif cfg_file.endswith(".py"):
170
+ cfg = LazyConfig.load(cfg_file)
171
+ if trained:
172
+ url = get_checkpoint_url(config_path)
173
+ if "train" in cfg and "init_checkpoint" in cfg.train:
174
+ cfg.train.init_checkpoint = url
175
+ else:
176
+ raise NotImplementedError
177
+ return cfg
178
+
179
+
180
+ def get(config_path, trained: bool = False, device: Optional[str] = None):
181
+ """
182
+ Get a model specified by relative path under Detectron2's official ``configs/`` directory.
183
+
184
+ Args:
185
+ config_path (str): config file name relative to detectron2's "configs/"
186
+ directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
187
+ trained (bool): see :func:`get_config`.
188
+ device (str or None): overwrite the device in config, if given.
189
+
190
+ Returns:
191
+ nn.Module: a detectron2 model. Will be in training mode.
192
+
193
+ Example:
194
+ ::
195
+ from annotator.oneformer.detectron2 import model_zoo
196
+ model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
197
+ """
198
+ cfg = get_config(config_path, trained)
199
+ if device is None and not torch.cuda.is_available():
200
+ device = "cpu"
201
+ if device is not None and isinstance(cfg, CfgNode):
202
+ cfg.MODEL.DEVICE = device
203
+
204
+ if isinstance(cfg, CfgNode):
205
+ model = build_model(cfg)
206
+ DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
207
+ else:
208
+ model = instantiate(cfg.model)
209
+ if device is not None:
210
+ model = model.to(device)
211
+ if "train" in cfg and "init_checkpoint" in cfg.train:
212
+ DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
213
+ return model
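
A minimal sketch of the model zoo helpers above (assumptions: a full detectron2 installation that ships the packaged configs/ directory is available, since get_config_file resolves paths through pkg_resources; calling get(..., trained=True) would additionally download the checkpoint):

from annotator.oneformer.detectron2 import model_zoo

cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

url = model_zoo.get_checkpoint_url(cfg_path)        # URL of the released weights
cfg = model_zoo.get_config(cfg_path, trained=True)  # CfgNode with MODEL.WEIGHTS set
assert cfg.MODEL.WEIGHTS == url
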
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/__init__.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from annotator.oneformer.detectron2.layers import ShapeSpec
3
+
4
+ from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY
5
+ from .backbone import (
6
+ BACKBONE_REGISTRY,
7
+ FPN,
8
+ Backbone,
9
+ ResNet,
10
+ ResNetBlockBase,
11
+ build_backbone,
12
+ build_resnet_backbone,
13
+ make_stage,
14
+ ViT,
15
+ SimpleFeaturePyramid,
16
+ get_vit_lr_decay_rate,
17
+ MViT,
18
+ SwinTransformer,
19
+ )
20
+ from .meta_arch import (
21
+ META_ARCH_REGISTRY,
22
+ SEM_SEG_HEADS_REGISTRY,
23
+ GeneralizedRCNN,
24
+ PanopticFPN,
25
+ ProposalNetwork,
26
+ RetinaNet,
27
+ SemanticSegmentor,
28
+ build_model,
29
+ build_sem_seg_head,
30
+ FCOS,
31
+ )
32
+ from .postprocessing import detector_postprocess
33
+ from .proposal_generator import (
34
+ PROPOSAL_GENERATOR_REGISTRY,
35
+ build_proposal_generator,
36
+ RPN_HEAD_REGISTRY,
37
+ build_rpn_head,
38
+ )
39
+ from .roi_heads import (
40
+ ROI_BOX_HEAD_REGISTRY,
41
+ ROI_HEADS_REGISTRY,
42
+ ROI_KEYPOINT_HEAD_REGISTRY,
43
+ ROI_MASK_HEAD_REGISTRY,
44
+ ROIHeads,
45
+ StandardROIHeads,
46
+ BaseMaskRCNNHead,
47
+ BaseKeypointRCNNHead,
48
+ FastRCNNOutputLayers,
49
+ build_box_head,
50
+ build_keypoint_head,
51
+ build_mask_head,
52
+ build_roi_heads,
53
+ )
54
+ from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
55
+ from .mmdet_wrapper import MMDetBackbone, MMDetDetector
56
+
57
+ _EXCLUDE = {"ShapeSpec"}
58
+ __all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]
59
+
60
+
61
+ from annotator.oneformer.detectron2.utils.env import fixup_module_metadata
62
+
63
+ fixup_module_metadata(__name__, globals(), __all__)
64
+ del fixup_module_metadata
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/anchor_generator.py ADDED
@@ -0,0 +1,386 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import collections
3
+ import math
4
+ from typing import List
5
+ import torch
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.layers import ShapeSpec, move_device_like
10
+ from annotator.oneformer.detectron2.structures import Boxes, RotatedBoxes
11
+ from annotator.oneformer.detectron2.utils.registry import Registry
12
+
13
+ ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR")
14
+ ANCHOR_GENERATOR_REGISTRY.__doc__ = """
15
+ Registry for modules that create object detection anchors for feature maps.
16
+
17
+ The registered object will be called with `obj(cfg, input_shape)`.
18
+ """
19
+
20
+
21
+ class BufferList(nn.Module):
22
+ """
23
+ Similar to nn.ParameterList, but for buffers
24
+ """
25
+
26
+ def __init__(self, buffers):
27
+ super().__init__()
28
+ for i, buffer in enumerate(buffers):
29
+ # Use non-persistent buffer so the values are not saved in checkpoint
30
+ self.register_buffer(str(i), buffer, persistent=False)
31
+
32
+ def __len__(self):
33
+ return len(self._buffers)
34
+
35
+ def __iter__(self):
36
+ return iter(self._buffers.values())
37
+
38
+
39
+ def _create_grid_offsets(
40
+ size: List[int], stride: int, offset: float, target_device_tensor: torch.Tensor
41
+ ):
42
+ grid_height, grid_width = size
43
+ shifts_x = move_device_like(
44
+ torch.arange(offset * stride, grid_width * stride, step=stride, dtype=torch.float32),
45
+ target_device_tensor,
46
+ )
47
+ shifts_y = move_device_like(
48
+ torch.arange(offset * stride, grid_height * stride, step=stride, dtype=torch.float32),
49
+ target_device_tensor,
50
+ )
51
+
52
+ shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
53
+ shift_x = shift_x.reshape(-1)
54
+ shift_y = shift_y.reshape(-1)
55
+ return shift_x, shift_y
56
+
57
+
58
+ def _broadcast_params(params, num_features, name):
59
+ """
60
+ If one size (or aspect ratio) is specified and there are multiple feature
61
+ maps, we "broadcast" anchors of that single size (or aspect ratio)
62
+ over all feature maps.
63
+
64
+ If params is list[float], or list[list[float]] with len(params) == 1, repeat
65
+ it num_features times.
66
+
67
+ Returns:
68
+ list[list[float]]: param for each feature
69
+ """
70
+ assert isinstance(
71
+ params, collections.abc.Sequence
72
+ ), f"{name} in anchor generator has to be a list! Got {params}."
73
+ assert len(params), f"{name} in anchor generator cannot be empty!"
74
+ if not isinstance(params[0], collections.abc.Sequence): # params is list[float]
75
+ return [params] * num_features
76
+ if len(params) == 1:
77
+ return list(params) * num_features
78
+ assert len(params) == num_features, (
79
+ f"Got {name} of length {len(params)} in anchor generator, "
80
+ f"but the number of input features is {num_features}!"
81
+ )
82
+ return params
83
+
84
+
85
+ @ANCHOR_GENERATOR_REGISTRY.register()
86
+ class DefaultAnchorGenerator(nn.Module):
87
+ """
88
+ Compute anchors in the standard ways described in
89
+ "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks".
90
+ """
91
+
92
+ box_dim: torch.jit.Final[int] = 4
93
+ """
94
+ the dimension of each anchor box.
95
+ """
96
+
97
+ @configurable
98
+ def __init__(self, *, sizes, aspect_ratios, strides, offset=0.5):
99
+ """
100
+ This interface is experimental.
101
+
102
+ Args:
103
+ sizes (list[list[float]] or list[float]):
104
+ If ``sizes`` is list[list[float]], ``sizes[i]`` is the list of anchor sizes
105
+ (i.e. sqrt of anchor area) to use for the i-th feature map.
106
+ If ``sizes`` is list[float], ``sizes`` is used for all feature maps.
107
+ Anchor sizes are given in absolute lengths in units of
108
+ the input image; they do not dynamically scale if the input image size changes.
109
+ aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
110
+ (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
111
+ strides (list[int]): stride of each input feature.
112
+ offset (float): Relative offset between the center of the first anchor and the top-left
113
+ corner of the image. Value has to be in [0, 1).
114
+ It is recommended to use 0.5, which means half a stride.
115
+ """
116
+ super().__init__()
117
+
118
+ self.strides = strides
119
+ self.num_features = len(self.strides)
120
+ sizes = _broadcast_params(sizes, self.num_features, "sizes")
121
+ aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
122
+ self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios)
123
+
124
+ self.offset = offset
125
+ assert 0.0 <= self.offset < 1.0, self.offset
126
+
127
+ @classmethod
128
+ def from_config(cls, cfg, input_shape: List[ShapeSpec]):
129
+ return {
130
+ "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
131
+ "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
132
+ "strides": [x.stride for x in input_shape],
133
+ "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
134
+ }
135
+
136
+ def _calculate_anchors(self, sizes, aspect_ratios):
137
+ cell_anchors = [
138
+ self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios)
139
+ ]
140
+ return BufferList(cell_anchors)
141
+
142
+ @property
143
+ @torch.jit.unused
144
+ def num_cell_anchors(self):
145
+ """
146
+ Alias of `num_anchors`.
147
+ """
148
+ return self.num_anchors
149
+
150
+ @property
151
+ @torch.jit.unused
152
+ def num_anchors(self):
153
+ """
154
+ Returns:
155
+ list[int]: Each int is the number of anchors at every pixel
156
+ location, on that feature map.
157
+ For example, if at every pixel we use anchors of 3 aspect
158
+ ratios and 5 sizes, the number of anchors is 15.
159
+ (See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config)
160
+
161
+ In standard RPN models, `num_anchors` on every feature map is the same.
162
+ """
163
+ return [len(cell_anchors) for cell_anchors in self.cell_anchors]
164
+
165
+ def _grid_anchors(self, grid_sizes: List[List[int]]):
166
+ """
167
+ Returns:
168
+ list[Tensor]: #featuremap tensors, each is (#locations x #cell_anchors) x 4
169
+ """
170
+ anchors = []
171
+ # buffers() not supported by torchscript. use named_buffers() instead
172
+ buffers: List[torch.Tensor] = [x[1] for x in self.cell_anchors.named_buffers()]
173
+ for size, stride, base_anchors in zip(grid_sizes, self.strides, buffers):
174
+ shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors)
175
+ shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
176
+
177
+ anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))
178
+
179
+ return anchors
180
+
181
+ def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
182
+ """
183
+ Generate a tensor storing canonical anchor boxes, which are all anchor
184
+ boxes of different sizes and aspect_ratios centered at (0, 0).
185
+ We can later build the set of anchors for a full feature map by
186
+ shifting and tiling these tensors (see `meth:_grid_anchors`).
187
+
188
+ Args:
189
+ sizes (tuple[float]):
190
+ aspect_ratios (tuple[float]):
191
+
192
+ Returns:
193
+ Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes
194
+ in XYXY format.
195
+ """
196
+
197
+ # This is different from the anchor generator defined in the original Faster R-CNN
198
+ # code or Detectron. They yield the same AP, however the old version defines cell
199
+ # anchors in a less natural way with a shift relative to the feature grid and
200
+ # quantization that results in slightly different sizes for different aspect ratios.
201
+ # See also https://github.com/facebookresearch/Detectron/issues/227
202
+
203
+ anchors = []
204
+ for size in sizes:
205
+ area = size**2.0
206
+ for aspect_ratio in aspect_ratios:
207
+ # s * s = w * h
208
+ # a = h / w
209
+ # ... some algebra ...
210
+ # w = sqrt(s * s / a)
211
+ # h = a * w
212
+ w = math.sqrt(area / aspect_ratio)
213
+ h = aspect_ratio * w
214
+ x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
215
+ anchors.append([x0, y0, x1, y1])
216
+ return torch.tensor(anchors)
217
+
218
+ def forward(self, features: List[torch.Tensor]):
219
+ """
220
+ Args:
221
+ features (list[Tensor]): list of backbone feature maps on which to generate anchors.
222
+
223
+ Returns:
224
+ list[Boxes]: a list of Boxes containing all the anchors for each feature map
225
+ (i.e. the cell anchors repeated over all locations in the feature map).
226
+ The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
227
+ where Hi, Wi are resolution of the feature map divided by anchor stride.
228
+ """
229
+ grid_sizes = [feature_map.shape[-2:] for feature_map in features]
230
+ anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
231
+ return [Boxes(x) for x in anchors_over_all_feature_maps]
232
+
233
+
234
+ @ANCHOR_GENERATOR_REGISTRY.register()
235
+ class RotatedAnchorGenerator(nn.Module):
236
+ """
237
+ Compute rotated anchors used by Rotated RPN (RRPN), described in
238
+ "Arbitrary-Oriented Scene Text Detection via Rotation Proposals".
239
+ """
240
+
241
+ box_dim: int = 5
242
+ """
243
+ the dimension of each anchor box.
244
+ """
245
+
246
+ @configurable
247
+ def __init__(self, *, sizes, aspect_ratios, strides, angles, offset=0.5):
248
+ """
249
+ This interface is experimental.
250
+
251
+ Args:
252
+ sizes (list[list[float]] or list[float]):
253
+ If sizes is list[list[float]], sizes[i] is the list of anchor sizes
254
+ (i.e. sqrt of anchor area) to use for the i-th feature map.
255
+ If sizes is list[float], the sizes are used for all feature maps.
256
+ Anchor sizes are given in absolute lengths in units of
257
+ the input image; they do not dynamically scale if the input image size changes.
258
+ aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
259
+ (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
260
+ strides (list[int]): stride of each input feature.
261
+ angles (list[list[float]] or list[float]): list of angles (in degrees CCW)
262
+ to use for anchors. Same "broadcast" rule for `sizes` applies.
263
+ offset (float): Relative offset between the center of the first anchor and the top-left
264
+ corner of the image. Value has to be in [0, 1).
265
+ It is recommended to use 0.5, which means half a stride.
266
+ """
267
+ super().__init__()
268
+
269
+ self.strides = strides
270
+ self.num_features = len(self.strides)
271
+ sizes = _broadcast_params(sizes, self.num_features, "sizes")
272
+ aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
273
+ angles = _broadcast_params(angles, self.num_features, "angles")
274
+ self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles)
275
+
276
+ self.offset = offset
277
+ assert 0.0 <= self.offset < 1.0, self.offset
278
+
279
+ @classmethod
280
+ def from_config(cls, cfg, input_shape: List[ShapeSpec]):
281
+ return {
282
+ "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
283
+ "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
284
+ "strides": [x.stride for x in input_shape],
285
+ "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
286
+ "angles": cfg.MODEL.ANCHOR_GENERATOR.ANGLES,
287
+ }
288
+
289
+ def _calculate_anchors(self, sizes, aspect_ratios, angles):
290
+ cell_anchors = [
291
+ self.generate_cell_anchors(size, aspect_ratio, angle).float()
292
+ for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles)
293
+ ]
294
+ return BufferList(cell_anchors)
295
+
296
+ @property
297
+ def num_cell_anchors(self):
298
+ """
299
+ Alias of `num_anchors`.
300
+ """
301
+ return self.num_anchors
302
+
303
+ @property
304
+ def num_anchors(self):
305
+ """
306
+ Returns:
307
+ list[int]: Each int is the number of anchors at every pixel
308
+ location, on that feature map.
309
+ For example, if at every pixel we use anchors of 3 aspect
310
+ ratios, 2 sizes and 5 angles, the number of anchors is 30.
311
+ (See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS
312
+ and ANCHOR_GENERATOR.ANGLES in config)
313
+
314
+ In standard RRPN models, `num_anchors` on every feature map is the same.
315
+ """
316
+ return [len(cell_anchors) for cell_anchors in self.cell_anchors]
317
+
318
+ def _grid_anchors(self, grid_sizes):
319
+ anchors = []
320
+ for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
321
+ shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors)
322
+ zeros = torch.zeros_like(shift_x)
323
+ shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1)
324
+
325
+ anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5))
326
+
327
+ return anchors
328
+
329
+ def generate_cell_anchors(
330
+ self,
331
+ sizes=(32, 64, 128, 256, 512),
332
+ aspect_ratios=(0.5, 1, 2),
333
+ angles=(-90, -60, -30, 0, 30, 60, 90),
334
+ ):
335
+ """
336
+ Generate a tensor storing canonical anchor boxes, which are all anchor
337
+ boxes of different sizes, aspect_ratios, angles centered at (0, 0).
338
+ We can later build the set of anchors for a full feature map by
339
+ shifting and tiling these tensors (see `meth:_grid_anchors`).
340
+
341
+ Args:
342
+ sizes (tuple[float]):
343
+ aspect_ratios (tuple[float]):
344
+ angles (tuple[float]):
345
+
346
+ Returns:
347
+ Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5)
348
+ storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format.
349
+ """
350
+ anchors = []
351
+ for size in sizes:
352
+ area = size**2.0
353
+ for aspect_ratio in aspect_ratios:
354
+ # s * s = w * h
355
+ # a = h / w
356
+ # ... some algebra ...
357
+ # w = sqrt(s * s / a)
358
+ # h = a * w
359
+ w = math.sqrt(area / aspect_ratio)
360
+ h = aspect_ratio * w
361
+ anchors.extend([0, 0, w, h, a] for a in angles)
362
+
363
+ return torch.tensor(anchors)
364
+
365
+ def forward(self, features):
366
+ """
367
+ Args:
368
+ features (list[Tensor]): list of backbone feature maps on which to generate anchors.
369
+
370
+ Returns:
371
+ list[RotatedBoxes]: a list of Boxes containing all the anchors for each feature map
372
+ (i.e. the cell anchors repeated over all locations in the feature map).
373
+ The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
374
+ where Hi, Wi are resolution of the feature map divided by anchor stride.
375
+ """
376
+ grid_sizes = [feature_map.shape[-2:] for feature_map in features]
377
+ anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
378
+ return [RotatedBoxes(x) for x in anchors_over_all_feature_maps]
379
+
380
+
381
+ def build_anchor_generator(cfg, input_shape):
382
+ """
383
+ Build an anchor generator from `cfg.MODEL.ANCHOR_GENERATOR.NAME`.
384
+ """
385
+ anchor_generator = cfg.MODEL.ANCHOR_GENERATOR.NAME
386
+ return ANCHOR_GENERATOR_REGISTRY.get(anchor_generator)(cfg, input_shape)
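
A minimal sketch of DefaultAnchorGenerator constructed directly with keyword arguments, which @configurable allows alongside the cfg-based path (assumptions: the import works under this repository layout; the feature-map size and anchor settings are made up):

import torch

from annotator.oneformer.detectron2.modeling.anchor_generator import DefaultAnchorGenerator

# One stride-16 feature map, two sizes and three aspect ratios
# -> 2 * 3 = 6 cell anchors per spatial location.
gen = DefaultAnchorGenerator(sizes=[[32, 64]], aspect_ratios=[[0.5, 1.0, 2.0]], strides=[16])

feat = torch.randn(1, 256, 25, 38)  # (N, C, Hi, Wi)
(anchors,) = gen([feat])            # one Boxes object per feature map
print(anchors.tensor.shape)         # torch.Size([5700, 4]), i.e. 25 * 38 * 6 anchors
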
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from .build import build_backbone, BACKBONE_REGISTRY # noqa F401 isort:skip
3
+
4
+ from .backbone import Backbone
5
+ from .fpn import FPN
6
+ from .regnet import RegNet
7
+ from .resnet import (
8
+ BasicStem,
9
+ ResNet,
10
+ ResNetBlockBase,
11
+ build_resnet_backbone,
12
+ make_stage,
13
+ BottleneckBlock,
14
+ )
15
+ from .vit import ViT, SimpleFeaturePyramid, get_vit_lr_decay_rate
16
+ from .mvit import MViT
17
+ from .swin import SwinTransformer
18
+
19
+ __all__ = [k for k in globals().keys() if not k.startswith("_")]
20
+ # TODO can expose more resnet blocks after careful consideration
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/backbone.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from abc import ABCMeta, abstractmethod
3
+ from typing import Dict
4
+ import torch.nn as nn
5
+
6
+ from annotator.oneformer.detectron2.layers import ShapeSpec
7
+
8
+ __all__ = ["Backbone"]
9
+
10
+
11
+ class Backbone(nn.Module, metaclass=ABCMeta):
12
+ """
13
+ Abstract base class for network backbones.
14
+ """
15
+
16
+ def __init__(self):
17
+ """
18
+ The `__init__` method of any subclass can specify its own set of arguments.
19
+ """
20
+ super().__init__()
21
+
22
+ @abstractmethod
23
+ def forward(self):
24
+ """
25
+ Subclasses must override this method, but adhere to the same return type.
26
+
27
+ Returns:
28
+ dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
29
+ """
30
+ pass
31
+
32
+ @property
33
+ def size_divisibility(self) -> int:
34
+ """
35
+ Some backbones require the input height and width to be divisible by a
36
+ specific integer. This is typically true for encoder / decoder type networks
37
+ with lateral connection (e.g., FPN) for which feature maps need to match
38
+ dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
39
+ input size divisibility is required.
40
+ """
41
+ return 0
42
+
43
+ @property
44
+ def padding_constraints(self) -> Dict[str, int]:
45
+ """
46
+ This property is a generalization of size_divisibility. Some backbones and training
47
+ recipes require specific padding constraints, such as enforcing divisibility by a specific
48
+ integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale jitter
49
+ in :paper:vitdet). `padding_constraints` contains these optional items like:
50
+ {
51
+ "size_divisibility": int,
52
+ "square_size": int,
53
+ # Future options are possible
54
+ }
55
+ `size_divisibility` will be read from here if present, and `square_size` indicates the
56
+ square padding size if `square_size` > 0.
57
+
58
+ TODO: use type of Dict[str, int] to avoid torchscript issues. The type of padding_constraints
59
+ could be generalized as TypedDict (Python 3.8+) to support more types in the future.
60
+ """
61
+ return {}
62
+
63
+ def output_shape(self):
64
+ """
65
+ Returns:
66
+ dict[str->ShapeSpec]
67
+ """
68
+ # this is a backward-compatible default
69
+ return {
70
+ name: ShapeSpec(
71
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
72
+ )
73
+ for name in self._out_features
74
+ }
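
To make the contract concrete, a minimal, hypothetical Backbone subclass (a sketch only; the single-level design, names, and channel counts are invented for illustration):

import torch
import torch.nn as nn

from annotator.oneformer.detectron2.modeling.backbone.backbone import Backbone


class ToyBackbone(Backbone):
    """A single-level, stride-16 backbone illustrating the required attributes."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 64, kernel_size=16, stride=16)
        # These three attributes drive the default output_shape() implementation.
        self._out_features = ["toy"]
        self._out_feature_channels = {"toy": 64}
        self._out_feature_strides = {"toy": 16}

    def forward(self, x):
        return {"toy": self.conv(x)}  # dict[str -> Tensor], as the base class requires


spec = ToyBackbone().output_shape()["toy"]  # ShapeSpec(channels=64, stride=16)
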
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/build.py ADDED
@@ -0,0 +1,33 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from annotator.oneformer.detectron2.layers import ShapeSpec
3
+ from annotator.oneformer.detectron2.utils.registry import Registry
4
+
5
+ from .backbone import Backbone
6
+
7
+ BACKBONE_REGISTRY = Registry("BACKBONE")
8
+ BACKBONE_REGISTRY.__doc__ = """
9
+ Registry for backbones, which extract feature maps from images
10
+
11
+ The registered object must be a callable that accepts two arguments:
12
+
13
+ 1. A :class:`detectron2.config.CfgNode`
14
+ 2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification.
15
+
16
+ The registered object must return an instance of :class:`Backbone`.
17
+ """
18
+
19
+
20
+ def build_backbone(cfg, input_shape=None):
21
+ """
22
+ Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
23
+
24
+ Returns:
25
+ an instance of :class:`Backbone`
26
+ """
27
+ if input_shape is None:
28
+ input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
29
+
30
+ backbone_name = cfg.MODEL.BACKBONE.NAME
31
+ backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape)
32
+ assert isinstance(backbone, Backbone)
33
+ return backbone
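
Registering such a backbone and building it from a config might look like the following sketch (assumptions: get_cfg is importable under this repository layout, and ToyBackbone refers to the hypothetical class sketched after backbone.py above):

from annotator.oneformer.detectron2.config import get_cfg
from annotator.oneformer.detectron2.modeling.backbone.build import BACKBONE_REGISTRY, build_backbone


@BACKBONE_REGISTRY.register()
def build_toy_backbone(cfg, input_shape):
    # The builder receives the cfg and an input ShapeSpec and must return a Backbone;
    # here it returns the hypothetical ToyBackbone from the sketch above.
    return ToyBackbone()


cfg = get_cfg()
cfg.MODEL.BACKBONE.NAME = "build_toy_backbone"
backbone = build_backbone(cfg)  # input_shape defaults to ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
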
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/fpn.py ADDED
@@ -0,0 +1,268 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import math
3
+ import fvcore.nn.weight_init as weight_init
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
9
+
10
+ from .backbone import Backbone
11
+ from .build import BACKBONE_REGISTRY
12
+ from .resnet import build_resnet_backbone
13
+
14
+ __all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"]
15
+
16
+
17
+ class FPN(Backbone):
18
+ """
19
+ This module implements :paper:`FPN`.
20
+ It creates pyramid features built on top of some input feature maps.
21
+ """
22
+
23
+ _fuse_type: torch.jit.Final[str]
24
+
25
+ def __init__(
26
+ self,
27
+ bottom_up,
28
+ in_features,
29
+ out_channels,
30
+ norm="",
31
+ top_block=None,
32
+ fuse_type="sum",
33
+ square_pad=0,
34
+ ):
35
+ """
36
+ Args:
37
+ bottom_up (Backbone): module representing the bottom up subnetwork.
38
+ Must be a subclass of :class:`Backbone`. The multi-scale feature
39
+ maps generated by the bottom up network, and listed in `in_features`,
40
+ are used to generate FPN levels.
41
+ in_features (list[str]): names of the input feature maps coming
42
+ from the backbone to which FPN is attached. For example, if the
43
+ backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
44
+ of these may be used; order must be from high to low resolution.
45
+ out_channels (int): number of channels in the output feature maps.
46
+ norm (str): the normalization to use.
47
+ top_block (nn.Module or None): if provided, an extra operation will
48
+ be performed on the output of the last (smallest resolution)
49
+ FPN output, and the result will extend the result list. The top_block
50
+ further downsamples the feature map. It must have an attribute
51
+ "num_levels", meaning the number of extra FPN levels added by
52
+ this block, and "in_feature", which is a string representing
53
+ its input feature (e.g., p5).
54
+ fuse_type (str): types for fusing the top down features and the lateral
55
+ ones. It can be "sum" (default), which sums up element-wise; or "avg",
56
+ which takes the element-wise mean of the two.
57
+ square_pad (int): If > 0, require input images to be padded to specific square size.
58
+ """
59
+ super(FPN, self).__init__()
60
+ assert isinstance(bottom_up, Backbone)
61
+ assert in_features, in_features
62
+
63
+ # Feature map strides and channels from the bottom up network (e.g. ResNet)
64
+ input_shapes = bottom_up.output_shape()
65
+ strides = [input_shapes[f].stride for f in in_features]
66
+ in_channels_per_feature = [input_shapes[f].channels for f in in_features]
67
+
68
+ _assert_strides_are_log2_contiguous(strides)
69
+ lateral_convs = []
70
+ output_convs = []
71
+
72
+ use_bias = norm == ""
73
+ for idx, in_channels in enumerate(in_channels_per_feature):
74
+ lateral_norm = get_norm(norm, out_channels)
75
+ output_norm = get_norm(norm, out_channels)
76
+
77
+ lateral_conv = Conv2d(
78
+ in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
79
+ )
80
+ output_conv = Conv2d(
81
+ out_channels,
82
+ out_channels,
83
+ kernel_size=3,
84
+ stride=1,
85
+ padding=1,
86
+ bias=use_bias,
87
+ norm=output_norm,
88
+ )
89
+ weight_init.c2_xavier_fill(lateral_conv)
90
+ weight_init.c2_xavier_fill(output_conv)
91
+ stage = int(math.log2(strides[idx]))
92
+ self.add_module("fpn_lateral{}".format(stage), lateral_conv)
93
+ self.add_module("fpn_output{}".format(stage), output_conv)
94
+
95
+ lateral_convs.append(lateral_conv)
96
+ output_convs.append(output_conv)
97
+ # Place convs into top-down order (from low to high resolution)
98
+ # to make the top-down computation in forward clearer.
99
+ self.lateral_convs = lateral_convs[::-1]
100
+ self.output_convs = output_convs[::-1]
101
+ self.top_block = top_block
102
+ self.in_features = tuple(in_features)
103
+ self.bottom_up = bottom_up
104
+ # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
105
+ self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
106
+ # top block output feature maps.
107
+ if self.top_block is not None:
108
+ for s in range(stage, stage + self.top_block.num_levels):
109
+ self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
110
+
111
+ self._out_features = list(self._out_feature_strides.keys())
112
+ self._out_feature_channels = {k: out_channels for k in self._out_features}
113
+ self._size_divisibility = strides[-1]
114
+ self._square_pad = square_pad
115
+ assert fuse_type in {"avg", "sum"}
116
+ self._fuse_type = fuse_type
117
+
118
+ @property
119
+ def size_divisibility(self):
120
+ return self._size_divisibility
121
+
122
+ @property
123
+ def padding_constraints(self):
124
+ return {"square_size": self._square_pad}
125
+
126
+ def forward(self, x):
127
+ """
128
+ Args:
129
+ x (Tensor): input to the bottom-up backbone (e.g., an NCHW image tensor); its
130
+ multi-scale feature maps are then used to build the FPN levels, in high to low resolution order.
131
+
132
+ Returns:
133
+ dict[str->Tensor]:
134
+ mapping from feature map name to FPN feature map tensor
135
+ in high to low resolution order. Returned feature names follow the FPN
136
+ paper convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
137
+ ["p2", "p3", ..., "p6"].
138
+ """
139
+ bottom_up_features = self.bottom_up(x)
140
+ results = []
141
+ prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]])
142
+ results.append(self.output_convs[0](prev_features))
143
+
144
+ # Reverse feature maps into top-down order (from low to high resolution)
145
+ for idx, (lateral_conv, output_conv) in enumerate(
146
+ zip(self.lateral_convs, self.output_convs)
147
+ ):
148
+ # Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336
149
+ # Therefore we loop over all modules but skip the first one
150
+ if idx > 0:
151
+ features = self.in_features[-idx - 1]
152
+ features = bottom_up_features[features]
153
+ top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest")
154
+ lateral_features = lateral_conv(features)
155
+ prev_features = lateral_features + top_down_features
156
+ if self._fuse_type == "avg":
157
+ prev_features /= 2
158
+ results.insert(0, output_conv(prev_features))
159
+
160
+ if self.top_block is not None:
161
+ if self.top_block.in_feature in bottom_up_features:
162
+ top_block_in_feature = bottom_up_features[self.top_block.in_feature]
163
+ else:
164
+ top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
165
+ results.extend(self.top_block(top_block_in_feature))
166
+ assert len(self._out_features) == len(results)
167
+ return {f: res for f, res in zip(self._out_features, results)}
168
+
169
+ def output_shape(self):
170
+ return {
171
+ name: ShapeSpec(
172
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
173
+ )
174
+ for name in self._out_features
175
+ }
176
+
177
+
178
+ def _assert_strides_are_log2_contiguous(strides):
179
+ """
180
+ Assert that each stride is 2x its preceding stride, i.e. "contiguous in log2".
181
+ """
182
+ for i, stride in enumerate(strides[1:], 1):
183
+ assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(
184
+ stride, strides[i - 1]
185
+ )
186
+
187
+
188
+ class LastLevelMaxPool(nn.Module):
189
+ """
190
+ This module is used in the original FPN to generate a downsampled
191
+ P6 feature from P5.
192
+ """
193
+
194
+ def __init__(self):
195
+ super().__init__()
196
+ self.num_levels = 1
197
+ self.in_feature = "p5"
198
+
199
+ def forward(self, x):
200
+ return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
201
+
202
+
203
+ class LastLevelP6P7(nn.Module):
204
+ """
205
+ This module is used in RetinaNet to generate extra layers, P6 and P7 from
206
+ the C5 feature.
207
+ """
208
+
209
+ def __init__(self, in_channels, out_channels, in_feature="res5"):
210
+ super().__init__()
211
+ self.num_levels = 2
212
+ self.in_feature = in_feature
213
+ self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
214
+ self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
215
+ for module in [self.p6, self.p7]:
216
+ weight_init.c2_xavier_fill(module)
217
+
218
+ def forward(self, c5):
219
+ p6 = self.p6(c5)
220
+ p7 = self.p7(F.relu(p6))
221
+ return [p6, p7]
222
+
223
+
224
+ @BACKBONE_REGISTRY.register()
225
+ def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
226
+ """
227
+ Args:
228
+ cfg: a detectron2 CfgNode
229
+
230
+ Returns:
231
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
232
+ """
233
+ bottom_up = build_resnet_backbone(cfg, input_shape)
234
+ in_features = cfg.MODEL.FPN.IN_FEATURES
235
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
236
+ backbone = FPN(
237
+ bottom_up=bottom_up,
238
+ in_features=in_features,
239
+ out_channels=out_channels,
240
+ norm=cfg.MODEL.FPN.NORM,
241
+ top_block=LastLevelMaxPool(),
242
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
243
+ )
244
+ return backbone
245
+
246
+
247
+ @BACKBONE_REGISTRY.register()
248
+ def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
249
+ """
250
+ Args:
251
+ cfg: a detectron2 CfgNode
252
+
253
+ Returns:
254
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
255
+ """
256
+ bottom_up = build_resnet_backbone(cfg, input_shape)
257
+ in_features = cfg.MODEL.FPN.IN_FEATURES
258
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
259
+ in_channels_p6p7 = bottom_up.output_shape()["res5"].channels
260
+ backbone = FPN(
261
+ bottom_up=bottom_up,
262
+ in_features=in_features,
263
+ out_channels=out_channels,
264
+ norm=cfg.MODEL.FPN.NORM,
265
+ top_block=LastLevelP6P7(in_channels_p6p7, out_channels),
266
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
267
+ )
268
+ return backbone
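
For reference, the FPN added above can also be wired up without a CfgNode. The sketch below is not part of this commit; it assumes the vendored import paths (`annotator.oneformer.detectron2...`) resolve as in the files above, and the channel/feature choices are illustrative.

```python
# Minimal sketch (assumption: vendored import paths work as shown in this commit).
import torch
from annotator.oneformer.detectron2.modeling.backbone.resnet import ResNet, BasicStem
from annotator.oneformer.detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool

# Bottom-up network: a plain ResNet-50 returning res2..res5.
bottom_up = ResNet(
    BasicStem(in_channels=3, out_channels=64, norm="BN"),
    ResNet.make_default_stages(50, norm="BN"),
    out_features=["res2", "res3", "res4", "res5"],
)

# FPN with 256-channel lateral/output convs; LastLevelMaxPool adds p6 from p5.
fpn = FPN(
    bottom_up=bottom_up,
    in_features=["res2", "res3", "res4", "res5"],
    out_channels=256,
    top_block=LastLevelMaxPool(),
    fuse_type="sum",
)

# H and W must be divisible by fpn.size_divisibility (32 here, the stride of res5).
feats = fpn(torch.randn(1, 3, 256, 256))
print({k: tuple(v.shape) for k, v in feats.items()})  # p2..p6, all with 256 channels
```
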
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/mvit.py ADDED
@@ -0,0 +1,448 @@
1
+ import logging
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .backbone import Backbone
7
+ from .utils import (
8
+ PatchEmbed,
9
+ add_decomposed_rel_pos,
10
+ get_abs_pos,
11
+ window_partition,
12
+ window_unpartition,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ __all__ = ["MViT"]
19
+
20
+
21
+ def attention_pool(x, pool, norm=None):
22
+ # (B, H, W, C) -> (B, C, H, W)
23
+ x = x.permute(0, 3, 1, 2)
24
+ x = pool(x)
25
+ # (B, C, H1, W1) -> (B, H1, W1, C)
26
+ x = x.permute(0, 2, 3, 1)
27
+ if norm:
28
+ x = norm(x)
29
+
30
+ return x
31
+
32
+
33
+ class MultiScaleAttention(nn.Module):
34
+ """Multiscale Multi-head Attention block."""
35
+
36
+ def __init__(
37
+ self,
38
+ dim,
39
+ dim_out,
40
+ num_heads,
41
+ qkv_bias=True,
42
+ norm_layer=nn.LayerNorm,
43
+ pool_kernel=(3, 3),
44
+ stride_q=1,
45
+ stride_kv=1,
46
+ residual_pooling=True,
47
+ window_size=0,
48
+ use_rel_pos=False,
49
+ rel_pos_zero_init=True,
50
+ input_size=None,
51
+ ):
52
+ """
53
+ Args:
54
+ dim (int): Number of input channels.
55
+ dim_out (int): Number of output channels.
56
+ num_heads (int): Number of attention heads.
57
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
58
+ norm_layer (nn.Module): Normalization layer.
59
+ pool_kernel (tuple): kernel size for qkv pooling layers.
60
+ stride_q (int): stride size for q pooling layer.
61
+ stride_kv (int): stride size for kv pooling layer.
62
+ residual_pooling (bool): If true, enable residual pooling.
63
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
64
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
65
+ input_size (int or None): Input resolution.
66
+ """
67
+ super().__init__()
68
+ self.num_heads = num_heads
69
+ head_dim = dim_out // num_heads
70
+ self.scale = head_dim**-0.5
71
+
72
+ self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias)
73
+ self.proj = nn.Linear(dim_out, dim_out)
74
+
75
+ # qkv pooling
76
+ pool_padding = [k // 2 for k in pool_kernel]
77
+ dim_conv = dim_out // num_heads
78
+ self.pool_q = nn.Conv2d(
79
+ dim_conv,
80
+ dim_conv,
81
+ pool_kernel,
82
+ stride=stride_q,
83
+ padding=pool_padding,
84
+ groups=dim_conv,
85
+ bias=False,
86
+ )
87
+ self.norm_q = norm_layer(dim_conv)
88
+ self.pool_k = nn.Conv2d(
89
+ dim_conv,
90
+ dim_conv,
91
+ pool_kernel,
92
+ stride=stride_kv,
93
+ padding=pool_padding,
94
+ groups=dim_conv,
95
+ bias=False,
96
+ )
97
+ self.norm_k = norm_layer(dim_conv)
98
+ self.pool_v = nn.Conv2d(
99
+ dim_conv,
100
+ dim_conv,
101
+ pool_kernel,
102
+ stride=stride_kv,
103
+ padding=pool_padding,
104
+ groups=dim_conv,
105
+ bias=False,
106
+ )
107
+ self.norm_v = norm_layer(dim_conv)
108
+
109
+ self.window_size = window_size
110
+ if window_size:
111
+ self.q_win_size = window_size // stride_q
112
+ self.kv_win_size = window_size // stride_kv
113
+ self.residual_pooling = residual_pooling
114
+
115
+ self.use_rel_pos = use_rel_pos
116
+ if self.use_rel_pos:
117
+ # initialize relative positional embeddings
118
+ assert input_size[0] == input_size[1]
119
+ size = input_size[0]
120
+ rel_dim = 2 * max(size // stride_q, size // stride_kv) - 1
121
+ self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim))
122
+ self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim))
123
+
124
+ if not rel_pos_zero_init:
125
+ nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
126
+ nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
127
+
128
+ def forward(self, x):
129
+ B, H, W, _ = x.shape
130
+ # qkv with shape (3, B, nHead, H, W, C)
131
+ qkv = self.qkv(x).reshape(B, H, W, 3, self.num_heads, -1).permute(3, 0, 4, 1, 2, 5)
132
+ # q, k, v with shape (B * nHead, H, W, C)
133
+ q, k, v = qkv.reshape(3, B * self.num_heads, H, W, -1).unbind(0)
134
+
135
+ q = attention_pool(q, self.pool_q, self.norm_q)
136
+ k = attention_pool(k, self.pool_k, self.norm_k)
137
+ v = attention_pool(v, self.pool_v, self.norm_v)
138
+
139
+ ori_q = q
140
+ if self.window_size:
141
+ q, q_hw_pad = window_partition(q, self.q_win_size)
142
+ k, kv_hw_pad = window_partition(k, self.kv_win_size)
143
+ v, _ = window_partition(v, self.kv_win_size)
144
+ q_hw = (self.q_win_size, self.q_win_size)
145
+ kv_hw = (self.kv_win_size, self.kv_win_size)
146
+ else:
147
+ q_hw = q.shape[1:3]
148
+ kv_hw = k.shape[1:3]
149
+
150
+ q = q.view(q.shape[0], np.prod(q_hw), -1)
151
+ k = k.view(k.shape[0], np.prod(kv_hw), -1)
152
+ v = v.view(v.shape[0], np.prod(kv_hw), -1)
153
+
154
+ attn = (q * self.scale) @ k.transpose(-2, -1)
155
+
156
+ if self.use_rel_pos:
157
+ attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, q_hw, kv_hw)
158
+
159
+ attn = attn.softmax(dim=-1)
160
+ x = attn @ v
161
+
162
+ x = x.view(x.shape[0], q_hw[0], q_hw[1], -1)
163
+
164
+ if self.window_size:
165
+ x = window_unpartition(x, self.q_win_size, q_hw_pad, ori_q.shape[1:3])
166
+
167
+ if self.residual_pooling:
168
+ x += ori_q
169
+
170
+ H, W = x.shape[1], x.shape[2]
171
+ x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
172
+ x = self.proj(x)
173
+
174
+ return x
175
+
176
+
177
+ class MultiScaleBlock(nn.Module):
178
+ """Multiscale Transformer blocks"""
179
+
180
+ def __init__(
181
+ self,
182
+ dim,
183
+ dim_out,
184
+ num_heads,
185
+ mlp_ratio=4.0,
186
+ qkv_bias=True,
187
+ drop_path=0.0,
188
+ norm_layer=nn.LayerNorm,
189
+ act_layer=nn.GELU,
190
+ qkv_pool_kernel=(3, 3),
191
+ stride_q=1,
192
+ stride_kv=1,
193
+ residual_pooling=True,
194
+ window_size=0,
195
+ use_rel_pos=False,
196
+ rel_pos_zero_init=True,
197
+ input_size=None,
198
+ ):
199
+ """
200
+ Args:
201
+ dim (int): Number of input channels.
202
+ dim_out (int): Number of output channels.
203
+ num_heads (int): Number of attention heads in the MViT block.
204
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
205
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
206
+ drop_path (float): Stochastic depth rate.
207
+ norm_layer (nn.Module): Normalization layer.
208
+ act_layer (nn.Module): Activation layer.
209
+ qkv_pool_kernel (tuple): kernel size for qkv pooling layers.
210
+ stride_q (int): stride size for q pooling layer.
211
+ stride_kv (int): stride size for kv pooling layer.
212
+ residual_pooling (bool): If true, enable residual pooling.
213
+ window_size (int): Window size for window attention blocks. If it equals 0, window
214
+ attention is not used.
215
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
216
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
217
+ input_size (int or None): Input resolution.
218
+ """
219
+ super().__init__()
220
+ self.norm1 = norm_layer(dim)
221
+ self.attn = MultiScaleAttention(
222
+ dim,
223
+ dim_out,
224
+ num_heads=num_heads,
225
+ qkv_bias=qkv_bias,
226
+ norm_layer=norm_layer,
227
+ pool_kernel=qkv_pool_kernel,
228
+ stride_q=stride_q,
229
+ stride_kv=stride_kv,
230
+ residual_pooling=residual_pooling,
231
+ window_size=window_size,
232
+ use_rel_pos=use_rel_pos,
233
+ rel_pos_zero_init=rel_pos_zero_init,
234
+ input_size=input_size,
235
+ )
236
+
237
+ from timm.models.layers import DropPath, Mlp
238
+
239
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
240
+ self.norm2 = norm_layer(dim_out)
241
+ self.mlp = Mlp(
242
+ in_features=dim_out,
243
+ hidden_features=int(dim_out * mlp_ratio),
244
+ out_features=dim_out,
245
+ act_layer=act_layer,
246
+ )
247
+
248
+ if dim != dim_out:
249
+ self.proj = nn.Linear(dim, dim_out)
250
+
251
+ if stride_q > 1:
252
+ kernel_skip = stride_q + 1
253
+ padding_skip = int(kernel_skip // 2)
254
+ self.pool_skip = nn.MaxPool2d(kernel_skip, stride_q, padding_skip, ceil_mode=False)
255
+
256
+ def forward(self, x):
257
+ x_norm = self.norm1(x)
258
+ x_block = self.attn(x_norm)
259
+
260
+ if hasattr(self, "proj"):
261
+ x = self.proj(x_norm)
262
+ if hasattr(self, "pool_skip"):
263
+ x = attention_pool(x, self.pool_skip)
264
+
265
+ x = x + self.drop_path(x_block)
266
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
267
+
268
+ return x
269
+
270
+
271
+ class MViT(Backbone):
272
+ """
273
+ This module implements Multiscale Vision Transformer (MViT) backbone in :paper:'mvitv2'.
274
+ """
275
+
276
+ def __init__(
277
+ self,
278
+ img_size=224,
279
+ patch_kernel=(7, 7),
280
+ patch_stride=(4, 4),
281
+ patch_padding=(3, 3),
282
+ in_chans=3,
283
+ embed_dim=96,
284
+ depth=16,
285
+ num_heads=1,
286
+ last_block_indexes=(0, 2, 11, 15),
287
+ qkv_pool_kernel=(3, 3),
288
+ adaptive_kv_stride=4,
289
+ adaptive_window_size=56,
290
+ residual_pooling=True,
291
+ mlp_ratio=4.0,
292
+ qkv_bias=True,
293
+ drop_path_rate=0.0,
294
+ norm_layer=nn.LayerNorm,
295
+ act_layer=nn.GELU,
296
+ use_abs_pos=False,
297
+ use_rel_pos=True,
298
+ rel_pos_zero_init=True,
299
+ use_act_checkpoint=False,
300
+ pretrain_img_size=224,
301
+ pretrain_use_cls_token=True,
302
+ out_features=("scale2", "scale3", "scale4", "scale5"),
303
+ ):
304
+ """
305
+ Args:
306
+ img_size (int): Input image size.
307
+ patch_kernel (tuple): kernel size for patch embedding.
308
+ patch_stride (tuple): stride size for patch embedding.
309
+ patch_padding (tuple): padding size for patch embedding.
310
+ in_chans (int): Number of input image channels.
311
+ embed_dim (int): Patch embedding dimension.
312
+ depth (int): Depth of MViT.
313
+ num_heads (int): Number of base attention heads in each MViT block.
314
+ last_block_indexes (tuple): Block indexes for last blocks in each stage.
315
+ qkv_pool_kernel (tuple): kernel size for qkv pooling layers.
316
+ adaptive_kv_stride (int): adaptive stride size for kv pooling.
317
+ adaptive_window_size (int): adaptive window size for window attention blocks.
318
+ residual_pooling (bool): If true, enable residual pooling.
319
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
320
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
321
+ drop_path_rate (float): Stochastic depth rate.
322
+ norm_layer (nn.Module): Normalization layer.
323
+ act_layer (nn.Module): Activation layer.
324
+ use_abs_pos (bool): If True, use absolute positional embeddings.
325
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
326
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
327
+ window_size (int): Window size for window attention blocks.
328
+ use_act_checkpoint (bool): If True, use activation checkpointing.
329
+ pretrain_img_size (int): input image size for pretraining models.
330
+ pretrain_use_cls_token (bool): If True, pretraining models use a class token.
331
+ out_features (tuple): name of the feature maps from each stage.
332
+ """
333
+ super().__init__()
334
+ self.pretrain_use_cls_token = pretrain_use_cls_token
335
+
336
+ self.patch_embed = PatchEmbed(
337
+ kernel_size=patch_kernel,
338
+ stride=patch_stride,
339
+ padding=patch_padding,
340
+ in_chans=in_chans,
341
+ embed_dim=embed_dim,
342
+ )
343
+
344
+ if use_abs_pos:
345
+ # Initialize absolute positional embedding with pretrain image size.
346
+ num_patches = (pretrain_img_size // patch_stride[0]) * (
347
+ pretrain_img_size // patch_stride[1]
348
+ )
349
+ num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
350
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
351
+ else:
352
+ self.pos_embed = None
353
+
354
+ # stochastic depth decay rule
355
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
356
+ dim_out = embed_dim
357
+ stride_kv = adaptive_kv_stride
358
+ window_size = adaptive_window_size
359
+ input_size = (img_size // patch_stride[0], img_size // patch_stride[1])
360
+ stage = 2
361
+ stride = patch_stride[0]
362
+ self._out_feature_strides = {}
363
+ self._out_feature_channels = {}
364
+ self.blocks = nn.ModuleList()
365
+ for i in range(depth):
366
+ # Multiply stride_kv by 2 if it's the last block of stage2 and stage3.
367
+ if i == last_block_indexes[1] or i == last_block_indexes[2]:
368
+ stride_kv_ = stride_kv * 2
369
+ else:
370
+ stride_kv_ = stride_kv
371
+ # hybrid window attention: global attention in last three stages.
372
+ window_size_ = 0 if i in last_block_indexes[1:] else window_size
373
+ block = MultiScaleBlock(
374
+ dim=embed_dim,
375
+ dim_out=dim_out,
376
+ num_heads=num_heads,
377
+ mlp_ratio=mlp_ratio,
378
+ qkv_bias=qkv_bias,
379
+ drop_path=dpr[i],
380
+ norm_layer=norm_layer,
381
+ qkv_pool_kernel=qkv_pool_kernel,
382
+ stride_q=2 if i - 1 in last_block_indexes else 1,
383
+ stride_kv=stride_kv_,
384
+ residual_pooling=residual_pooling,
385
+ window_size=window_size_,
386
+ use_rel_pos=use_rel_pos,
387
+ rel_pos_zero_init=rel_pos_zero_init,
388
+ input_size=input_size,
389
+ )
390
+ if use_act_checkpoint:
391
+ # TODO: use torch.utils.checkpoint
392
+ from fairscale.nn.checkpoint import checkpoint_wrapper
393
+
394
+ block = checkpoint_wrapper(block)
395
+ self.blocks.append(block)
396
+
397
+ embed_dim = dim_out
398
+ if i in last_block_indexes:
399
+ name = f"scale{stage}"
400
+ if name in out_features:
401
+ self._out_feature_channels[name] = dim_out
402
+ self._out_feature_strides[name] = stride
403
+ self.add_module(f"{name}_norm", norm_layer(dim_out))
404
+
405
+ dim_out *= 2
406
+ num_heads *= 2
407
+ stride_kv = max(stride_kv // 2, 1)
408
+ stride *= 2
409
+ stage += 1
410
+ if i - 1 in last_block_indexes:
411
+ window_size = window_size // 2
412
+ input_size = [s // 2 for s in input_size]
413
+
414
+ self._out_features = out_features
415
+ self._last_block_indexes = last_block_indexes
416
+
417
+ if self.pos_embed is not None:
418
+ nn.init.trunc_normal_(self.pos_embed, std=0.02)
419
+
420
+ self.apply(self._init_weights)
421
+
422
+ def _init_weights(self, m):
423
+ if isinstance(m, nn.Linear):
424
+ nn.init.trunc_normal_(m.weight, std=0.02)
425
+ if isinstance(m, nn.Linear) and m.bias is not None:
426
+ nn.init.constant_(m.bias, 0)
427
+ elif isinstance(m, nn.LayerNorm):
428
+ nn.init.constant_(m.bias, 0)
429
+ nn.init.constant_(m.weight, 1.0)
430
+
431
+ def forward(self, x):
432
+ x = self.patch_embed(x)
433
+
434
+ if self.pos_embed is not None:
435
+ x = x + get_abs_pos(self.pos_embed, self.pretrain_use_cls_token, x.shape[1:3])
436
+
437
+ outputs = {}
438
+ stage = 2
439
+ for i, blk in enumerate(self.blocks):
440
+ x = blk(x)
441
+ if i in self._last_block_indexes:
442
+ name = f"scale{stage}"
443
+ if name in self._out_features:
444
+ x_out = getattr(self, f"{name}_norm")(x)
445
+ outputs[name] = x_out.permute(0, 3, 1, 2)
446
+ stage += 1
447
+
448
+ return outputs
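
The MViT backbone added above can be exercised standalone. The sketch below is not from this commit; it assumes the vendored import path and that `timm` (used for `DropPath`/`Mlp` inside `MultiScaleBlock`) is installed, and the chosen `out_features` are illustrative.

```python
# Minimal sketch (assumptions: vendored import path; timm available for DropPath/Mlp).
import torch
from annotator.oneformer.detectron2.modeling.backbone.mvit import MViT

# The defaults above describe a 16-block model whose stages end at last_block_indexes;
# out_features selects which "scaleN" maps are returned (scale2=stride 4 ... scale5=stride 32).
model = MViT(img_size=224, out_features=("scale3", "scale4", "scale5"))

feats = model(torch.randn(1, 3, 224, 224))
for name, feat in feats.items():
    print(name, tuple(feat.shape))  # (B, C, H, W) after the per-scale LayerNorm + permute
```
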
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/regnet.py ADDED
@@ -0,0 +1,452 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Implementation of RegNet models from :paper:`dds` and :paper:`scaling`.
4
+
5
+ This code is adapted from https://github.com/facebookresearch/pycls with minimal modifications.
6
+ Some code duplication exists between RegNet and ResNets (e.g., ResStem) in order to simplify
7
+ model loading.
8
+ """
9
+
10
+ import numpy as np
11
+ from torch import nn
12
+
13
+ from annotator.oneformer.detectron2.layers import CNNBlockBase, ShapeSpec, get_norm
14
+
15
+ from .backbone import Backbone
16
+
17
+ __all__ = [
18
+ "AnyNet",
19
+ "RegNet",
20
+ "ResStem",
21
+ "SimpleStem",
22
+ "VanillaBlock",
23
+ "ResBasicBlock",
24
+ "ResBottleneckBlock",
25
+ ]
26
+
27
+
28
+ def conv2d(w_in, w_out, k, *, stride=1, groups=1, bias=False):
29
+ """Helper for building a conv2d layer."""
30
+ assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
31
+ s, p, g, b = stride, (k - 1) // 2, groups, bias
32
+ return nn.Conv2d(w_in, w_out, k, stride=s, padding=p, groups=g, bias=b)
33
+
34
+
35
+ def gap2d():
36
+ """Helper for building a global average pooling layer."""
37
+ return nn.AdaptiveAvgPool2d((1, 1))
38
+
39
+
40
+ def pool2d(k, *, stride=1):
41
+ """Helper for building a pool2d layer."""
42
+ assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
43
+ return nn.MaxPool2d(k, stride=stride, padding=(k - 1) // 2)
44
+
45
+
46
+ def init_weights(m):
47
+ """Performs ResNet-style weight initialization."""
48
+ if isinstance(m, nn.Conv2d):
49
+ # Note that there is no bias due to BN
50
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
51
+ m.weight.data.normal_(mean=0.0, std=np.sqrt(2.0 / fan_out))
52
+ elif isinstance(m, nn.BatchNorm2d):
53
+ m.weight.data.fill_(1.0)
54
+ m.bias.data.zero_()
55
+ elif isinstance(m, nn.Linear):
56
+ m.weight.data.normal_(mean=0.0, std=0.01)
57
+ m.bias.data.zero_()
58
+
59
+
60
+ class ResStem(CNNBlockBase):
61
+ """ResNet stem for ImageNet: 7x7, BN, AF, MaxPool."""
62
+
63
+ def __init__(self, w_in, w_out, norm, activation_class):
64
+ super().__init__(w_in, w_out, 4)
65
+ self.conv = conv2d(w_in, w_out, 7, stride=2)
66
+ self.bn = get_norm(norm, w_out)
67
+ self.af = activation_class()
68
+ self.pool = pool2d(3, stride=2)
69
+
70
+ def forward(self, x):
71
+ for layer in self.children():
72
+ x = layer(x)
73
+ return x
74
+
75
+
76
+ class SimpleStem(CNNBlockBase):
77
+ """Simple stem for ImageNet: 3x3, BN, AF."""
78
+
79
+ def __init__(self, w_in, w_out, norm, activation_class):
80
+ super().__init__(w_in, w_out, 2)
81
+ self.conv = conv2d(w_in, w_out, 3, stride=2)
82
+ self.bn = get_norm(norm, w_out)
83
+ self.af = activation_class()
84
+
85
+ def forward(self, x):
86
+ for layer in self.children():
87
+ x = layer(x)
88
+ return x
89
+
90
+
91
+ class SE(nn.Module):
92
+ """Squeeze-and-Excitation (SE) block: AvgPool, FC, Act, FC, Sigmoid."""
93
+
94
+ def __init__(self, w_in, w_se, activation_class):
95
+ super().__init__()
96
+ self.avg_pool = gap2d()
97
+ self.f_ex = nn.Sequential(
98
+ conv2d(w_in, w_se, 1, bias=True),
99
+ activation_class(),
100
+ conv2d(w_se, w_in, 1, bias=True),
101
+ nn.Sigmoid(),
102
+ )
103
+
104
+ def forward(self, x):
105
+ return x * self.f_ex(self.avg_pool(x))
106
+
107
+
108
+ class VanillaBlock(CNNBlockBase):
109
+ """Vanilla block: [3x3 conv, BN, Relu] x2."""
110
+
111
+ def __init__(self, w_in, w_out, stride, norm, activation_class, _params):
112
+ super().__init__(w_in, w_out, stride)
113
+ self.a = conv2d(w_in, w_out, 3, stride=stride)
114
+ self.a_bn = get_norm(norm, w_out)
115
+ self.a_af = activation_class()
116
+ self.b = conv2d(w_out, w_out, 3)
117
+ self.b_bn = get_norm(norm, w_out)
118
+ self.b_af = activation_class()
119
+
120
+ def forward(self, x):
121
+ for layer in self.children():
122
+ x = layer(x)
123
+ return x
124
+
125
+
126
+ class BasicTransform(nn.Module):
127
+ """Basic transformation: [3x3 conv, BN, Relu] x2."""
128
+
129
+ def __init__(self, w_in, w_out, stride, norm, activation_class, _params):
130
+ super().__init__()
131
+ self.a = conv2d(w_in, w_out, 3, stride=stride)
132
+ self.a_bn = get_norm(norm, w_out)
133
+ self.a_af = activation_class()
134
+ self.b = conv2d(w_out, w_out, 3)
135
+ self.b_bn = get_norm(norm, w_out)
136
+ self.b_bn.final_bn = True
137
+
138
+ def forward(self, x):
139
+ for layer in self.children():
140
+ x = layer(x)
141
+ return x
142
+
143
+
144
+ class ResBasicBlock(CNNBlockBase):
145
+ """Residual basic block: x + f(x), f = basic transform."""
146
+
147
+ def __init__(self, w_in, w_out, stride, norm, activation_class, params):
148
+ super().__init__(w_in, w_out, stride)
149
+ self.proj, self.bn = None, None
150
+ if (w_in != w_out) or (stride != 1):
151
+ self.proj = conv2d(w_in, w_out, 1, stride=stride)
152
+ self.bn = get_norm(norm, w_out)
153
+ self.f = BasicTransform(w_in, w_out, stride, norm, activation_class, params)
154
+ self.af = activation_class()
155
+
156
+ def forward(self, x):
157
+ x_p = self.bn(self.proj(x)) if self.proj else x
158
+ return self.af(x_p + self.f(x))
159
+
160
+
161
+ class BottleneckTransform(nn.Module):
162
+ """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1."""
163
+
164
+ def __init__(self, w_in, w_out, stride, norm, activation_class, params):
165
+ super().__init__()
166
+ w_b = int(round(w_out * params["bot_mul"]))
167
+ w_se = int(round(w_in * params["se_r"]))
168
+ groups = w_b // params["group_w"]
169
+ self.a = conv2d(w_in, w_b, 1)
170
+ self.a_bn = get_norm(norm, w_b)
171
+ self.a_af = activation_class()
172
+ self.b = conv2d(w_b, w_b, 3, stride=stride, groups=groups)
173
+ self.b_bn = get_norm(norm, w_b)
174
+ self.b_af = activation_class()
175
+ self.se = SE(w_b, w_se, activation_class) if w_se else None
176
+ self.c = conv2d(w_b, w_out, 1)
177
+ self.c_bn = get_norm(norm, w_out)
178
+ self.c_bn.final_bn = True
179
+
180
+ def forward(self, x):
181
+ for layer in self.children():
182
+ x = layer(x)
183
+ return x
184
+
185
+
186
+ class ResBottleneckBlock(CNNBlockBase):
187
+ """Residual bottleneck block: x + f(x), f = bottleneck transform."""
188
+
189
+ def __init__(self, w_in, w_out, stride, norm, activation_class, params):
190
+ super().__init__(w_in, w_out, stride)
191
+ self.proj, self.bn = None, None
192
+ if (w_in != w_out) or (stride != 1):
193
+ self.proj = conv2d(w_in, w_out, 1, stride=stride)
194
+ self.bn = get_norm(norm, w_out)
195
+ self.f = BottleneckTransform(w_in, w_out, stride, norm, activation_class, params)
196
+ self.af = activation_class()
197
+
198
+ def forward(self, x):
199
+ x_p = self.bn(self.proj(x)) if self.proj else x
200
+ return self.af(x_p + self.f(x))
201
+
202
+
203
+ class AnyStage(nn.Module):
204
+ """AnyNet stage (sequence of blocks w/ the same output shape)."""
205
+
206
+ def __init__(self, w_in, w_out, stride, d, block_class, norm, activation_class, params):
207
+ super().__init__()
208
+ for i in range(d):
209
+ block = block_class(w_in, w_out, stride, norm, activation_class, params)
210
+ self.add_module("b{}".format(i + 1), block)
211
+ stride, w_in = 1, w_out
212
+
213
+ def forward(self, x):
214
+ for block in self.children():
215
+ x = block(x)
216
+ return x
217
+
218
+
219
+ class AnyNet(Backbone):
220
+ """AnyNet model. See :paper:`dds`."""
221
+
222
+ def __init__(
223
+ self,
224
+ *,
225
+ stem_class,
226
+ stem_width,
227
+ block_class,
228
+ depths,
229
+ widths,
230
+ group_widths,
231
+ strides,
232
+ bottleneck_ratios,
233
+ se_ratio,
234
+ activation_class,
235
+ freeze_at=0,
236
+ norm="BN",
237
+ out_features=None,
238
+ ):
239
+ """
240
+ Args:
241
+ stem_class (callable): A callable taking 4 arguments (channels in, channels out,
242
+ normalization, callable returning an activation function) that returns another
243
+ callable implementing the stem module.
244
+ stem_width (int): The number of output channels that the stem produces.
245
+ block_class (callable): A callable taking 6 arguments (channels in, channels out,
246
+ stride, normalization, callable returning an activation function, a dict of
247
+ block-specific parameters) that returns another callable implementing the repeated
248
+ block module.
249
+ depths (list[int]): Number of blocks in each stage.
250
+ widths (list[int]): For each stage, the number of output channels of each block.
251
+ group_widths (list[int]): For each stage, the number of channels per group in group
252
+ convolution, if the block uses group convolution.
253
+ strides (list[int]): The stride that each network stage applies to its input.
254
+ bottleneck_ratios (list[float]): For each stage, the ratio of the number of bottleneck
255
+ channels to the number of block input channels (or, equivalently, output channels),
256
+ if the block uses a bottleneck.
257
+ se_ratio (float): The ratio of the number of channels used inside the squeeze-excitation
258
+ (SE) module to it number of input channels, if SE the block uses SE.
259
+ activation_class (callable): A callable taking no arguments that returns another
260
+ callable implementing an activation function.
261
+ freeze_at (int): The number of stages at the beginning to freeze.
262
+ see :meth:`freeze` for detailed explanation.
263
+ norm (str or callable): normalization for all conv layers.
264
+ See :func:`layers.get_norm` for supported format.
265
+ out_features (list[str]): name of the layers whose outputs should
266
+ be returned in forward. RegNet's use "stem" and "s1", "s2", etc for the stages after
267
+ the stem. If None, will return the output of the last layer.
268
+ """
269
+ super().__init__()
270
+ self.stem = stem_class(3, stem_width, norm, activation_class)
271
+
272
+ current_stride = self.stem.stride
273
+ self._out_feature_strides = {"stem": current_stride}
274
+ self._out_feature_channels = {"stem": self.stem.out_channels}
275
+ self.stages_and_names = []
276
+ prev_w = stem_width
277
+
278
+ for i, (d, w, s, b, g) in enumerate(
279
+ zip(depths, widths, strides, bottleneck_ratios, group_widths)
280
+ ):
281
+ params = {"bot_mul": b, "group_w": g, "se_r": se_ratio}
282
+ stage = AnyStage(prev_w, w, s, d, block_class, norm, activation_class, params)
283
+ name = "s{}".format(i + 1)
284
+ self.add_module(name, stage)
285
+ self.stages_and_names.append((stage, name))
286
+ self._out_feature_strides[name] = current_stride = int(
287
+ current_stride * np.prod([k.stride for k in stage.children()])
288
+ )
289
+ self._out_feature_channels[name] = list(stage.children())[-1].out_channels
290
+ prev_w = w
291
+
292
+ self.apply(init_weights)
293
+
294
+ if out_features is None:
295
+ out_features = [name]
296
+ self._out_features = out_features
297
+ assert len(self._out_features)
298
+ children = [x[0] for x in self.named_children()]
299
+ for out_feature in self._out_features:
300
+ assert out_feature in children, "Available children: {} does not include {}".format(
301
+ ", ".join(children), out_feature
302
+ )
303
+ self.freeze(freeze_at)
304
+
305
+ def forward(self, x):
306
+ """
307
+ Args:
308
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
309
+
310
+ Returns:
311
+ dict[str->Tensor]: names and the corresponding features
312
+ """
313
+ assert x.dim() == 4, f"Model takes an input of shape (N, C, H, W). Got {x.shape} instead!"
314
+ outputs = {}
315
+ x = self.stem(x)
316
+ if "stem" in self._out_features:
317
+ outputs["stem"] = x
318
+ for stage, name in self.stages_and_names:
319
+ x = stage(x)
320
+ if name in self._out_features:
321
+ outputs[name] = x
322
+ return outputs
323
+
324
+ def output_shape(self):
325
+ return {
326
+ name: ShapeSpec(
327
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
328
+ )
329
+ for name in self._out_features
330
+ }
331
+
332
+ def freeze(self, freeze_at=0):
333
+ """
334
+ Freeze the first several stages of the model. Commonly used in fine-tuning.
335
+
336
+ Layers that produce the same feature map spatial size are defined as one
337
+ "stage" by :paper:`FPN`.
338
+
339
+ Args:
340
+ freeze_at (int): number of stages to freeze.
341
+ `1` means freezing the stem. `2` means freezing the stem and
342
+ one residual stage, etc.
343
+
344
+ Returns:
345
+ nn.Module: this model itself
346
+ """
347
+ if freeze_at >= 1:
348
+ self.stem.freeze()
349
+ for idx, (stage, _) in enumerate(self.stages_and_names, start=2):
350
+ if freeze_at >= idx:
351
+ for block in stage.children():
352
+ block.freeze()
353
+ return self
354
+
355
+
356
+ def adjust_block_compatibility(ws, bs, gs):
357
+ """Adjusts the compatibility of widths, bottlenecks, and groups."""
358
+ assert len(ws) == len(bs) == len(gs)
359
+ assert all(w > 0 and b > 0 and g > 0 for w, b, g in zip(ws, bs, gs))
360
+ vs = [int(max(1, w * b)) for w, b in zip(ws, bs)]
361
+ gs = [int(min(g, v)) for g, v in zip(gs, vs)]
362
+ ms = [np.lcm(g, b) if b > 1 else g for g, b in zip(gs, bs)]
363
+ vs = [max(m, int(round(v / m) * m)) for v, m in zip(vs, ms)]
364
+ ws = [int(v / b) for v, b in zip(vs, bs)]
365
+ assert all(w * b % g == 0 for w, b, g in zip(ws, bs, gs))
366
+ return ws, bs, gs
367
+
368
+
369
+ def generate_regnet_parameters(w_a, w_0, w_m, d, q=8):
370
+ """Generates per stage widths and depths from RegNet parameters."""
371
+ assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0
372
+ # Generate continuous per-block ws
373
+ ws_cont = np.arange(d) * w_a + w_0
374
+ # Generate quantized per-block ws
375
+ ks = np.round(np.log(ws_cont / w_0) / np.log(w_m))
376
+ ws_all = w_0 * np.power(w_m, ks)
377
+ ws_all = np.round(np.divide(ws_all, q)).astype(int) * q
378
+ # Generate per stage ws and ds (assumes ws_all are sorted)
379
+ ws, ds = np.unique(ws_all, return_counts=True)
380
+ # Compute number of actual stages and total possible stages
381
+ num_stages, total_stages = len(ws), ks.max() + 1
382
+ # Convert numpy arrays to lists and return
383
+ ws, ds, ws_all, ws_cont = (x.tolist() for x in (ws, ds, ws_all, ws_cont))
384
+ return ws, ds, num_stages, total_stages, ws_all, ws_cont
385
+
386
+
387
+ class RegNet(AnyNet):
388
+ """RegNet model. See :paper:`dds`."""
389
+
390
+ def __init__(
391
+ self,
392
+ *,
393
+ stem_class,
394
+ stem_width,
395
+ block_class,
396
+ depth,
397
+ w_a,
398
+ w_0,
399
+ w_m,
400
+ group_width,
401
+ stride=2,
402
+ bottleneck_ratio=1.0,
403
+ se_ratio=0.0,
404
+ activation_class=None,
405
+ freeze_at=0,
406
+ norm="BN",
407
+ out_features=None,
408
+ ):
409
+ """
410
+ Build a RegNet from the parameterization described in :paper:`dds` Section 3.3.
411
+
412
+ Args:
413
+ See :class:`AnyNet` for arguments that are not listed here.
414
+ depth (int): Total number of blocks in the RegNet.
415
+ w_a (float): Factor by which block width would increase prior to quantizing block widths
416
+ by stage. See :paper:`dds` Section 3.3.
417
+ w_0 (int): Initial block width. See :paper:`dds` Section 3.3.
418
+ w_m (float): Parameter controlling block width quantization.
419
+ See :paper:`dds` Section 3.3.
420
+ group_width (int): Number of channels per group in group convolution, if the block uses
421
+ group convolution.
422
+ bottleneck_ratio (float): The ratio of the number of bottleneck channels to the number
423
+ of block input channels (or, equivalently, output channels), if the block uses a
424
+ bottleneck.
425
+ stride (int): The stride that each network stage applies to its input.
426
+ """
427
+ ws, ds = generate_regnet_parameters(w_a, w_0, w_m, depth)[0:2]
428
+ ss = [stride for _ in ws]
429
+ bs = [bottleneck_ratio for _ in ws]
430
+ gs = [group_width for _ in ws]
431
+ ws, bs, gs = adjust_block_compatibility(ws, bs, gs)
432
+
433
+ def default_activation_class():
434
+ return nn.ReLU(inplace=True)
435
+
436
+ super().__init__(
437
+ stem_class=stem_class,
438
+ stem_width=stem_width,
439
+ block_class=block_class,
440
+ depths=ds,
441
+ widths=ws,
442
+ strides=ss,
443
+ group_widths=gs,
444
+ bottleneck_ratios=bs,
445
+ se_ratio=se_ratio,
446
+ activation_class=default_activation_class
447
+ if activation_class is None
448
+ else activation_class,
449
+ freeze_at=freeze_at,
450
+ norm=norm,
451
+ out_features=out_features,
452
+ )
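
For reference, the RegNet class above can be instantiated directly from the (depth, w_a, w_0, w_m, group_width) parameterization. The sketch below is not part of this commit; the parameter values are illustrative (roughly a RegNetX-4GF-style setting, an assumption rather than values taken from this diff) and the import path assumes the vendored layout.

```python
# Minimal sketch (assumptions: vendored import path; parameter values are illustrative).
import torch
from annotator.oneformer.detectron2.modeling.backbone.regnet import (
    RegNet, SimpleStem, ResBottleneckBlock,
)

backbone = RegNet(
    stem_class=SimpleStem,
    stem_width=32,
    block_class=ResBottleneckBlock,
    depth=23, w_a=38.65, w_0=96, w_m=2.43, group_width=40,  # illustrative RegNet params
    norm="BN",
    out_features=["s1", "s2", "s3", "s4"],
)

print(backbone.output_shape())  # per-stage channels and strides (4, 8, 16, 32)
feats = backbone(torch.randn(1, 3, 224, 224))
print({k: tuple(v.shape) for k, v in feats.items()})
```
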
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/resnet.py ADDED
@@ -0,0 +1,694 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import numpy as np
3
+ import fvcore.nn.weight_init as weight_init
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.layers import (
9
+ CNNBlockBase,
10
+ Conv2d,
11
+ DeformConv,
12
+ ModulatedDeformConv,
13
+ ShapeSpec,
14
+ get_norm,
15
+ )
16
+
17
+ from .backbone import Backbone
18
+ from .build import BACKBONE_REGISTRY
19
+
20
+ __all__ = [
21
+ "ResNetBlockBase",
22
+ "BasicBlock",
23
+ "BottleneckBlock",
24
+ "DeformBottleneckBlock",
25
+ "BasicStem",
26
+ "ResNet",
27
+ "make_stage",
28
+ "build_resnet_backbone",
29
+ ]
30
+
31
+
32
+ class BasicBlock(CNNBlockBase):
33
+ """
34
+ The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
35
+ with two 3x3 conv layers and a projection shortcut if needed.
36
+ """
37
+
38
+ def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
39
+ """
40
+ Args:
41
+ in_channels (int): Number of input channels.
42
+ out_channels (int): Number of output channels.
43
+ stride (int): Stride for the first conv.
44
+ norm (str or callable): normalization for all conv layers.
45
+ See :func:`layers.get_norm` for supported format.
46
+ """
47
+ super().__init__(in_channels, out_channels, stride)
48
+
49
+ if in_channels != out_channels:
50
+ self.shortcut = Conv2d(
51
+ in_channels,
52
+ out_channels,
53
+ kernel_size=1,
54
+ stride=stride,
55
+ bias=False,
56
+ norm=get_norm(norm, out_channels),
57
+ )
58
+ else:
59
+ self.shortcut = None
60
+
61
+ self.conv1 = Conv2d(
62
+ in_channels,
63
+ out_channels,
64
+ kernel_size=3,
65
+ stride=stride,
66
+ padding=1,
67
+ bias=False,
68
+ norm=get_norm(norm, out_channels),
69
+ )
70
+
71
+ self.conv2 = Conv2d(
72
+ out_channels,
73
+ out_channels,
74
+ kernel_size=3,
75
+ stride=1,
76
+ padding=1,
77
+ bias=False,
78
+ norm=get_norm(norm, out_channels),
79
+ )
80
+
81
+ for layer in [self.conv1, self.conv2, self.shortcut]:
82
+ if layer is not None: # shortcut can be None
83
+ weight_init.c2_msra_fill(layer)
84
+
85
+ def forward(self, x):
86
+ out = self.conv1(x)
87
+ out = F.relu_(out)
88
+ out = self.conv2(out)
89
+
90
+ if self.shortcut is not None:
91
+ shortcut = self.shortcut(x)
92
+ else:
93
+ shortcut = x
94
+
95
+ out += shortcut
96
+ out = F.relu_(out)
97
+ return out
98
+
99
+
100
+ class BottleneckBlock(CNNBlockBase):
101
+ """
102
+ The standard bottleneck residual block used by ResNet-50, 101 and 152
103
+ defined in :paper:`ResNet`. It contains 3 conv layers with kernels
104
+ 1x1, 3x3, 1x1, and a projection shortcut if needed.
105
+ """
106
+
107
+ def __init__(
108
+ self,
109
+ in_channels,
110
+ out_channels,
111
+ *,
112
+ bottleneck_channels,
113
+ stride=1,
114
+ num_groups=1,
115
+ norm="BN",
116
+ stride_in_1x1=False,
117
+ dilation=1,
118
+ ):
119
+ """
120
+ Args:
121
+ bottleneck_channels (int): number of output channels for the 3x3
122
+ "bottleneck" conv layers.
123
+ num_groups (int): number of groups for the 3x3 conv layer.
124
+ norm (str or callable): normalization for all conv layers.
125
+ See :func:`layers.get_norm` for supported format.
126
+ stride_in_1x1 (bool): when stride>1, whether to put stride in the
127
+ first 1x1 convolution or the bottleneck 3x3 convolution.
128
+ dilation (int): the dilation rate of the 3x3 conv layer.
129
+ """
130
+ super().__init__(in_channels, out_channels, stride)
131
+
132
+ if in_channels != out_channels:
133
+ self.shortcut = Conv2d(
134
+ in_channels,
135
+ out_channels,
136
+ kernel_size=1,
137
+ stride=stride,
138
+ bias=False,
139
+ norm=get_norm(norm, out_channels),
140
+ )
141
+ else:
142
+ self.shortcut = None
143
+
144
+ # The original MSRA ResNet models have stride in the first 1x1 conv
145
+ # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
146
+ # stride in the 3x3 conv
147
+ stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
148
+
149
+ self.conv1 = Conv2d(
150
+ in_channels,
151
+ bottleneck_channels,
152
+ kernel_size=1,
153
+ stride=stride_1x1,
154
+ bias=False,
155
+ norm=get_norm(norm, bottleneck_channels),
156
+ )
157
+
158
+ self.conv2 = Conv2d(
159
+ bottleneck_channels,
160
+ bottleneck_channels,
161
+ kernel_size=3,
162
+ stride=stride_3x3,
163
+ padding=1 * dilation,
164
+ bias=False,
165
+ groups=num_groups,
166
+ dilation=dilation,
167
+ norm=get_norm(norm, bottleneck_channels),
168
+ )
169
+
170
+ self.conv3 = Conv2d(
171
+ bottleneck_channels,
172
+ out_channels,
173
+ kernel_size=1,
174
+ bias=False,
175
+ norm=get_norm(norm, out_channels),
176
+ )
177
+
178
+ for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
179
+ if layer is not None: # shortcut can be None
180
+ weight_init.c2_msra_fill(layer)
181
+
182
+ # Zero-initialize the last normalization in each residual branch,
183
+ # so that at the beginning, the residual branch starts with zeros,
184
+ # and each residual block behaves like an identity.
185
+ # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
186
+ # "For BN layers, the learnable scaling coefficient γ is initialized
187
+ # to be 1, except for each residual block's last BN
188
+ # where γ is initialized to be 0."
189
+
190
+ # nn.init.constant_(self.conv3.norm.weight, 0)
191
+ # TODO this somehow hurts performance when training GN models from scratch.
192
+ # Add it as an option when we need to use this code to train a backbone.
193
+
194
+ def forward(self, x):
195
+ out = self.conv1(x)
196
+ out = F.relu_(out)
197
+
198
+ out = self.conv2(out)
199
+ out = F.relu_(out)
200
+
201
+ out = self.conv3(out)
202
+
203
+ if self.shortcut is not None:
204
+ shortcut = self.shortcut(x)
205
+ else:
206
+ shortcut = x
207
+
208
+ out += shortcut
209
+ out = F.relu_(out)
210
+ return out
211
+
212
+
213
+ class DeformBottleneckBlock(CNNBlockBase):
214
+ """
215
+ Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
216
+ in the 3x3 convolution.
217
+ """
218
+
219
+ def __init__(
220
+ self,
221
+ in_channels,
222
+ out_channels,
223
+ *,
224
+ bottleneck_channels,
225
+ stride=1,
226
+ num_groups=1,
227
+ norm="BN",
228
+ stride_in_1x1=False,
229
+ dilation=1,
230
+ deform_modulated=False,
231
+ deform_num_groups=1,
232
+ ):
233
+ super().__init__(in_channels, out_channels, stride)
234
+ self.deform_modulated = deform_modulated
235
+
236
+ if in_channels != out_channels:
237
+ self.shortcut = Conv2d(
238
+ in_channels,
239
+ out_channels,
240
+ kernel_size=1,
241
+ stride=stride,
242
+ bias=False,
243
+ norm=get_norm(norm, out_channels),
244
+ )
245
+ else:
246
+ self.shortcut = None
247
+
248
+ stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
249
+
250
+ self.conv1 = Conv2d(
251
+ in_channels,
252
+ bottleneck_channels,
253
+ kernel_size=1,
254
+ stride=stride_1x1,
255
+ bias=False,
256
+ norm=get_norm(norm, bottleneck_channels),
257
+ )
258
+
259
+ if deform_modulated:
260
+ deform_conv_op = ModulatedDeformConv
261
+ # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
262
+ offset_channels = 27
263
+ else:
264
+ deform_conv_op = DeformConv
265
+ offset_channels = 18
266
+
267
+ self.conv2_offset = Conv2d(
268
+ bottleneck_channels,
269
+ offset_channels * deform_num_groups,
270
+ kernel_size=3,
271
+ stride=stride_3x3,
272
+ padding=1 * dilation,
273
+ dilation=dilation,
274
+ )
275
+ self.conv2 = deform_conv_op(
276
+ bottleneck_channels,
277
+ bottleneck_channels,
278
+ kernel_size=3,
279
+ stride=stride_3x3,
280
+ padding=1 * dilation,
281
+ bias=False,
282
+ groups=num_groups,
283
+ dilation=dilation,
284
+ deformable_groups=deform_num_groups,
285
+ norm=get_norm(norm, bottleneck_channels),
286
+ )
287
+
288
+ self.conv3 = Conv2d(
289
+ bottleneck_channels,
290
+ out_channels,
291
+ kernel_size=1,
292
+ bias=False,
293
+ norm=get_norm(norm, out_channels),
294
+ )
295
+
296
+ for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
297
+ if layer is not None: # shortcut can be None
298
+ weight_init.c2_msra_fill(layer)
299
+
300
+ nn.init.constant_(self.conv2_offset.weight, 0)
301
+ nn.init.constant_(self.conv2_offset.bias, 0)
302
+
303
+ def forward(self, x):
304
+ out = self.conv1(x)
305
+ out = F.relu_(out)
306
+
307
+ if self.deform_modulated:
308
+ offset_mask = self.conv2_offset(out)
309
+ offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
310
+ offset = torch.cat((offset_x, offset_y), dim=1)
311
+ mask = mask.sigmoid()
312
+ out = self.conv2(out, offset, mask)
313
+ else:
314
+ offset = self.conv2_offset(out)
315
+ out = self.conv2(out, offset)
316
+ out = F.relu_(out)
317
+
318
+ out = self.conv3(out)
319
+
320
+ if self.shortcut is not None:
321
+ shortcut = self.shortcut(x)
322
+ else:
323
+ shortcut = x
324
+
325
+ out += shortcut
326
+ out = F.relu_(out)
327
+ return out
328
+
329
+
330
+ class BasicStem(CNNBlockBase):
331
+ """
332
+ The standard ResNet stem (layers before the first residual block),
333
+ with a conv, relu and max_pool.
334
+ """
335
+
336
+ def __init__(self, in_channels=3, out_channels=64, norm="BN"):
337
+ """
338
+ Args:
339
+ norm (str or callable): norm after the first conv layer.
340
+ See :func:`layers.get_norm` for supported format.
341
+ """
342
+ super().__init__(in_channels, out_channels, 4)
343
+ self.in_channels = in_channels
344
+ self.conv1 = Conv2d(
345
+ in_channels,
346
+ out_channels,
347
+ kernel_size=7,
348
+ stride=2,
349
+ padding=3,
350
+ bias=False,
351
+ norm=get_norm(norm, out_channels),
352
+ )
353
+ weight_init.c2_msra_fill(self.conv1)
354
+
355
+ def forward(self, x):
356
+ x = self.conv1(x)
357
+ x = F.relu_(x)
358
+ x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
359
+ return x
360
+
361
+
362
+ class ResNet(Backbone):
363
+ """
364
+ Implement :paper:`ResNet`.
365
+ """
366
+
367
+ def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0):
368
+ """
369
+ Args:
370
+ stem (nn.Module): a stem module
371
+ stages (list[list[CNNBlockBase]]): several (typically 4) stages,
372
+ each contains multiple :class:`CNNBlockBase`.
373
+ num_classes (None or int): if None, will not perform classification.
374
+ Otherwise, will create a linear layer.
375
+ out_features (list[str]): name of the layers whose outputs should
376
+ be returned in forward. Can be anything in "stem", "linear", or "res2" ...
377
+ If None, will return the output of the last layer.
378
+ freeze_at (int): The number of stages at the beginning to freeze.
379
+ see :meth:`freeze` for detailed explanation.
380
+ """
381
+ super().__init__()
382
+ self.stem = stem
383
+ self.num_classes = num_classes
384
+
385
+ current_stride = self.stem.stride
386
+ self._out_feature_strides = {"stem": current_stride}
387
+ self._out_feature_channels = {"stem": self.stem.out_channels}
388
+
389
+ self.stage_names, self.stages = [], []
390
+
391
+ if out_features is not None:
392
+ # Avoid keeping unused layers in this module. They consume extra memory
393
+ # and may cause allreduce to fail
394
+ num_stages = max(
395
+ [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features]
396
+ )
397
+ stages = stages[:num_stages]
398
+ for i, blocks in enumerate(stages):
399
+ assert len(blocks) > 0, len(blocks)
400
+ for block in blocks:
401
+ assert isinstance(block, CNNBlockBase), block
402
+
403
+ name = "res" + str(i + 2)
404
+ stage = nn.Sequential(*blocks)
405
+
406
+ self.add_module(name, stage)
407
+ self.stage_names.append(name)
408
+ self.stages.append(stage)
409
+
410
+ self._out_feature_strides[name] = current_stride = int(
411
+ current_stride * np.prod([k.stride for k in blocks])
412
+ )
413
+ self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
414
+ self.stage_names = tuple(self.stage_names) # Make it static for scripting
415
+
416
+ if num_classes is not None:
417
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
418
+ self.linear = nn.Linear(curr_channels, num_classes)
419
+
420
+ # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
421
+ # "The 1000-way fully-connected layer is initialized by
422
+ # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
423
+ nn.init.normal_(self.linear.weight, std=0.01)
424
+ name = "linear"
425
+
426
+ if out_features is None:
427
+ out_features = [name]
428
+ self._out_features = out_features
429
+ assert len(self._out_features)
430
+ children = [x[0] for x in self.named_children()]
431
+ for out_feature in self._out_features:
432
+ assert out_feature in children, "Available children: {}".format(", ".join(children))
433
+ self.freeze(freeze_at)
434
+
435
+ def forward(self, x):
436
+ """
437
+ Args:
438
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
439
+
440
+ Returns:
441
+ dict[str->Tensor]: names and the corresponding features
442
+ """
443
+ assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
444
+ outputs = {}
445
+ x = self.stem(x)
446
+ if "stem" in self._out_features:
447
+ outputs["stem"] = x
448
+ for name, stage in zip(self.stage_names, self.stages):
449
+ x = stage(x)
450
+ if name in self._out_features:
451
+ outputs[name] = x
452
+ if self.num_classes is not None:
453
+ x = self.avgpool(x)
454
+ x = torch.flatten(x, 1)
455
+ x = self.linear(x)
456
+ if "linear" in self._out_features:
457
+ outputs["linear"] = x
458
+ return outputs
459
+
460
+ def output_shape(self):
461
+ return {
462
+ name: ShapeSpec(
463
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
464
+ )
465
+ for name in self._out_features
466
+ }
467
+
468
+ def freeze(self, freeze_at=0):
469
+ """
470
+ Freeze the first several stages of the ResNet. Commonly used in
471
+ fine-tuning.
472
+
473
+ Layers that produce the same feature map spatial size are defined as one
474
+ "stage" by :paper:`FPN`.
475
+
476
+ Args:
477
+ freeze_at (int): number of stages to freeze.
478
+ `1` means freezing the stem. `2` means freezing the stem and
479
+ one residual stage, etc.
480
+
481
+ Returns:
482
+ nn.Module: this ResNet itself
483
+ """
484
+ if freeze_at >= 1:
485
+ self.stem.freeze()
486
+ for idx, stage in enumerate(self.stages, start=2):
487
+ if freeze_at >= idx:
488
+ for block in stage.children():
489
+ block.freeze()
490
+ return self
491
+
492
+ @staticmethod
493
+ def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
494
+ """
495
+ Create a list of blocks of the same type that forms one ResNet stage.
496
+
497
+ Args:
498
+ block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
499
+ stage. A module of this type must not change spatial resolution of inputs unless its
500
+ stride != 1.
501
+ num_blocks (int): number of blocks in this stage
502
+ in_channels (int): input channels of the entire stage.
503
+ out_channels (int): output channels of **every block** in the stage.
504
+ kwargs: other arguments passed to the constructor of
505
+ `block_class`. If the argument name is "xx_per_block", the
506
+ argument is a list of values to be passed to each block in the
507
+ stage. Otherwise, the same argument is passed to every block
508
+ in the stage.
509
+
510
+ Returns:
511
+ list[CNNBlockBase]: a list of block module.
512
+
513
+ Examples:
514
+ ::
515
+ stage = ResNet.make_stage(
516
+ BottleneckBlock, 3, in_channels=16, out_channels=64,
517
+ bottleneck_channels=16, num_groups=1,
518
+ stride_per_block=[2, 1, 1],
519
+ dilations_per_block=[1, 1, 2]
520
+ )
521
+
522
+ Usually, layers that produce the same feature map spatial size are defined as one
523
+ "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
524
+ all be 1.
525
+ """
526
+ blocks = []
527
+ for i in range(num_blocks):
528
+ curr_kwargs = {}
529
+ for k, v in kwargs.items():
530
+ if k.endswith("_per_block"):
531
+ assert len(v) == num_blocks, (
532
+ f"Argument '{k}' of make_stage should have the "
533
+ f"same length as num_blocks={num_blocks}."
534
+ )
535
+ newk = k[: -len("_per_block")]
536
+ assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
537
+ curr_kwargs[newk] = v[i]
538
+ else:
539
+ curr_kwargs[k] = v
540
+
541
+ blocks.append(
542
+ block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
543
+ )
544
+ in_channels = out_channels
545
+ return blocks
546
+
547
+ @staticmethod
548
+ def make_default_stages(depth, block_class=None, **kwargs):
549
+ """
550
+ Create a list of ResNet stages from a pre-defined depth (one of 18, 34, 50, 101, 152).
551
+ If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
552
+ instead for fine-grained customization.
553
+
554
+ Args:
555
+ depth (int): depth of ResNet
556
+ block_class (type): the CNN block class. Has to accept
557
+ `bottleneck_channels` argument for depth > 50.
558
+ By default it is BasicBlock or BottleneckBlock, based on the
559
+ depth.
560
+ kwargs:
561
+ other arguments to pass to `make_stage`. Should not contain
562
+ stride and channels, as they are predefined for each depth.
563
+
564
+ Returns:
565
+ list[list[CNNBlockBase]]: modules in all stages; see arguments of
566
+ :class:`ResNet.__init__`.
567
+ """
568
+ num_blocks_per_stage = {
569
+ 18: [2, 2, 2, 2],
570
+ 34: [3, 4, 6, 3],
571
+ 50: [3, 4, 6, 3],
572
+ 101: [3, 4, 23, 3],
573
+ 152: [3, 8, 36, 3],
574
+ }[depth]
575
+ if block_class is None:
576
+ block_class = BasicBlock if depth < 50 else BottleneckBlock
577
+ if depth < 50:
578
+ in_channels = [64, 64, 128, 256]
579
+ out_channels = [64, 128, 256, 512]
580
+ else:
581
+ in_channels = [64, 256, 512, 1024]
582
+ out_channels = [256, 512, 1024, 2048]
583
+ ret = []
584
+ for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
585
+ if depth >= 50:
586
+ kwargs["bottleneck_channels"] = o // 4
587
+ ret.append(
588
+ ResNet.make_stage(
589
+ block_class=block_class,
590
+ num_blocks=n,
591
+ stride_per_block=[s] + [1] * (n - 1),
592
+ in_channels=i,
593
+ out_channels=o,
594
+ **kwargs,
595
+ )
596
+ )
597
+ return ret
598
+
599
+
600
+ ResNetBlockBase = CNNBlockBase
601
+ """
602
+ Alias for backward compatibility.
603
+ """
604
+
605
+
606
+ def make_stage(*args, **kwargs):
607
+ """
608
+ Deprecated alias for backward compatibility.
609
+ """
610
+ return ResNet.make_stage(*args, **kwargs)
611
+
612
+
613
+ @BACKBONE_REGISTRY.register()
614
+ def build_resnet_backbone(cfg, input_shape):
615
+ """
616
+ Create a ResNet instance from config.
617
+
618
+ Returns:
619
+ ResNet: a :class:`ResNet` instance.
620
+ """
621
+ # need registration of new blocks/stems?
622
+ norm = cfg.MODEL.RESNETS.NORM
623
+ stem = BasicStem(
624
+ in_channels=input_shape.channels,
625
+ out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
626
+ norm=norm,
627
+ )
628
+
629
+ # fmt: off
630
+ freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
631
+ out_features = cfg.MODEL.RESNETS.OUT_FEATURES
632
+ depth = cfg.MODEL.RESNETS.DEPTH
633
+ num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
634
+ width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
635
+ bottleneck_channels = num_groups * width_per_group
636
+ in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
637
+ out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
638
+ stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1
639
+ res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION
640
+ deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
641
+ deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED
642
+ deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
643
+ # fmt: on
644
+ assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
645
+
646
+ num_blocks_per_stage = {
647
+ 18: [2, 2, 2, 2],
648
+ 34: [3, 4, 6, 3],
649
+ 50: [3, 4, 6, 3],
650
+ 101: [3, 4, 23, 3],
651
+ 152: [3, 8, 36, 3],
652
+ }[depth]
653
+
654
+ if depth in [18, 34]:
655
+ assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
656
+ assert not any(
657
+ deform_on_per_stage
658
+ ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
659
+ assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
660
+ assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"
661
+
662
+ stages = []
663
+
664
+ for idx, stage_idx in enumerate(range(2, 6)):
665
+ # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
666
+ dilation = res5_dilation if stage_idx == 5 else 1
667
+ first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
668
+ stage_kargs = {
669
+ "num_blocks": num_blocks_per_stage[idx],
670
+ "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
671
+ "in_channels": in_channels,
672
+ "out_channels": out_channels,
673
+ "norm": norm,
674
+ }
675
+ # Use BasicBlock for R18 and R34.
676
+ if depth in [18, 34]:
677
+ stage_kargs["block_class"] = BasicBlock
678
+ else:
679
+ stage_kargs["bottleneck_channels"] = bottleneck_channels
680
+ stage_kargs["stride_in_1x1"] = stride_in_1x1
681
+ stage_kargs["dilation"] = dilation
682
+ stage_kargs["num_groups"] = num_groups
683
+ if deform_on_per_stage[idx]:
684
+ stage_kargs["block_class"] = DeformBottleneckBlock
685
+ stage_kargs["deform_modulated"] = deform_modulated
686
+ stage_kargs["deform_num_groups"] = deform_num_groups
687
+ else:
688
+ stage_kargs["block_class"] = BottleneckBlock
689
+ blocks = ResNet.make_stage(**stage_kargs)
690
+ in_channels = out_channels
691
+ out_channels *= 2
692
+ bottleneck_channels *= 2
693
+ stages.append(blocks)
694
+ return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at)
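A minimal usage sketch of the stage helpers and builder above, assuming this vendored module is importable as annotator.oneformer.detectron2.modeling.backbone.resnet (the path is inferred from the file location) and that BasicStem and ResNet come from this same file; the model below is randomly initialized:

import torch
from annotator.oneformer.detectron2.modeling.backbone.resnet import BasicStem, ResNet

# Build a plain R-50: make_default_stages picks BottleneckBlock and the
# per-stage strides/channels; extra kwargs (here: norm) reach every block.
stem = BasicStem(in_channels=3, out_channels=64, norm="BN")
stages = ResNet.make_default_stages(depth=50, norm="BN")
model = ResNet(stem, stages, out_features=["res2", "res3", "res4", "res5"])

feats = model(torch.randn(1, 3, 224, 224))
print({k: tuple(v.shape) for k, v in feats.items()})
# expected: res2 (1, 256, 56, 56) ... res5 (1, 2048, 7, 7)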
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/swin.py ADDED
@@ -0,0 +1,695 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Implementation of Swin models from :paper:`swin`.
4
+
5
+ This code is adapted from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py with minimal modifications. # noqa
6
+ --------------------------------------------------------
7
+ Swin Transformer
8
+ Copyright (c) 2021 Microsoft
9
+ Licensed under The MIT License [see LICENSE for details]
10
+ Written by Ze Liu, Yutong Lin, Yixuan Wei
11
+ --------------------------------------------------------
12
+ LICENSE: https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/461e003166a8083d0b620beacd4662a2df306bd6/LICENSE
13
+ """
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
+ import torch.utils.checkpoint as checkpoint
20
+
21
+ from annotator.oneformer.detectron2.modeling.backbone.backbone import Backbone
22
+
23
+ _to_2tuple = nn.modules.utils._ntuple(2)
24
+
25
+
26
+ class Mlp(nn.Module):
27
+ """Multilayer perceptron."""
28
+
29
+ def __init__(
30
+ self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
31
+ ):
32
+ super().__init__()
33
+ out_features = out_features or in_features
34
+ hidden_features = hidden_features or in_features
35
+ self.fc1 = nn.Linear(in_features, hidden_features)
36
+ self.act = act_layer()
37
+ self.fc2 = nn.Linear(hidden_features, out_features)
38
+ self.drop = nn.Dropout(drop)
39
+
40
+ def forward(self, x):
41
+ x = self.fc1(x)
42
+ x = self.act(x)
43
+ x = self.drop(x)
44
+ x = self.fc2(x)
45
+ x = self.drop(x)
46
+ return x
47
+
48
+
49
+ def window_partition(x, window_size):
50
+ """
51
+ Args:
52
+ x: (B, H, W, C)
53
+ window_size (int): window size
54
+ Returns:
55
+ windows: (num_windows*B, window_size, window_size, C)
56
+ """
57
+ B, H, W, C = x.shape
58
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
59
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
60
+ return windows
61
+
62
+
63
+ def window_reverse(windows, window_size, H, W):
64
+ """
65
+ Args:
66
+ windows: (num_windows*B, window_size, window_size, C)
67
+ window_size (int): Window size
68
+ H (int): Height of image
69
+ W (int): Width of image
70
+ Returns:
71
+ x: (B, H, W, C)
72
+ """
73
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
74
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
75
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
76
+ return x
77
+
78
+
79
+ class WindowAttention(nn.Module):
80
+ """Window based multi-head self attention (W-MSA) module with relative position bias.
81
+ It supports both shifted and non-shifted windows.
82
+ Args:
83
+ dim (int): Number of input channels.
84
+ window_size (tuple[int]): The height and width of the window.
85
+ num_heads (int): Number of attention heads.
86
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
87
+ Default: True
88
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
89
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
90
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
91
+ """
92
+
93
+ def __init__(
94
+ self,
95
+ dim,
96
+ window_size,
97
+ num_heads,
98
+ qkv_bias=True,
99
+ qk_scale=None,
100
+ attn_drop=0.0,
101
+ proj_drop=0.0,
102
+ ):
103
+
104
+ super().__init__()
105
+ self.dim = dim
106
+ self.window_size = window_size # Wh, Ww
107
+ self.num_heads = num_heads
108
+ head_dim = dim // num_heads
109
+ self.scale = qk_scale or head_dim**-0.5
110
+
111
+ # define a parameter table of relative position bias
112
+ self.relative_position_bias_table = nn.Parameter(
113
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
114
+ ) # 2*Wh-1 * 2*Ww-1, nH
115
+
116
+ # get pair-wise relative position index for each token inside the window
117
+ coords_h = torch.arange(self.window_size[0])
118
+ coords_w = torch.arange(self.window_size[1])
119
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
120
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
121
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
122
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
123
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
124
+ relative_coords[:, :, 1] += self.window_size[1] - 1
125
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
126
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
127
+ self.register_buffer("relative_position_index", relative_position_index)
128
+
129
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
130
+ self.attn_drop = nn.Dropout(attn_drop)
131
+ self.proj = nn.Linear(dim, dim)
132
+ self.proj_drop = nn.Dropout(proj_drop)
133
+
134
+ nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)
135
+ self.softmax = nn.Softmax(dim=-1)
136
+
137
+ def forward(self, x, mask=None):
138
+ """Forward function.
139
+ Args:
140
+ x: input features with shape of (num_windows*B, N, C)
141
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
142
+ """
143
+ B_, N, C = x.shape
144
+ qkv = (
145
+ self.qkv(x)
146
+ .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
147
+ .permute(2, 0, 3, 1, 4)
148
+ )
149
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
150
+
151
+ q = q * self.scale
152
+ attn = q @ k.transpose(-2, -1)
153
+
154
+ relative_position_bias = self.relative_position_bias_table[
155
+ self.relative_position_index.view(-1)
156
+ ].view(
157
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
158
+ ) # Wh*Ww,Wh*Ww,nH
159
+ relative_position_bias = relative_position_bias.permute(
160
+ 2, 0, 1
161
+ ).contiguous() # nH, Wh*Ww, Wh*Ww
162
+ attn = attn + relative_position_bias.unsqueeze(0)
163
+
164
+ if mask is not None:
165
+ nW = mask.shape[0]
166
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
167
+ attn = attn.view(-1, self.num_heads, N, N)
168
+ attn = self.softmax(attn)
169
+ else:
170
+ attn = self.softmax(attn)
171
+
172
+ attn = self.attn_drop(attn)
173
+
174
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
175
+ x = self.proj(x)
176
+ x = self.proj_drop(x)
177
+ return x
178
+
179
+
180
+ class SwinTransformerBlock(nn.Module):
181
+ """Swin Transformer Block.
182
+ Args:
183
+ dim (int): Number of input channels.
184
+ num_heads (int): Number of attention heads.
185
+ window_size (int): Window size.
186
+ shift_size (int): Shift size for SW-MSA.
187
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
188
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
189
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
190
+ drop (float, optional): Dropout rate. Default: 0.0
191
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
192
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
193
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
194
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
195
+ """
196
+
197
+ def __init__(
198
+ self,
199
+ dim,
200
+ num_heads,
201
+ window_size=7,
202
+ shift_size=0,
203
+ mlp_ratio=4.0,
204
+ qkv_bias=True,
205
+ qk_scale=None,
206
+ drop=0.0,
207
+ attn_drop=0.0,
208
+ drop_path=0.0,
209
+ act_layer=nn.GELU,
210
+ norm_layer=nn.LayerNorm,
211
+ ):
212
+ super().__init__()
213
+ self.dim = dim
214
+ self.num_heads = num_heads
215
+ self.window_size = window_size
216
+ self.shift_size = shift_size
217
+ self.mlp_ratio = mlp_ratio
218
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
219
+
220
+ self.norm1 = norm_layer(dim)
221
+ self.attn = WindowAttention(
222
+ dim,
223
+ window_size=_to_2tuple(self.window_size),
224
+ num_heads=num_heads,
225
+ qkv_bias=qkv_bias,
226
+ qk_scale=qk_scale,
227
+ attn_drop=attn_drop,
228
+ proj_drop=drop,
229
+ )
230
+
231
+ if drop_path > 0.0:
232
+ from timm.models.layers import DropPath
233
+
234
+ self.drop_path = DropPath(drop_path)
235
+ else:
236
+ self.drop_path = nn.Identity()
237
+ self.norm2 = norm_layer(dim)
238
+ mlp_hidden_dim = int(dim * mlp_ratio)
239
+ self.mlp = Mlp(
240
+ in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
241
+ )
242
+
243
+ self.H = None
244
+ self.W = None
245
+
246
+ def forward(self, x, mask_matrix):
247
+ """Forward function.
248
+ Args:
249
+ x: Input feature, tensor size (B, H*W, C).
250
+ H, W: Spatial resolution of the input feature (read from self.H / self.W, set by the enclosing BasicLayer).
251
+ mask_matrix: Attention mask for cyclic shift.
252
+ """
253
+ B, L, C = x.shape
254
+ H, W = self.H, self.W
255
+ assert L == H * W, "input feature has wrong size"
256
+
257
+ shortcut = x
258
+ x = self.norm1(x)
259
+ x = x.view(B, H, W, C)
260
+
261
+ # pad feature maps to multiples of window size
262
+ pad_l = pad_t = 0
263
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
264
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
265
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
266
+ _, Hp, Wp, _ = x.shape
267
+
268
+ # cyclic shift
269
+ if self.shift_size > 0:
270
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
271
+ attn_mask = mask_matrix
272
+ else:
273
+ shifted_x = x
274
+ attn_mask = None
275
+
276
+ # partition windows
277
+ x_windows = window_partition(
278
+ shifted_x, self.window_size
279
+ ) # nW*B, window_size, window_size, C
280
+ x_windows = x_windows.view(
281
+ -1, self.window_size * self.window_size, C
282
+ ) # nW*B, window_size*window_size, C
283
+
284
+ # W-MSA/SW-MSA
285
+ attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
286
+
287
+ # merge windows
288
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
289
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
290
+
291
+ # reverse cyclic shift
292
+ if self.shift_size > 0:
293
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
294
+ else:
295
+ x = shifted_x
296
+
297
+ if pad_r > 0 or pad_b > 0:
298
+ x = x[:, :H, :W, :].contiguous()
299
+
300
+ x = x.view(B, H * W, C)
301
+
302
+ # FFN
303
+ x = shortcut + self.drop_path(x)
304
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
305
+
306
+ return x
307
+
308
+
309
+ class PatchMerging(nn.Module):
310
+ """Patch Merging Layer
311
+ Args:
312
+ dim (int): Number of input channels.
313
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
314
+ """
315
+
316
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
317
+ super().__init__()
318
+ self.dim = dim
319
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
320
+ self.norm = norm_layer(4 * dim)
321
+
322
+ def forward(self, x, H, W):
323
+ """Forward function.
324
+ Args:
325
+ x: Input feature, tensor size (B, H*W, C).
326
+ H, W: Spatial resolution of the input feature.
327
+ """
328
+ B, L, C = x.shape
329
+ assert L == H * W, "input feature has wrong size"
330
+
331
+ x = x.view(B, H, W, C)
332
+
333
+ # padding
334
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
335
+ if pad_input:
336
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
337
+
338
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
339
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
340
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
341
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
342
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
343
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
344
+
345
+ x = self.norm(x)
346
+ x = self.reduction(x)
347
+
348
+ return x
349
+
350
+
351
+ class BasicLayer(nn.Module):
352
+ """A basic Swin Transformer layer for one stage.
353
+ Args:
354
+ dim (int): Number of feature channels
355
+ depth (int): Depth of this stage.
356
+ num_heads (int): Number of attention heads.
357
+ window_size (int): Local window size. Default: 7.
358
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
359
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
360
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
361
+ drop (float, optional): Dropout rate. Default: 0.0
362
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
363
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
364
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
365
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer.
366
+ Default: None
367
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
368
+ """
369
+
370
+ def __init__(
371
+ self,
372
+ dim,
373
+ depth,
374
+ num_heads,
375
+ window_size=7,
376
+ mlp_ratio=4.0,
377
+ qkv_bias=True,
378
+ qk_scale=None,
379
+ drop=0.0,
380
+ attn_drop=0.0,
381
+ drop_path=0.0,
382
+ norm_layer=nn.LayerNorm,
383
+ downsample=None,
384
+ use_checkpoint=False,
385
+ ):
386
+ super().__init__()
387
+ self.window_size = window_size
388
+ self.shift_size = window_size // 2
389
+ self.depth = depth
390
+ self.use_checkpoint = use_checkpoint
391
+
392
+ # build blocks
393
+ self.blocks = nn.ModuleList(
394
+ [
395
+ SwinTransformerBlock(
396
+ dim=dim,
397
+ num_heads=num_heads,
398
+ window_size=window_size,
399
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
400
+ mlp_ratio=mlp_ratio,
401
+ qkv_bias=qkv_bias,
402
+ qk_scale=qk_scale,
403
+ drop=drop,
404
+ attn_drop=attn_drop,
405
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
406
+ norm_layer=norm_layer,
407
+ )
408
+ for i in range(depth)
409
+ ]
410
+ )
411
+
412
+ # patch merging layer
413
+ if downsample is not None:
414
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
415
+ else:
416
+ self.downsample = None
417
+
418
+ def forward(self, x, H, W):
419
+ """Forward function.
420
+ Args:
421
+ x: Input feature, tensor size (B, H*W, C).
422
+ H, W: Spatial resolution of the input feature.
423
+ """
424
+
425
+ # calculate attention mask for SW-MSA
426
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
427
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
428
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
429
+ h_slices = (
430
+ slice(0, -self.window_size),
431
+ slice(-self.window_size, -self.shift_size),
432
+ slice(-self.shift_size, None),
433
+ )
434
+ w_slices = (
435
+ slice(0, -self.window_size),
436
+ slice(-self.window_size, -self.shift_size),
437
+ slice(-self.shift_size, None),
438
+ )
439
+ cnt = 0
440
+ for h in h_slices:
441
+ for w in w_slices:
442
+ img_mask[:, h, w, :] = cnt
443
+ cnt += 1
444
+
445
+ mask_windows = window_partition(
446
+ img_mask, self.window_size
447
+ ) # nW, window_size, window_size, 1
448
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
449
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
450
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
451
+ attn_mask == 0, float(0.0)
452
+ )
453
+
454
+ for blk in self.blocks:
455
+ blk.H, blk.W = H, W
456
+ if self.use_checkpoint:
457
+ x = checkpoint.checkpoint(blk, x, attn_mask)
458
+ else:
459
+ x = blk(x, attn_mask)
460
+ if self.downsample is not None:
461
+ x_down = self.downsample(x, H, W)
462
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
463
+ return x, H, W, x_down, Wh, Ww
464
+ else:
465
+ return x, H, W, x, H, W
466
+
467
+
468
+ class PatchEmbed(nn.Module):
469
+ """Image to Patch Embedding
470
+ Args:
471
+ patch_size (int): Patch token size. Default: 4.
472
+ in_chans (int): Number of input image channels. Default: 3.
473
+ embed_dim (int): Number of linear projection output channels. Default: 96.
474
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
475
+ """
476
+
477
+ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
478
+ super().__init__()
479
+ patch_size = _to_2tuple(patch_size)
480
+ self.patch_size = patch_size
481
+
482
+ self.in_chans = in_chans
483
+ self.embed_dim = embed_dim
484
+
485
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
486
+ if norm_layer is not None:
487
+ self.norm = norm_layer(embed_dim)
488
+ else:
489
+ self.norm = None
490
+
491
+ def forward(self, x):
492
+ """Forward function."""
493
+ # padding
494
+ _, _, H, W = x.size()
495
+ if W % self.patch_size[1] != 0:
496
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
497
+ if H % self.patch_size[0] != 0:
498
+ x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
499
+
500
+ x = self.proj(x) # B C Wh Ww
501
+ if self.norm is not None:
502
+ Wh, Ww = x.size(2), x.size(3)
503
+ x = x.flatten(2).transpose(1, 2)
504
+ x = self.norm(x)
505
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
506
+
507
+ return x
508
+
509
+
510
+ class SwinTransformer(Backbone):
511
+ """Swin Transformer backbone.
512
+ A PyTorch implementation of: `Swin Transformer: Hierarchical Vision Transformer using Shifted
513
+ Windows` - https://arxiv.org/pdf/2103.14030
514
+ Args:
515
+ pretrain_img_size (int): Input image size for training the pretrained model,
516
+ used in the absolute position embedding. Default 224.
517
+ patch_size (int | tuple(int)): Patch size. Default: 4.
518
+ in_chans (int): Number of input image channels. Default: 3.
519
+ embed_dim (int): Number of linear projection output channels. Default: 96.
520
+ depths (tuple[int]): Depths of each Swin Transformer stage.
521
+ num_heads (tuple[int]): Number of attention head of each stage.
522
+ window_size (int): Window size. Default: 7.
523
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
524
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
525
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
526
+ drop_rate (float): Dropout rate.
527
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
528
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
529
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
530
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
531
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
532
+ out_indices (Sequence[int]): Output from which stages.
533
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
534
+ -1 means not freezing any parameters.
535
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
536
+ """
537
+
538
+ def __init__(
539
+ self,
540
+ pretrain_img_size=224,
541
+ patch_size=4,
542
+ in_chans=3,
543
+ embed_dim=96,
544
+ depths=(2, 2, 6, 2),
545
+ num_heads=(3, 6, 12, 24),
546
+ window_size=7,
547
+ mlp_ratio=4.0,
548
+ qkv_bias=True,
549
+ qk_scale=None,
550
+ drop_rate=0.0,
551
+ attn_drop_rate=0.0,
552
+ drop_path_rate=0.2,
553
+ norm_layer=nn.LayerNorm,
554
+ ape=False,
555
+ patch_norm=True,
556
+ out_indices=(0, 1, 2, 3),
557
+ frozen_stages=-1,
558
+ use_checkpoint=False,
559
+ ):
560
+ super().__init__()
561
+
562
+ self.pretrain_img_size = pretrain_img_size
563
+ self.num_layers = len(depths)
564
+ self.embed_dim = embed_dim
565
+ self.ape = ape
566
+ self.patch_norm = patch_norm
567
+ self.out_indices = out_indices
568
+ self.frozen_stages = frozen_stages
569
+
570
+ # split image into non-overlapping patches
571
+ self.patch_embed = PatchEmbed(
572
+ patch_size=patch_size,
573
+ in_chans=in_chans,
574
+ embed_dim=embed_dim,
575
+ norm_layer=norm_layer if self.patch_norm else None,
576
+ )
577
+
578
+ # absolute position embedding
579
+ if self.ape:
580
+ pretrain_img_size = _to_2tuple(pretrain_img_size)
581
+ patch_size = _to_2tuple(patch_size)
582
+ patches_resolution = [
583
+ pretrain_img_size[0] // patch_size[0],
584
+ pretrain_img_size[1] // patch_size[1],
585
+ ]
586
+
587
+ self.absolute_pos_embed = nn.Parameter(
588
+ torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
589
+ )
590
+ nn.init.trunc_normal_(self.absolute_pos_embed, std=0.02)
591
+
592
+ self.pos_drop = nn.Dropout(p=drop_rate)
593
+
594
+ # stochastic depth
595
+ dpr = [
596
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
597
+ ] # stochastic depth decay rule
598
+
599
+ # build layers
600
+ self.layers = nn.ModuleList()
601
+ for i_layer in range(self.num_layers):
602
+ layer = BasicLayer(
603
+ dim=int(embed_dim * 2**i_layer),
604
+ depth=depths[i_layer],
605
+ num_heads=num_heads[i_layer],
606
+ window_size=window_size,
607
+ mlp_ratio=mlp_ratio,
608
+ qkv_bias=qkv_bias,
609
+ qk_scale=qk_scale,
610
+ drop=drop_rate,
611
+ attn_drop=attn_drop_rate,
612
+ drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
613
+ norm_layer=norm_layer,
614
+ downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
615
+ use_checkpoint=use_checkpoint,
616
+ )
617
+ self.layers.append(layer)
618
+
619
+ num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
620
+ self.num_features = num_features
621
+
622
+ # add a norm layer for each output
623
+ for i_layer in out_indices:
624
+ layer = norm_layer(num_features[i_layer])
625
+ layer_name = f"norm{i_layer}"
626
+ self.add_module(layer_name, layer)
627
+
628
+ self._freeze_stages()
629
+ self._out_features = ["p{}".format(i) for i in self.out_indices]
630
+ self._out_feature_channels = {
631
+ "p{}".format(i): self.embed_dim * 2**i for i in self.out_indices
632
+ }
633
+ self._out_feature_strides = {"p{}".format(i): 2 ** (i + 2) for i in self.out_indices}
634
+ self._size_divisibility = 32  # read by the size_divisibility property below
635
+
636
+ self.apply(self._init_weights)
637
+
638
+ def _freeze_stages(self):
639
+ if self.frozen_stages >= 0:
640
+ self.patch_embed.eval()
641
+ for param in self.patch_embed.parameters():
642
+ param.requires_grad = False
643
+
644
+ if self.frozen_stages >= 1 and self.ape:
645
+ self.absolute_pos_embed.requires_grad = False
646
+
647
+ if self.frozen_stages >= 2:
648
+ self.pos_drop.eval()
649
+ for i in range(0, self.frozen_stages - 1):
650
+ m = self.layers[i]
651
+ m.eval()
652
+ for param in m.parameters():
653
+ param.requires_grad = False
654
+
655
+ def _init_weights(self, m):
656
+ if isinstance(m, nn.Linear):
657
+ nn.init.trunc_normal_(m.weight, std=0.02)
658
+ if isinstance(m, nn.Linear) and m.bias is not None:
659
+ nn.init.constant_(m.bias, 0)
660
+ elif isinstance(m, nn.LayerNorm):
661
+ nn.init.constant_(m.bias, 0)
662
+ nn.init.constant_(m.weight, 1.0)
663
+
664
+ @property
665
+ def size_divisibility(self):
666
+ return self._size_divisibility
667
+
668
+ def forward(self, x):
669
+ """Forward function."""
670
+ x = self.patch_embed(x)
671
+
672
+ Wh, Ww = x.size(2), x.size(3)
673
+ if self.ape:
674
+ # interpolate the position embedding to the corresponding size
675
+ absolute_pos_embed = F.interpolate(
676
+ self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
677
+ )
678
+ x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
679
+ else:
680
+ x = x.flatten(2).transpose(1, 2)
681
+ x = self.pos_drop(x)
682
+
683
+ outs = {}
684
+ for i in range(self.num_layers):
685
+ layer = self.layers[i]
686
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
687
+
688
+ if i in self.out_indices:
689
+ norm_layer = getattr(self, f"norm{i}")
690
+ x_out = norm_layer(x_out)
691
+
692
+ out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
693
+ outs["p{}".format(i)] = out
694
+
695
+ return outs
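A short sketch of running the SwinTransformer backbone above on a dummy batch. The constructor defaults correspond to Swin-T; drop_path_rate is set to 0 so the optional timm import used for stochastic depth is not triggered, and the import path is inferred from the file location:

import torch
from annotator.oneformer.detectron2.modeling.backbone.swin import SwinTransformer

model = SwinTransformer(drop_path_rate=0.0).eval()   # Swin-T defaults, random weights
with torch.no_grad():
    outs = model(torch.randn(1, 3, 224, 224))
for name, feat in outs.items():
    print(name, tuple(feat.shape))
# p0 (1, 96, 56, 56), p1 (1, 192, 28, 28), p2 (1, 384, 14, 14), p3 (1, 768, 7, 7)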
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/utils.py ADDED
@@ -0,0 +1,186 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ __all__ = [
8
+ "window_partition",
9
+ "window_unpartition",
10
+ "add_decomposed_rel_pos",
11
+ "get_abs_pos",
12
+ "PatchEmbed",
13
+ ]
14
+
15
+
16
+ def window_partition(x, window_size):
17
+ """
18
+ Partition into non-overlapping windows with padding if needed.
19
+ Args:
20
+ x (tensor): input tokens with [B, H, W, C].
21
+ window_size (int): window size.
22
+
23
+ Returns:
24
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
25
+ (Hp, Wp): padded height and width before partition
26
+ """
27
+ B, H, W, C = x.shape
28
+
29
+ pad_h = (window_size - H % window_size) % window_size
30
+ pad_w = (window_size - W % window_size) % window_size
31
+ if pad_h > 0 or pad_w > 0:
32
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
33
+ Hp, Wp = H + pad_h, W + pad_w
34
+
35
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
36
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
37
+ return windows, (Hp, Wp)
38
+
39
+
40
+ def window_unpartition(windows, window_size, pad_hw, hw):
41
+ """
42
+ Window unpartition into original sequences and removing padding.
43
+ Args:
44
+ x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
45
+ window_size (int): window size.
46
+ pad_hw (Tuple): padded height and width (Hp, Wp).
47
+ hw (Tuple): original height and width (H, W) before padding.
48
+
49
+ Returns:
50
+ x: unpartitioned sequences with [B, H, W, C].
51
+ """
52
+ Hp, Wp = pad_hw
53
+ H, W = hw
54
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
55
+ x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
56
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
57
+
58
+ if Hp > H or Wp > W:
59
+ x = x[:, :H, :W, :].contiguous()
60
+ return x
61
+
62
+
63
+ def get_rel_pos(q_size, k_size, rel_pos):
64
+ """
65
+ Get relative positional embeddings according to the relative positions of
66
+ query and key sizes.
67
+ Args:
68
+ q_size (int): size of query q.
69
+ k_size (int): size of key k.
70
+ rel_pos (Tensor): relative position embeddings (L, C).
71
+
72
+ Returns:
73
+ Extracted positional embeddings according to relative positions.
74
+ """
75
+ max_rel_dist = int(2 * max(q_size, k_size) - 1)
76
+ # Interpolate rel pos if needed.
77
+ if rel_pos.shape[0] != max_rel_dist:
78
+ # Interpolate rel pos.
79
+ rel_pos_resized = F.interpolate(
80
+ rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
81
+ size=max_rel_dist,
82
+ mode="linear",
83
+ )
84
+ rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
85
+ else:
86
+ rel_pos_resized = rel_pos
87
+
88
+ # Scale the coords with short length if shapes for q and k are different.
89
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
90
+ k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
91
+ relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
92
+
93
+ return rel_pos_resized[relative_coords.long()]
94
+
95
+
96
+ def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
97
+ """
98
+ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
99
+ https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
100
+ Args:
101
+ attn (Tensor): attention map.
102
+ q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
103
+ rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
104
+ rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
105
+ q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
106
+ k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
107
+
108
+ Returns:
109
+ attn (Tensor): attention map with added relative positional embeddings.
110
+ """
111
+ q_h, q_w = q_size
112
+ k_h, k_w = k_size
113
+ Rh = get_rel_pos(q_h, k_h, rel_pos_h)
114
+ Rw = get_rel_pos(q_w, k_w, rel_pos_w)
115
+
116
+ B, _, dim = q.shape
117
+ r_q = q.reshape(B, q_h, q_w, dim)
118
+ rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
119
+ rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
120
+
121
+ attn = (
122
+ attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
123
+ ).view(B, q_h * q_w, k_h * k_w)
124
+
125
+ return attn
126
+
127
+
128
+ def get_abs_pos(abs_pos, has_cls_token, hw):
129
+ """
130
+ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
131
+ dimension for the original embeddings.
132
+ Args:
133
+ abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
134
+ has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
135
+ hw (Tuple): size of input image tokens.
136
+
137
+ Returns:
138
+ Absolute positional embeddings after processing with shape (1, H, W, C)
139
+ """
140
+ h, w = hw
141
+ if has_cls_token:
142
+ abs_pos = abs_pos[:, 1:]
143
+ xy_num = abs_pos.shape[1]
144
+ size = int(math.sqrt(xy_num))
145
+ assert size * size == xy_num
146
+
147
+ if size != h or size != w:
148
+ new_abs_pos = F.interpolate(
149
+ abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
150
+ size=(h, w),
151
+ mode="bicubic",
152
+ align_corners=False,
153
+ )
154
+
155
+ return new_abs_pos.permute(0, 2, 3, 1)
156
+ else:
157
+ return abs_pos.reshape(1, h, w, -1)
158
+
159
+
160
+ class PatchEmbed(nn.Module):
161
+ """
162
+ Image to Patch Embedding.
163
+ """
164
+
165
+ def __init__(
166
+ self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
167
+ ):
168
+ """
169
+ Args:
170
+ kernel_size (Tuple): kernel size of the projection layer.
171
+ stride (Tuple): stride of the projection layer.
172
+ padding (Tuple): padding size of the projection layer.
173
+ in_chans (int): Number of input image channels.
174
+ embed_dim (int): embed_dim (int): Patch embedding dimension.
175
+ """
176
+ super().__init__()
177
+
178
+ self.proj = nn.Conv2d(
179
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
180
+ )
181
+
182
+ def forward(self, x):
183
+ x = self.proj(x)
184
+ # B C H W -> B H W C
185
+ x = x.permute(0, 2, 3, 1)
186
+ return x
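A quick round-trip check of the padding-aware window helpers defined above; only torch is needed besides this module, and the import path is inferred from the file location:

import torch
from annotator.oneformer.detectron2.modeling.backbone.utils import window_partition, window_unpartition

x = torch.randn(2, 10, 13, 32)                    # (B, H, W, C); H and W are not multiples of 7
windows, pad_hw = window_partition(x, window_size=7)
print(windows.shape, pad_hw)                      # torch.Size([8, 7, 7, 32]) (14, 14)
y = window_unpartition(windows, 7, pad_hw, (10, 13))
assert torch.equal(x, y)                          # zero padding is cropped away, so the round trip is exact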
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/vit.py ADDED
@@ -0,0 +1,524 @@
1
+ import logging
2
+ import math
3
+ import fvcore.nn.weight_init as weight_init
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from annotator.oneformer.detectron2.layers import CNNBlockBase, Conv2d, get_norm
8
+ from annotator.oneformer.detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous
9
+
10
+ from .backbone import Backbone
11
+ from .utils import (
12
+ PatchEmbed,
13
+ add_decomposed_rel_pos,
14
+ get_abs_pos,
15
+ window_partition,
16
+ window_unpartition,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ __all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]
23
+
24
+
25
+ class Attention(nn.Module):
26
+ """Multi-head Attention block with relative position embeddings."""
27
+
28
+ def __init__(
29
+ self,
30
+ dim,
31
+ num_heads=8,
32
+ qkv_bias=True,
33
+ use_rel_pos=False,
34
+ rel_pos_zero_init=True,
35
+ input_size=None,
36
+ ):
37
+ """
38
+ Args:
39
+ dim (int): Number of input channels.
40
+ num_heads (int): Number of attention heads.
41
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
42
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
43
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
44
+ input_size (int or None): Input resolution for calculating the relative positional
45
+ parameter size.
46
+ """
47
+ super().__init__()
48
+ self.num_heads = num_heads
49
+ head_dim = dim // num_heads
50
+ self.scale = head_dim**-0.5
51
+
52
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
53
+ self.proj = nn.Linear(dim, dim)
54
+
55
+ self.use_rel_pos = use_rel_pos
56
+ if self.use_rel_pos:
57
+ # initialize relative positional embeddings
58
+ self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
59
+ self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
60
+
61
+ if not rel_pos_zero_init:
62
+ nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
63
+ nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
64
+
65
+ def forward(self, x):
66
+ B, H, W, _ = x.shape
67
+ # qkv with shape (3, B, nHead, H * W, C)
68
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
69
+ # q, k, v with shape (B * nHead, H * W, C)
70
+ q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
71
+
72
+ attn = (q * self.scale) @ k.transpose(-2, -1)
73
+
74
+ if self.use_rel_pos:
75
+ attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
76
+
77
+ attn = attn.softmax(dim=-1)
78
+ x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
79
+ x = self.proj(x)
80
+
81
+ return x
82
+
83
+
84
+ class ResBottleneckBlock(CNNBlockBase):
85
+ """
86
+ The standard bottleneck residual block without the last activation layer.
87
+ It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
88
+ """
89
+
90
+ def __init__(
91
+ self,
92
+ in_channels,
93
+ out_channels,
94
+ bottleneck_channels,
95
+ norm="LN",
96
+ act_layer=nn.GELU,
97
+ ):
98
+ """
99
+ Args:
100
+ in_channels (int): Number of input channels.
101
+ out_channels (int): Number of output channels.
102
+ bottleneck_channels (int): number of output channels for the 3x3
103
+ "bottleneck" conv layers.
104
+ norm (str or callable): normalization for all conv layers.
105
+ See :func:`layers.get_norm` for supported format.
106
+ act_layer (callable): activation for all conv layers.
107
+ """
108
+ super().__init__(in_channels, out_channels, 1)
109
+
110
+ self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
111
+ self.norm1 = get_norm(norm, bottleneck_channels)
112
+ self.act1 = act_layer()
113
+
114
+ self.conv2 = Conv2d(
115
+ bottleneck_channels,
116
+ bottleneck_channels,
117
+ 3,
118
+ padding=1,
119
+ bias=False,
120
+ )
121
+ self.norm2 = get_norm(norm, bottleneck_channels)
122
+ self.act2 = act_layer()
123
+
124
+ self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
125
+ self.norm3 = get_norm(norm, out_channels)
126
+
127
+ for layer in [self.conv1, self.conv2, self.conv3]:
128
+ weight_init.c2_msra_fill(layer)
129
+ for layer in [self.norm1, self.norm2]:
130
+ layer.weight.data.fill_(1.0)
131
+ layer.bias.data.zero_()
132
+ # zero init last norm layer.
133
+ self.norm3.weight.data.zero_()
134
+ self.norm3.bias.data.zero_()
135
+
136
+ def forward(self, x):
137
+ out = x
138
+ for layer in self.children():
139
+ out = layer(out)
140
+
141
+ out = x + out
142
+ return out
143
+
144
+
145
+ class Block(nn.Module):
146
+ """Transformer blocks with support of window attention and residual propagation blocks"""
147
+
148
+ def __init__(
149
+ self,
150
+ dim,
151
+ num_heads,
152
+ mlp_ratio=4.0,
153
+ qkv_bias=True,
154
+ drop_path=0.0,
155
+ norm_layer=nn.LayerNorm,
156
+ act_layer=nn.GELU,
157
+ use_rel_pos=False,
158
+ rel_pos_zero_init=True,
159
+ window_size=0,
160
+ use_residual_block=False,
161
+ input_size=None,
162
+ ):
163
+ """
164
+ Args:
165
+ dim (int): Number of input channels.
166
+ num_heads (int): Number of attention heads in each ViT block.
167
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
168
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
169
+ drop_path (float): Stochastic depth rate.
170
+ norm_layer (nn.Module): Normalization layer.
171
+ act_layer (nn.Module): Activation layer.
172
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
173
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
174
+ window_size (int): Window size for window attention blocks. If it equals 0, window
175
+ attention is not used.
176
+ use_residual_block (bool): If True, use a residual block after the MLP block.
177
+ input_size (int or None): Input resolution for calculating the relative positional
178
+ parameter size.
179
+ """
180
+ super().__init__()
181
+ self.norm1 = norm_layer(dim)
182
+ self.attn = Attention(
183
+ dim,
184
+ num_heads=num_heads,
185
+ qkv_bias=qkv_bias,
186
+ use_rel_pos=use_rel_pos,
187
+ rel_pos_zero_init=rel_pos_zero_init,
188
+ input_size=input_size if window_size == 0 else (window_size, window_size),
189
+ )
190
+
191
+ from timm.models.layers import DropPath, Mlp
192
+
193
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
194
+ self.norm2 = norm_layer(dim)
195
+ self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer)
196
+
197
+ self.window_size = window_size
198
+
199
+ self.use_residual_block = use_residual_block
200
+ if use_residual_block:
201
+ # Use a residual block with bottleneck channel as dim // 2
202
+ self.residual = ResBottleneckBlock(
203
+ in_channels=dim,
204
+ out_channels=dim,
205
+ bottleneck_channels=dim // 2,
206
+ norm="LN",
207
+ act_layer=act_layer,
208
+ )
209
+
210
+ def forward(self, x):
211
+ shortcut = x
212
+ x = self.norm1(x)
213
+ # Window partition
214
+ if self.window_size > 0:
215
+ H, W = x.shape[1], x.shape[2]
216
+ x, pad_hw = window_partition(x, self.window_size)
217
+
218
+ x = self.attn(x)
219
+ # Reverse window partition
220
+ if self.window_size > 0:
221
+ x = window_unpartition(x, self.window_size, pad_hw, (H, W))
222
+
223
+ x = shortcut + self.drop_path(x)
224
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
225
+
226
+ if self.use_residual_block:
227
+ x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
228
+
229
+ return x
230
+
231
+
232
+ class ViT(Backbone):
233
+ """
234
+ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
235
+ "Exploring Plain Vision Transformer Backbones for Object Detection",
236
+ https://arxiv.org/abs/2203.16527
237
+ """
238
+
239
+ def __init__(
240
+ self,
241
+ img_size=1024,
242
+ patch_size=16,
243
+ in_chans=3,
244
+ embed_dim=768,
245
+ depth=12,
246
+ num_heads=12,
247
+ mlp_ratio=4.0,
248
+ qkv_bias=True,
249
+ drop_path_rate=0.0,
250
+ norm_layer=nn.LayerNorm,
251
+ act_layer=nn.GELU,
252
+ use_abs_pos=True,
253
+ use_rel_pos=False,
254
+ rel_pos_zero_init=True,
255
+ window_size=0,
256
+ window_block_indexes=(),
257
+ residual_block_indexes=(),
258
+ use_act_checkpoint=False,
259
+ pretrain_img_size=224,
260
+ pretrain_use_cls_token=True,
261
+ out_feature="last_feat",
262
+ ):
263
+ """
264
+ Args:
265
+ img_size (int): Input image size.
266
+ patch_size (int): Patch size.
267
+ in_chans (int): Number of input image channels.
268
+ embed_dim (int): Patch embedding dimension.
269
+ depth (int): Depth of ViT.
270
+ num_heads (int): Number of attention heads in each ViT block.
271
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
272
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
273
+ drop_path_rate (float): Stochastic depth rate.
274
+ norm_layer (nn.Module): Normalization layer.
275
+ act_layer (nn.Module): Activation layer.
276
+ use_abs_pos (bool): If True, use absolute positional embeddings.
277
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
278
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
279
+ window_size (int): Window size for window attention blocks.
280
+ window_block_indexes (list): Indexes for blocks using window attention.
281
+ residual_block_indexes (list): Indexes for blocks using conv propagation.
282
+ use_act_checkpoint (bool): If True, use activation checkpointing.
283
+ pretrain_img_size (int): input image size for pretraining models.
284
+ pretrain_use_cls_token (bool): If True, pretraining models use a class token.
285
+ out_feature (str): name of the feature from the last block.
286
+ """
287
+ super().__init__()
288
+ self.pretrain_use_cls_token = pretrain_use_cls_token
289
+
290
+ self.patch_embed = PatchEmbed(
291
+ kernel_size=(patch_size, patch_size),
292
+ stride=(patch_size, patch_size),
293
+ in_chans=in_chans,
294
+ embed_dim=embed_dim,
295
+ )
296
+
297
+ if use_abs_pos:
298
+ # Initialize absolute positional embedding with pretrain image size.
299
+ num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
300
+ num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
301
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
302
+ else:
303
+ self.pos_embed = None
304
+
305
+ # stochastic depth decay rule
306
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
307
+
308
+ self.blocks = nn.ModuleList()
309
+ for i in range(depth):
310
+ block = Block(
311
+ dim=embed_dim,
312
+ num_heads=num_heads,
313
+ mlp_ratio=mlp_ratio,
314
+ qkv_bias=qkv_bias,
315
+ drop_path=dpr[i],
316
+ norm_layer=norm_layer,
317
+ act_layer=act_layer,
318
+ use_rel_pos=use_rel_pos,
319
+ rel_pos_zero_init=rel_pos_zero_init,
320
+ window_size=window_size if i in window_block_indexes else 0,
321
+ use_residual_block=i in residual_block_indexes,
322
+ input_size=(img_size // patch_size, img_size // patch_size),
323
+ )
324
+ if use_act_checkpoint:
325
+ # TODO: use torch.utils.checkpoint
326
+ from fairscale.nn.checkpoint import checkpoint_wrapper
327
+
328
+ block = checkpoint_wrapper(block)
329
+ self.blocks.append(block)
330
+
331
+ self._out_feature_channels = {out_feature: embed_dim}
332
+ self._out_feature_strides = {out_feature: patch_size}
333
+ self._out_features = [out_feature]
334
+
335
+ if self.pos_embed is not None:
336
+ nn.init.trunc_normal_(self.pos_embed, std=0.02)
337
+
338
+ self.apply(self._init_weights)
339
+
340
+ def _init_weights(self, m):
341
+ if isinstance(m, nn.Linear):
342
+ nn.init.trunc_normal_(m.weight, std=0.02)
343
+ if isinstance(m, nn.Linear) and m.bias is not None:
344
+ nn.init.constant_(m.bias, 0)
345
+ elif isinstance(m, nn.LayerNorm):
346
+ nn.init.constant_(m.bias, 0)
347
+ nn.init.constant_(m.weight, 1.0)
348
+
349
+ def forward(self, x):
350
+ x = self.patch_embed(x)
351
+ if self.pos_embed is not None:
352
+ x = x + get_abs_pos(
353
+ self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
354
+ )
355
+
356
+ for blk in self.blocks:
357
+ x = blk(x)
358
+
359
+ outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)}
360
+ return outputs
361
+
362
+
363
+ class SimpleFeaturePyramid(Backbone):
364
+ """
365
+ This module implements SimpleFeaturePyramid in :paper:`vitdet`.
366
+ It creates pyramid features built on top of the input feature map.
367
+ """
368
+
369
+ def __init__(
370
+ self,
371
+ net,
372
+ in_feature,
373
+ out_channels,
374
+ scale_factors,
375
+ top_block=None,
376
+ norm="LN",
377
+ square_pad=0,
378
+ ):
379
+ """
380
+ Args:
381
+ net (Backbone): module representing the subnetwork backbone.
382
+ Must be a subclass of :class:`Backbone`.
383
+ in_feature (str): names of the input feature maps coming
384
+ from the net.
385
+ out_channels (int): number of channels in the output feature maps.
386
+ scale_factors (list[float]): list of scaling factors to upsample or downsample
387
+ the input features for creating pyramid features.
388
+ top_block (nn.Module or None): if provided, an extra operation will
389
+ be performed on the output of the last (smallest resolution)
390
+ pyramid output, and the result will extend the result list. The top_block
391
+ further downsamples the feature map. It must have an attribute
392
+ "num_levels", meaning the number of extra pyramid levels added by
393
+ this block, and "in_feature", which is a string representing
394
+ its input feature (e.g., p5).
395
+ norm (str): the normalization to use.
396
+ square_pad (int): If > 0, require input images to be padded to specific square size.
397
+ """
398
+ super(SimpleFeaturePyramid, self).__init__()
399
+ assert isinstance(net, Backbone)
400
+
401
+ self.scale_factors = scale_factors
402
+
403
+ input_shapes = net.output_shape()
404
+ strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors]
405
+ _assert_strides_are_log2_contiguous(strides)
406
+
407
+ dim = input_shapes[in_feature].channels
408
+ self.stages = []
409
+ use_bias = norm == ""
410
+ for idx, scale in enumerate(scale_factors):
411
+ out_dim = dim
412
+ if scale == 4.0:
413
+ layers = [
414
+ nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
415
+ get_norm(norm, dim // 2),
416
+ nn.GELU(),
417
+ nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
418
+ ]
419
+ out_dim = dim // 4
420
+ elif scale == 2.0:
421
+ layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)]
422
+ out_dim = dim // 2
423
+ elif scale == 1.0:
424
+ layers = []
425
+ elif scale == 0.5:
426
+ layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
427
+ else:
428
+ raise NotImplementedError(f"scale_factor={scale} is not supported yet.")
429
+
430
+ layers.extend(
431
+ [
432
+ Conv2d(
433
+ out_dim,
434
+ out_channels,
435
+ kernel_size=1,
436
+ bias=use_bias,
437
+ norm=get_norm(norm, out_channels),
438
+ ),
439
+ Conv2d(
440
+ out_channels,
441
+ out_channels,
442
+ kernel_size=3,
443
+ padding=1,
444
+ bias=use_bias,
445
+ norm=get_norm(norm, out_channels),
446
+ ),
447
+ ]
448
+ )
449
+ layers = nn.Sequential(*layers)
450
+
451
+ stage = int(math.log2(strides[idx]))
452
+ self.add_module(f"simfp_{stage}", layers)
453
+ self.stages.append(layers)
454
+
455
+ self.net = net
456
+ self.in_feature = in_feature
457
+ self.top_block = top_block
458
+ # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
459
+ self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
460
+ # top block output feature maps.
461
+ if self.top_block is not None:
462
+ for s in range(stage, stage + self.top_block.num_levels):
463
+ self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
464
+
465
+ self._out_features = list(self._out_feature_strides.keys())
466
+ self._out_feature_channels = {k: out_channels for k in self._out_features}
467
+ self._size_divisibility = strides[-1]
468
+ self._square_pad = square_pad
469
+
470
+ @property
471
+ def padding_constraints(self):
472
+ return {
473
+ "size_divisiblity": self._size_divisibility,
474
+ "square_size": self._square_pad,
475
+ }
476
+
477
+ def forward(self, x):
478
+ """
479
+ Args:
480
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
481
+
482
+ Returns:
483
+ dict[str->Tensor]:
484
+ mapping from feature map name to pyramid feature map tensor
485
+ in high to low resolution order. Returned feature names follow the FPN
486
+ convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
487
+ ["p2", "p3", ..., "p6"].
488
+ """
489
+ bottom_up_features = self.net(x)
490
+ features = bottom_up_features[self.in_feature]
491
+ results = []
492
+
493
+ for stage in self.stages:
494
+ results.append(stage(features))
495
+
496
+ if self.top_block is not None:
497
+ if self.top_block.in_feature in bottom_up_features:
498
+ top_block_in_feature = bottom_up_features[self.top_block.in_feature]
499
+ else:
500
+ top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
501
+ results.extend(self.top_block(top_block_in_feature))
502
+ assert len(self._out_features) == len(results)
503
+ return {f: res for f, res in zip(self._out_features, results)}
504
+
505
+
506
+ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
507
+ """
508
+ Calculate lr decay rate for different ViT blocks.
509
+ Args:
510
+ name (string): parameter name.
511
+ lr_decay_rate (float): base lr decay rate.
512
+ num_layers (int): number of ViT blocks.
513
+
514
+ Returns:
515
+ lr decay rate for the given parameter.
516
+ """
517
+ layer_id = num_layers + 1
518
+ if name.startswith("backbone"):
519
+ if ".pos_embed" in name or ".patch_embed" in name:
520
+ layer_id = 0
521
+ elif ".blocks." in name and ".residual." not in name:
522
+ layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
523
+
524
+ return lr_decay_rate ** (num_layers + 1 - layer_id)
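A small sketch exercising the ViT backbone and the layer-wise LR decay helper above. Block imports DropPath and Mlp from timm, so timm must be installed; the tiny hyperparameters and the "backbone." parameter names below are illustrative only, not ViTDet defaults, and the import path is inferred from the file location:

import torch
from annotator.oneformer.detectron2.modeling.backbone.vit import ViT, get_vit_lr_decay_rate

vit = ViT(img_size=256, patch_size=16, embed_dim=192, depth=4, num_heads=3,
          window_size=8, window_block_indexes=(0, 1, 2))
feat = vit(torch.randn(1, 3, 256, 256))["last_feat"]
print(tuple(feat.shape))                          # (1, 192, 16, 16): one stride-16 feature map

# Layer-wise LR decay: parameters in earlier blocks get smaller multipliers.
for name in ("backbone.pos_embed",
             "backbone.blocks.0.attn.qkv.weight",
             "backbone.blocks.3.mlp.fc1.weight"):
    print(name, get_vit_lr_decay_rate(name, lr_decay_rate=0.7, num_layers=4))
# 0.7 ** 5, 0.7 ** 4, 0.7 ** 1 respectively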
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/box_regression.py ADDED
@@ -0,0 +1,369 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import math
3
+ from typing import List, Tuple, Union
4
+ import torch
5
+ from fvcore.nn import giou_loss, smooth_l1_loss
6
+ from torch.nn import functional as F
7
+
8
+ from annotator.oneformer.detectron2.layers import cat, ciou_loss, diou_loss
9
+ from annotator.oneformer.detectron2.structures import Boxes
10
+
11
+ # Value for clamping large dw and dh predictions. The heuristic is that we clamp
12
+ # such that dw and dh are no larger than what would transform a 16px box into a
13
+ # 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
14
+ _DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
15
+
16
+
17
+ __all__ = ["Box2BoxTransform", "Box2BoxTransformRotated", "Box2BoxTransformLinear"]
18
+
19
+
20
+ @torch.jit.script
21
+ class Box2BoxTransform(object):
22
+ """
23
+ The box-to-box transform defined in R-CNN. The transformation is parameterized
24
+ by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height
25
+ by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height).
26
+ """
27
+
28
+ def __init__(
29
+ self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP
30
+ ):
31
+ """
32
+ Args:
33
+ weights (4-element tuple): Scaling factors that are applied to the
34
+ (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set
35
+ such that the deltas have unit variance; now they are treated as
36
+ hyperparameters of the system.
37
+ scale_clamp (float): When predicting deltas, the predicted box scaling
38
+ factors (dw and dh) are clamped such that they are <= scale_clamp.
39
+ """
40
+ self.weights = weights
41
+ self.scale_clamp = scale_clamp
42
+
43
+ def get_deltas(self, src_boxes, target_boxes):
44
+ """
45
+ Get box regression transformation deltas (dx, dy, dw, dh) that can be used
46
+ to transform the `src_boxes` into the `target_boxes`. That is, the relation
47
+ ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
48
+ any delta is too large and is clamped).
49
+
50
+ Args:
51
+ src_boxes (Tensor): source boxes, e.g., object proposals
52
+ target_boxes (Tensor): target of the transformation, e.g., ground-truth
53
+ boxes.
54
+ """
55
+ assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
56
+ assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
57
+
58
+ src_widths = src_boxes[:, 2] - src_boxes[:, 0]
59
+ src_heights = src_boxes[:, 3] - src_boxes[:, 1]
60
+ src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
61
+ src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
62
+
63
+ target_widths = target_boxes[:, 2] - target_boxes[:, 0]
64
+ target_heights = target_boxes[:, 3] - target_boxes[:, 1]
65
+ target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths
66
+ target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights
67
+
68
+ wx, wy, ww, wh = self.weights
69
+ dx = wx * (target_ctr_x - src_ctr_x) / src_widths
70
+ dy = wy * (target_ctr_y - src_ctr_y) / src_heights
71
+ dw = ww * torch.log(target_widths / src_widths)
72
+ dh = wh * torch.log(target_heights / src_heights)
73
+
74
+ deltas = torch.stack((dx, dy, dw, dh), dim=1)
75
+ assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
76
+ return deltas
77
+
78
+ def apply_deltas(self, deltas, boxes):
79
+ """
80
+ Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
81
+
82
+ Args:
83
+ deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
84
+ deltas[i] represents k potentially different class-specific
85
+ box transformations for the single box boxes[i].
86
+ boxes (Tensor): boxes to transform, of shape (N, 4)
87
+ """
88
+ deltas = deltas.float() # ensure fp32 for decoding precision
89
+ boxes = boxes.to(deltas.dtype)
90
+
91
+ widths = boxes[:, 2] - boxes[:, 0]
92
+ heights = boxes[:, 3] - boxes[:, 1]
93
+ ctr_x = boxes[:, 0] + 0.5 * widths
94
+ ctr_y = boxes[:, 1] + 0.5 * heights
95
+
96
+ wx, wy, ww, wh = self.weights
97
+ dx = deltas[:, 0::4] / wx
98
+ dy = deltas[:, 1::4] / wy
99
+ dw = deltas[:, 2::4] / ww
100
+ dh = deltas[:, 3::4] / wh
101
+
102
+ # Prevent sending too large values into torch.exp()
103
+ dw = torch.clamp(dw, max=self.scale_clamp)
104
+ dh = torch.clamp(dh, max=self.scale_clamp)
105
+
106
+ pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
107
+ pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
108
+ pred_w = torch.exp(dw) * widths[:, None]
109
+ pred_h = torch.exp(dh) * heights[:, None]
110
+
111
+ x1 = pred_ctr_x - 0.5 * pred_w
112
+ y1 = pred_ctr_y - 0.5 * pred_h
113
+ x2 = pred_ctr_x + 0.5 * pred_w
114
+ y2 = pred_ctr_y + 0.5 * pred_h
115
+ pred_boxes = torch.stack((x1, y1, x2, y2), dim=-1)
116
+ return pred_boxes.reshape(deltas.shape)
117
+
118
+
119
+ @torch.jit.script
120
+ class Box2BoxTransformRotated(object):
121
+ """
122
+ The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized
123
+ by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height
124
+ by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height),
125
+ and rotates a box's angle by da (radians).
126
+ Note: angles of deltas are in radians while angles of boxes are in degrees.
127
+ """
128
+
129
+ def __init__(
130
+ self,
131
+ weights: Tuple[float, float, float, float, float],
132
+ scale_clamp: float = _DEFAULT_SCALE_CLAMP,
133
+ ):
134
+ """
135
+ Args:
136
+ weights (5-element tuple): Scaling factors that are applied to the
137
+ (dx, dy, dw, dh, da) deltas. These are treated as
138
+ hyperparameters of the system.
139
+ scale_clamp (float): When predicting deltas, the predicted box scaling
140
+ factors (dw and dh) are clamped such that they are <= scale_clamp.
141
+ """
142
+ self.weights = weights
143
+ self.scale_clamp = scale_clamp
144
+
145
+ def get_deltas(self, src_boxes, target_boxes):
146
+ """
147
+ Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used
148
+ to transform the `src_boxes` into the `target_boxes`. That is, the relation
149
+ ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
150
+ any delta is too large and is clamped).
151
+
152
+ Args:
153
+ src_boxes (Tensor): Nx5 source boxes, e.g., object proposals
154
+ target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth
155
+ boxes.
156
+ """
157
+ assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
158
+ assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
159
+
160
+ src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1)
161
+
162
+ target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind(
163
+ target_boxes, dim=1
164
+ )
165
+
166
+ wx, wy, ww, wh, wa = self.weights
167
+ dx = wx * (target_ctr_x - src_ctr_x) / src_widths
168
+ dy = wy * (target_ctr_y - src_ctr_y) / src_heights
169
+ dw = ww * torch.log(target_widths / src_widths)
170
+ dh = wh * torch.log(target_heights / src_heights)
171
+ # Angles of deltas are in radians while angles of boxes are in degrees.
172
+ # the conversion to radians serves as a way to normalize the values
173
+ da = target_angles - src_angles
174
+ da = (da + 180.0) % 360.0 - 180.0 # make it in [-180, 180)
175
+ da *= wa * math.pi / 180.0
176
+
177
+ deltas = torch.stack((dx, dy, dw, dh, da), dim=1)
178
+ assert (
179
+ (src_widths > 0).all().item()
180
+ ), "Input boxes to Box2BoxTransformRotated are not valid!"
181
+ return deltas
182
+
183
+ def apply_deltas(self, deltas, boxes):
184
+ """
185
+ Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`.
186
+
187
+ Args:
188
+ deltas (Tensor): transformation deltas of shape (N, k*5).
189
+ deltas[i] represents box transformation for the single box boxes[i].
190
+ boxes (Tensor): boxes to transform, of shape (N, 5)
191
+ """
192
+ assert deltas.shape[1] % 5 == 0 and boxes.shape[1] == 5
193
+
194
+ boxes = boxes.to(deltas.dtype).unsqueeze(2)
195
+
196
+ ctr_x = boxes[:, 0]
197
+ ctr_y = boxes[:, 1]
198
+ widths = boxes[:, 2]
199
+ heights = boxes[:, 3]
200
+ angles = boxes[:, 4]
201
+
202
+ wx, wy, ww, wh, wa = self.weights
203
+
204
+ dx = deltas[:, 0::5] / wx
205
+ dy = deltas[:, 1::5] / wy
206
+ dw = deltas[:, 2::5] / ww
207
+ dh = deltas[:, 3::5] / wh
208
+ da = deltas[:, 4::5] / wa
209
+
210
+ # Prevent sending too large values into torch.exp()
211
+ dw = torch.clamp(dw, max=self.scale_clamp)
212
+ dh = torch.clamp(dh, max=self.scale_clamp)
213
+
214
+ pred_boxes = torch.zeros_like(deltas)
215
+ pred_boxes[:, 0::5] = dx * widths + ctr_x # x_ctr
216
+ pred_boxes[:, 1::5] = dy * heights + ctr_y # y_ctr
217
+ pred_boxes[:, 2::5] = torch.exp(dw) * widths # width
218
+ pred_boxes[:, 3::5] = torch.exp(dh) * heights # height
219
+
220
+ # Following original RRPN implementation,
221
+ # angles of deltas are in radians while angles of boxes are in degrees.
222
+ pred_angle = da * 180.0 / math.pi + angles
223
+ pred_angle = (pred_angle + 180.0) % 360.0 - 180.0 # make it in [-180, 180)
224
+
225
+ pred_boxes[:, 4::5] = pred_angle
226
+
227
+ return pred_boxes
228
+
229
+
230
+ class Box2BoxTransformLinear(object):
231
+ """
232
+ The linear box-to-box transform defined in FCOS. The transformation is parameterized
233
+ by the distances from the center of the (square) src box to the 4 edges of the target box.
234
+ """
235
+
236
+ def __init__(self, normalize_by_size=True):
237
+ """
238
+ Args:
239
+ normalize_by_size: normalize deltas by the size of src (anchor) boxes.
240
+ """
241
+ self.normalize_by_size = normalize_by_size
242
+
243
+ def get_deltas(self, src_boxes, target_boxes):
244
+ """
245
+ Get box regression transformation deltas (dx1, dy1, dx2, dy2) that can be used
246
+ to transform the `src_boxes` into the `target_boxes`. That is, the relation
247
+ ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true.
248
+ The center of src must be inside target boxes.
249
+
250
+ Args:
251
+ src_boxes (Tensor): square source boxes, e.g., anchors
252
+ target_boxes (Tensor): target of the transformation, e.g., ground-truth
253
+ boxes.
254
+ """
255
+ assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
256
+ assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
257
+
258
+ src_ctr_x = 0.5 * (src_boxes[:, 0] + src_boxes[:, 2])
259
+ src_ctr_y = 0.5 * (src_boxes[:, 1] + src_boxes[:, 3])
260
+
261
+ target_l = src_ctr_x - target_boxes[:, 0]
262
+ target_t = src_ctr_y - target_boxes[:, 1]
263
+ target_r = target_boxes[:, 2] - src_ctr_x
264
+ target_b = target_boxes[:, 3] - src_ctr_y
265
+
266
+ deltas = torch.stack((target_l, target_t, target_r, target_b), dim=1)
267
+ if self.normalize_by_size:
268
+ stride_w = src_boxes[:, 2] - src_boxes[:, 0]
269
+ stride_h = src_boxes[:, 3] - src_boxes[:, 1]
270
+ strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
271
+ deltas = deltas / strides
272
+
273
+ return deltas
274
+
275
+ def apply_deltas(self, deltas, boxes):
276
+ """
277
+ Apply transformation `deltas` (dx1, dy1, dx2, dy2) to `boxes`.
278
+
279
+ Args:
280
+ deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
281
+ deltas[i] represents k potentially different class-specific
282
+ box transformations for the single box boxes[i].
283
+ boxes (Tensor): boxes to transform, of shape (N, 4)
284
+ """
285
+ # Ensure the output is a valid box. See Sec 2.1 of https://arxiv.org/abs/2006.09214
286
+ deltas = F.relu(deltas)
287
+ boxes = boxes.to(deltas.dtype)
288
+
289
+ ctr_x = 0.5 * (boxes[:, 0] + boxes[:, 2])
290
+ ctr_y = 0.5 * (boxes[:, 1] + boxes[:, 3])
291
+ if self.normalize_by_size:
292
+ stride_w = boxes[:, 2] - boxes[:, 0]
293
+ stride_h = boxes[:, 3] - boxes[:, 1]
294
+ strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
295
+ deltas = deltas * strides
296
+
297
+ l = deltas[:, 0::4]
298
+ t = deltas[:, 1::4]
299
+ r = deltas[:, 2::4]
300
+ b = deltas[:, 3::4]
301
+
302
+ pred_boxes = torch.zeros_like(deltas)
303
+ pred_boxes[:, 0::4] = ctr_x[:, None] - l # x1
304
+ pred_boxes[:, 1::4] = ctr_y[:, None] - t # y1
305
+ pred_boxes[:, 2::4] = ctr_x[:, None] + r # x2
306
+ pred_boxes[:, 3::4] = ctr_y[:, None] + b # y2
307
+ return pred_boxes
308
+
309
+
310
+ def _dense_box_regression_loss(
311
+ anchors: List[Union[Boxes, torch.Tensor]],
312
+ box2box_transform: Box2BoxTransform,
313
+ pred_anchor_deltas: List[torch.Tensor],
314
+ gt_boxes: List[torch.Tensor],
315
+ fg_mask: torch.Tensor,
316
+ box_reg_loss_type="smooth_l1",
317
+ smooth_l1_beta=0.0,
318
+ ):
319
+ """
320
+ Compute loss for dense multi-level box regression.
321
+ Loss is accumulated over ``fg_mask``.
322
+
323
+ Args:
324
+ anchors: #lvl anchor boxes, each is (HixWixA, 4)
325
+ pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
326
+ gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
327
+ fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
328
+ box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou",
329
+ "diou", "ciou".
330
+ smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
331
+ use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
332
+ """
333
+ if isinstance(anchors[0], Boxes):
334
+ anchors = type(anchors[0]).cat(anchors).tensor # (R, 4)
335
+ else:
336
+ anchors = cat(anchors)
337
+ if box_reg_loss_type == "smooth_l1":
338
+ gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
339
+ gt_anchor_deltas = torch.stack(gt_anchor_deltas) # (N, R, 4)
340
+ loss_box_reg = smooth_l1_loss(
341
+ cat(pred_anchor_deltas, dim=1)[fg_mask],
342
+ gt_anchor_deltas[fg_mask],
343
+ beta=smooth_l1_beta,
344
+ reduction="sum",
345
+ )
346
+ elif box_reg_loss_type == "giou":
347
+ pred_boxes = [
348
+ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
349
+ ]
350
+ loss_box_reg = giou_loss(
351
+ torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
352
+ )
353
+ elif box_reg_loss_type == "diou":
354
+ pred_boxes = [
355
+ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
356
+ ]
357
+ loss_box_reg = diou_loss(
358
+ torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
359
+ )
360
+ elif box_reg_loss_type == "ciou":
361
+ pred_boxes = [
362
+ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
363
+ ]
364
+ loss_box_reg = ciou_loss(
365
+ torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
366
+ )
367
+ else:
368
+ raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")
369
+ return loss_box_reg
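A quick sanity-check sketch of the delta encoding defined above: `apply_deltas` should invert `get_deltas`. The boxes and weights below are arbitrary illustrative values, and `Box2BoxTransform` is assumed importable from this file.

import torch

transform = Box2BoxTransform(weights=(10.0, 10.0, 5.0, 5.0))
src = torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 25.0, 15.0]])  # XYXY proposals
tgt = torch.tensor([[1.0, 2.0, 11.0, 14.0], [4.0, 6.0, 30.0, 18.0]])  # XYXY targets
deltas = transform.get_deltas(src, tgt)          # (2, 4) regression targets
recovered = transform.apply_deltas(deltas, src)  # decode the deltas back into boxes
assert torch.allclose(recovered, tgt, atol=1e-4)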
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/matcher.py ADDED
@@ -0,0 +1,127 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import List
3
+ import torch
4
+
5
+ from annotator.oneformer.detectron2.layers import nonzero_tuple
6
+
7
+
8
+ # TODO: the name is too general
9
+ class Matcher(object):
10
+ """
11
+ This class assigns to each predicted "element" (e.g., a box) a ground-truth
12
+ element. Each predicted element will have exactly zero or one matches; each
13
+ ground-truth element may be matched to zero or more predicted elements.
14
+
15
+ The matching is determined by the MxN match_quality_matrix, which characterizes
16
+ how well each (ground-truth, prediction) pair matches. For example,
17
+ if the elements are boxes, this matrix may contain box intersection-over-union
18
+ overlap values.
19
+
20
+ The matcher returns (a) a vector of length N containing the index of the
21
+ ground-truth element m in [0, M) that matches to prediction n in [0, N).
22
+ (b) a vector of length N containing the labels for each prediction.
23
+ """
24
+
25
+ def __init__(
26
+ self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False
27
+ ):
28
+ """
29
+ Args:
30
+ thresholds (list): a list of thresholds used to stratify predictions
31
+ into levels.
32
+ labels (list): a list of values to label predictions belonging at
33
+ each level. A label can be one of {-1, 0, 1} signifying
34
+ {ignore, negative class, positive class}, respectively.
35
+ allow_low_quality_matches (bool): if True, produce additional matches
36
+ for predictions with maximum match quality lower than high_threshold.
37
+ See set_low_quality_matches_ for more details.
38
+
39
+ For example,
40
+ thresholds = [0.3, 0.5]
41
+ labels = [0, -1, 1]
42
+ All predictions with iou < 0.3 will be marked with 0 and
43
+ thus will be considered as false positives while training.
44
+ All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
45
+ thus will be ignored.
46
+ All predictions with 0.5 <= iou will be marked with 1 and
47
+ thus will be considered as true positives.
48
+ """
49
+ # Add -inf and +inf to first and last position in thresholds
50
+ thresholds = thresholds[:]
51
+ assert thresholds[0] > 0
52
+ thresholds.insert(0, -float("inf"))
53
+ thresholds.append(float("inf"))
54
+ # Currently torchscript does not support all + generator
55
+ assert all([low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])])
56
+ assert all([l in [-1, 0, 1] for l in labels])
57
+ assert len(labels) == len(thresholds) - 1
58
+ self.thresholds = thresholds
59
+ self.labels = labels
60
+ self.allow_low_quality_matches = allow_low_quality_matches
61
+
62
+ def __call__(self, match_quality_matrix):
63
+ """
64
+ Args:
65
+ match_quality_matrix (Tensor[float]): an MxN tensor, containing the
66
+ pairwise quality between M ground-truth elements and N predicted
67
+ elements. All elements must be >= 0 (due to the use of `torch.nonzero`
68
+ for selecting indices in :meth:`set_low_quality_matches_`).
69
+
70
+ Returns:
71
+ matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
72
+ ground-truth index in [0, M)
73
+ match_labels (Tensor[int8]): a vector of length N, where match_labels[i] indicates
74
+ whether a prediction is a true or false positive or ignored
75
+ """
76
+ assert match_quality_matrix.dim() == 2
77
+ if match_quality_matrix.numel() == 0:
78
+ default_matches = match_quality_matrix.new_full(
79
+ (match_quality_matrix.size(1),), 0, dtype=torch.int64
80
+ )
81
+ # When no gt boxes exist, we define IOU = 0 and therefore set labels
82
+ # to `self.labels[0]`, which usually defaults to background class 0
83
+ # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
84
+ default_match_labels = match_quality_matrix.new_full(
85
+ (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
86
+ )
87
+ return default_matches, default_match_labels
88
+
89
+ assert torch.all(match_quality_matrix >= 0)
90
+
91
+ # match_quality_matrix is M (gt) x N (predicted)
92
+ # Max over gt elements (dim 0) to find best gt candidate for each prediction
93
+ matched_vals, matches = match_quality_matrix.max(dim=0)
94
+
95
+ match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
96
+
97
+ for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
98
+ low_high = (matched_vals >= low) & (matched_vals < high)
99
+ match_labels[low_high] = l
100
+
101
+ if self.allow_low_quality_matches:
102
+ self.set_low_quality_matches_(match_labels, match_quality_matrix)
103
+
104
+ return matches, match_labels
105
+
106
+ def set_low_quality_matches_(self, match_labels, match_quality_matrix):
107
+ """
108
+ Produce additional matches for predictions that have only low-quality matches.
109
+ Specifically, for each ground-truth G find the set of predictions that have
110
+ maximum overlap with it (including ties); for each prediction in that set, if
111
+ it is unmatched, then match it to the ground-truth G.
112
+
113
+ This function implements the RPN assignment case (i) in Sec. 3.1.2 of
114
+ :paper:`Faster R-CNN`.
115
+ """
116
+ # For each gt, find the prediction with which it has highest quality
117
+ highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
118
+ # Find the highest quality match available, even if it is low, including ties.
119
+ # Note that the matches qualities must be positive due to the use of
120
+ # `torch.nonzero`.
121
+ _, pred_inds_with_highest_quality = nonzero_tuple(
122
+ match_quality_matrix == highest_quality_foreach_gt[:, None]
123
+ )
124
+ # If an anchor was labeled positive only due to a low-quality match
125
+ # with gt_A, but it has larger overlap with gt_B, its matched index will still be gt_B.
126
+ # This follows the implementation in Detectron, and is found to have no significant impact.
127
+ match_labels[pred_inds_with_highest_quality] = 1
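A small sketch of the matching semantics documented above, on a hand-written 2x4 IoU matrix (values are arbitrary): predictions with IoU < 0.3 become background (0), 0.3 <= IoU < 0.5 are ignored (-1), and IoU >= 0.5 are foreground (1). `Matcher` is assumed importable from this file.

import torch

iou = torch.tensor([[0.10, 0.40, 0.70, 0.05],    # gt 0 vs 4 predictions
                    [0.20, 0.35, 0.10, 0.60]])   # gt 1 vs 4 predictions
matcher = Matcher(thresholds=[0.3, 0.5], labels=[0, -1, 1], allow_low_quality_matches=False)
matches, match_labels = matcher(iou)
# matches      -> tensor([1, 0, 0, 1])                       best gt index per prediction
# match_labels -> tensor([ 0, -1,  1,  1], dtype=torch.int8) background / ignore / foreground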
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+
4
+ from .build import META_ARCH_REGISTRY, build_model # isort:skip
5
+
6
+ from .panoptic_fpn import PanopticFPN
7
+
8
+ # import all the meta_arch, so they will be registered
9
+ from .rcnn import GeneralizedRCNN, ProposalNetwork
10
+ from .dense_detector import DenseDetector
11
+ from .retinanet import RetinaNet
12
+ from .fcos import FCOS
13
+ from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head
14
+
15
+
16
+ __all__ = list(globals().keys())
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/build.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+
4
+ from annotator.oneformer.detectron2.utils.logger import _log_api_usage
5
+ from annotator.oneformer.detectron2.utils.registry import Registry
6
+
7
+ META_ARCH_REGISTRY = Registry("META_ARCH") # noqa F401 isort:skip
8
+ META_ARCH_REGISTRY.__doc__ = """
9
+ Registry for meta-architectures, i.e. the whole model.
10
+
11
+ The registered object will be called with `obj(cfg)`
12
+ and expected to return a `nn.Module` object.
13
+ """
14
+
15
+
16
+ def build_model(cfg):
17
+ """
18
+ Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
19
+ Note that it does not load any weights from ``cfg``.
20
+ """
21
+ meta_arch = cfg.MODEL.META_ARCHITECTURE
22
+ model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
23
+ _log_api_usage("modeling.meta_arch." + meta_arch)
24
+ return model
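A minimal sketch of how a model plugs into this registry. The class name and its internals are hypothetical; only the decorator and the `obj(cfg)` calling convention come from the file above.

import torch
from torch import nn

@META_ARCH_REGISTRY.register()
class ToyDetector(nn.Module):
    """Hypothetical meta-architecture, registered under its class name."""

    def __init__(self, cfg):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)

    def forward(self, batched_inputs):
        # batched_inputs follows the detectron2 convention: a list of dicts with an "image" tensor.
        return [{"features": self.conv(d["image"].float()[None])} for d in batched_inputs]

# With cfg.MODEL.META_ARCHITECTURE = "ToyDetector", build_model(cfg) looks the
# class up in META_ARCH_REGISTRY and instantiates it as ToyDetector(cfg).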
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/dense_detector.py ADDED
@@ -0,0 +1,294 @@
1
+ import numpy as np
2
+ from typing import Dict, List, Optional, Tuple
3
+ import torch
4
+ from torch import Tensor, nn
5
+
6
+ from annotator.oneformer.detectron2.data.detection_utils import convert_image_to_rgb
7
+ from annotator.oneformer.detectron2.layers import move_device_like
8
+ from annotator.oneformer.detectron2.modeling import Backbone
9
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances
10
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
11
+
12
+ from ..postprocessing import detector_postprocess
13
+
14
+
15
+ def permute_to_N_HWA_K(tensor, K: int):
16
+ """
17
+ Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K)
18
+ """
19
+ assert tensor.dim() == 4, tensor.shape
20
+ N, _, H, W = tensor.shape
21
+ tensor = tensor.view(N, -1, K, H, W)
22
+ tensor = tensor.permute(0, 3, 4, 1, 2)
23
+ tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K)
24
+ return tensor
25
+
26
+
27
+ class DenseDetector(nn.Module):
28
+ """
29
+ Base class for dense detectors. We define a dense detector as a fully-convolutional model that
30
+ makes per-pixel (i.e. dense) predictions.
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ backbone: Backbone,
36
+ head: nn.Module,
37
+ head_in_features: Optional[List[str]] = None,
38
+ *,
39
+ pixel_mean,
40
+ pixel_std,
41
+ ):
42
+ """
43
+ Args:
44
+ backbone: backbone module
45
+ head: head module
46
+ head_in_features: backbone features to use in head. Default to all backbone features.
47
+ pixel_mean (Tuple[float]):
48
+ Values to be used for image normalization (BGR order).
49
+ To train on images with a different number of channels, set different mean & std.
50
+ Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
51
+ pixel_std (Tuple[float]):
52
+ When using pre-trained models in Detectron1 or any MSRA models,
53
+ std has been absorbed into its conv1 weights, so the std needs to be set to 1.
54
+ Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
55
+ """
56
+ super().__init__()
57
+
58
+ self.backbone = backbone
59
+ self.head = head
60
+ if head_in_features is None:
61
+ shapes = self.backbone.output_shape()
62
+ self.head_in_features = sorted(shapes.keys(), key=lambda x: shapes[x].stride)
63
+ else:
64
+ self.head_in_features = head_in_features
65
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
66
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
67
+
68
+ @property
69
+ def device(self):
70
+ return self.pixel_mean.device
71
+
72
+ def _move_to_current_device(self, x):
73
+ return move_device_like(x, self.pixel_mean)
74
+
75
+ def forward(self, batched_inputs: List[Dict[str, Tensor]]):
76
+ """
77
+ Args:
78
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
79
+ Each item in the list contains the inputs for one image.
80
+ For now, each item in the list is a dict that contains:
81
+
82
+ * image: Tensor, image in (C, H, W) format.
83
+ * instances: Instances
84
+
85
+ Other information that's included in the original dicts, such as:
86
+
87
+ * "height", "width" (int): the output resolution of the model, used in inference.
88
+ See :meth:`postprocess` for details.
89
+
90
+ Returns:
91
+ In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the
92
+ loss. Used during training only. In inference, the standard output format, described
93
+ in :doc:`/tutorials/models`.
94
+ """
95
+ images = self.preprocess_image(batched_inputs)
96
+ features = self.backbone(images.tensor)
97
+ features = [features[f] for f in self.head_in_features]
98
+ predictions = self.head(features)
99
+
100
+ if self.training:
101
+ assert not torch.jit.is_scripting(), "Not supported"
102
+ assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
103
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
104
+ return self.forward_training(images, features, predictions, gt_instances)
105
+ else:
106
+ results = self.forward_inference(images, features, predictions)
107
+ if torch.jit.is_scripting():
108
+ return results
109
+
110
+ processed_results = []
111
+ for results_per_image, input_per_image, image_size in zip(
112
+ results, batched_inputs, images.image_sizes
113
+ ):
114
+ height = input_per_image.get("height", image_size[0])
115
+ width = input_per_image.get("width", image_size[1])
116
+ r = detector_postprocess(results_per_image, height, width)
117
+ processed_results.append({"instances": r})
118
+ return processed_results
119
+
120
+ def forward_training(self, images, features, predictions, gt_instances):
121
+ raise NotImplementedError()
122
+
123
+ def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]):
124
+ """
125
+ Normalize, pad and batch the input images.
126
+ """
127
+ images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
128
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
129
+ images = ImageList.from_tensors(
130
+ images,
131
+ self.backbone.size_divisibility,
132
+ padding_constraints=self.backbone.padding_constraints,
133
+ )
134
+ return images
135
+
136
+ def _transpose_dense_predictions(
137
+ self, predictions: List[List[Tensor]], dims_per_anchor: List[int]
138
+ ) -> List[List[Tensor]]:
139
+ """
140
+ Transpose the dense per-level predictions.
141
+
142
+ Args:
143
+ predictions: a list of outputs, each is a list of per-level
144
+ predictions with shape (N, Ai x K, Hi, Wi), where N is the
145
+ number of images, Ai is the number of anchors per location on
146
+ level i, K is the dimension of predictions per anchor.
147
+ dims_per_anchor: the value of K for each prediction, e.g. 4 for
148
+ box prediction, #classes for classification prediction.
149
+
150
+ Returns:
151
+ List[List[Tensor]]: each prediction is transposed to (N, Hi x Wi x Ai, K).
152
+ """
153
+ assert len(predictions) == len(dims_per_anchor)
154
+ res: List[List[Tensor]] = []
155
+ for pred, dim_per_anchor in zip(predictions, dims_per_anchor):
156
+ pred = [permute_to_N_HWA_K(x, dim_per_anchor) for x in pred]
157
+ res.append(pred)
158
+ return res
159
+
160
+ def _ema_update(self, name: str, value: float, initial_value: float, momentum: float = 0.9):
161
+ """
162
+ Apply EMA update to `self.name` using `value`.
163
+
164
+ This is mainly used for loss normalizer. In Detectron1, loss is normalized by number
165
+ of foreground samples in the batch. When batch size is 1 per GPU, #foreground has a
166
+ large variance and using it leads to lower performance. Therefore we maintain an EMA of
167
+ #foreground to stabilize the normalizer.
168
+
169
+ Args:
170
+ name: name of the normalizer
171
+ value: the new value to update
172
+ initial_value: the initial value to start with
173
+ momentum: momentum of EMA
174
+
175
+ Returns:
176
+ float: the updated EMA value
177
+ """
178
+ if hasattr(self, name):
179
+ old = getattr(self, name)
180
+ else:
181
+ old = initial_value
182
+ new = old * momentum + value * (1 - momentum)
183
+ setattr(self, name, new)
184
+ return new
185
+
186
+ def _decode_per_level_predictions(
187
+ self,
188
+ anchors: Boxes,
189
+ pred_scores: Tensor,
190
+ pred_deltas: Tensor,
191
+ score_thresh: float,
192
+ topk_candidates: int,
193
+ image_size: Tuple[int, int],
194
+ ) -> Instances:
195
+ """
196
+ Decode boxes and classification predictions of one feature level, by
197
+ the following steps:
198
+ 1. filter the predictions based on score threshold and top K scores.
199
+ 2. transform the box regression outputs
200
+ 3. return the predicted scores, classes and boxes
201
+
202
+ Args:
203
+ anchors: Boxes, anchor for this feature level
204
+ pred_scores: HxWxA,K
205
+ pred_deltas: HxWxA,4
206
+
207
+ Returns:
208
+ Instances: with fields "scores", "pred_boxes", "pred_classes".
209
+ """
210
+ # Apply two filtering steps to make NMS faster.
211
+ # 1. Keep boxes with confidence score higher than threshold
212
+ keep_idxs = pred_scores > score_thresh
213
+ pred_scores = pred_scores[keep_idxs]
214
+ topk_idxs = torch.nonzero(keep_idxs) # Kx2
215
+
216
+ # 2. Keep top k top scoring boxes only
217
+ topk_idxs_size = topk_idxs.shape[0]
218
+ if isinstance(topk_idxs_size, Tensor):
219
+ # It's a tensor in tracing
220
+ num_topk = torch.clamp(topk_idxs_size, max=topk_candidates)
221
+ else:
222
+ num_topk = min(topk_idxs_size, topk_candidates)
223
+ pred_scores, idxs = pred_scores.topk(num_topk)
224
+ topk_idxs = topk_idxs[idxs]
225
+
226
+ anchor_idxs, classes_idxs = topk_idxs.unbind(dim=1)
227
+
228
+ pred_boxes = self.box2box_transform.apply_deltas(
229
+ pred_deltas[anchor_idxs], anchors.tensor[anchor_idxs]
230
+ )
231
+ return Instances(
232
+ image_size, pred_boxes=Boxes(pred_boxes), scores=pred_scores, pred_classes=classes_idxs
233
+ )
234
+
235
+ def _decode_multi_level_predictions(
236
+ self,
237
+ anchors: List[Boxes],
238
+ pred_scores: List[Tensor],
239
+ pred_deltas: List[Tensor],
240
+ score_thresh: float,
241
+ topk_candidates: int,
242
+ image_size: Tuple[int, int],
243
+ ) -> Instances:
244
+ """
245
+ Run `_decode_per_level_predictions` for all feature levels and concat the results.
246
+ """
247
+ predictions = [
248
+ self._decode_per_level_predictions(
249
+ anchors_i,
250
+ box_cls_i,
251
+ box_reg_i,
252
+ self.test_score_thresh,
253
+ self.test_topk_candidates,
254
+ image_size,
255
+ )
256
+ # Iterate over every feature level
257
+ for box_cls_i, box_reg_i, anchors_i in zip(pred_scores, pred_deltas, anchors)
258
+ ]
259
+ return predictions[0].cat(predictions) # 'Instances.cat' is not scriptable but this is
260
+
261
+ def visualize_training(self, batched_inputs, results):
262
+ """
263
+ A function used to visualize ground truth images and final network predictions.
264
+ It shows ground truth bounding boxes on the original image and up to 20
265
+ predicted object bounding boxes on the original image.
266
+
267
+ Args:
268
+ batched_inputs (list): a list that contains input to the model.
269
+ results (List[Instances]): a list of #images elements returned by forward_inference().
270
+ """
271
+ from annotator.oneformer.detectron2.utils.visualizer import Visualizer
272
+
273
+ assert len(batched_inputs) == len(
274
+ results
275
+ ), "Cannot visualize inputs and results of different sizes"
276
+ storage = get_event_storage()
277
+ max_boxes = 20
278
+
279
+ image_index = 0 # only visualize a single image
280
+ img = batched_inputs[image_index]["image"]
281
+ img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
282
+ v_gt = Visualizer(img, None)
283
+ v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
284
+ anno_img = v_gt.get_image()
285
+ processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
286
+ predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()
287
+
288
+ v_pred = Visualizer(img, None)
289
+ v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
290
+ prop_img = v_pred.get_image()
291
+ vis_img = np.vstack((anno_img, prop_img))
292
+ vis_img = vis_img.transpose(2, 0, 1)
293
+ vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
294
+ storage.put_image(vis_name, vis_img)
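A shape-only sketch of what `permute_to_N_HWA_K` does; the sizes are arbitrary and the function is assumed importable from this file.

import torch

# One head output level: batch N=2, A=9 anchors x K=4 values per anchor, on a 5x7 feature map.
x = torch.randn(2, 9 * 4, 5, 7)          # (N, A*K, H, W)
y = permute_to_N_HWA_K(x, 4)             # -> (N, H*W*A, K)
print(y.shape)                           # torch.Size([2, 315, 4]), since 5*7*9 = 315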
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/fcos.py ADDED
@@ -0,0 +1,328 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+
3
+ import logging
4
+ from typing import List, Optional, Tuple
5
+ import torch
6
+ from fvcore.nn import sigmoid_focal_loss_jit
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+
10
+ from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms
11
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_point_box_distance
12
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
13
+
14
+ from ..anchor_generator import DefaultAnchorGenerator
15
+ from ..backbone import Backbone
16
+ from ..box_regression import Box2BoxTransformLinear, _dense_box_regression_loss
17
+ from .dense_detector import DenseDetector
18
+ from .retinanet import RetinaNetHead
19
+
20
+ __all__ = ["FCOS"]
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class FCOS(DenseDetector):
26
+ """
27
+ Implement FCOS in :paper:`fcos`.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ *,
33
+ backbone: Backbone,
34
+ head: nn.Module,
35
+ head_in_features: Optional[List[str]] = None,
36
+ box2box_transform=None,
37
+ num_classes,
38
+ center_sampling_radius: float = 1.5,
39
+ focal_loss_alpha=0.25,
40
+ focal_loss_gamma=2.0,
41
+ test_score_thresh=0.2,
42
+ test_topk_candidates=1000,
43
+ test_nms_thresh=0.6,
44
+ max_detections_per_image=100,
45
+ pixel_mean,
46
+ pixel_std,
47
+ ):
48
+ """
49
+ Args:
50
+ center_sampling_radius: radius of the "center" of a groundtruth box,
51
+ within which all anchor points are labeled positive.
52
+ Other arguments mean the same as in :class:`RetinaNet`.
53
+ """
54
+ super().__init__(
55
+ backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
56
+ )
57
+
58
+ self.num_classes = num_classes
59
+
60
+ # FCOS uses one anchor point per location.
61
+ # We represent the anchor point by a box whose size equals the anchor stride.
62
+ feature_shapes = backbone.output_shape()
63
+ fpn_strides = [feature_shapes[k].stride for k in self.head_in_features]
64
+ self.anchor_generator = DefaultAnchorGenerator(
65
+ sizes=[[k] for k in fpn_strides], aspect_ratios=[1.0], strides=fpn_strides
66
+ )
67
+
68
+ # FCOS parameterizes box regression by a linear transform,
69
+ # where predictions are normalized by anchor stride (equal to anchor size).
70
+ if box2box_transform is None:
71
+ box2box_transform = Box2BoxTransformLinear(normalize_by_size=True)
72
+ self.box2box_transform = box2box_transform
73
+
74
+ self.center_sampling_radius = float(center_sampling_radius)
75
+
76
+ # Loss parameters:
77
+ self.focal_loss_alpha = focal_loss_alpha
78
+ self.focal_loss_gamma = focal_loss_gamma
79
+
80
+ # Inference parameters:
81
+ self.test_score_thresh = test_score_thresh
82
+ self.test_topk_candidates = test_topk_candidates
83
+ self.test_nms_thresh = test_nms_thresh
84
+ self.max_detections_per_image = max_detections_per_image
85
+
86
+ def forward_training(self, images, features, predictions, gt_instances):
87
+ # Transpose the Hi*Wi*A dimension to the middle:
88
+ pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
89
+ predictions, [self.num_classes, 4, 1]
90
+ )
91
+ anchors = self.anchor_generator(features)
92
+ gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
93
+ return self.losses(
94
+ anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
95
+ )
96
+
97
+ @torch.no_grad()
98
+ def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]):
99
+ """
100
+ Match ground-truth boxes to a set of multi-level anchors.
101
+
102
+ Args:
103
+ gt_boxes: Ground-truth boxes from instances of an image.
104
+ anchors: List of anchors for each feature map (of different scales).
105
+
106
+ Returns:
107
+ torch.Tensor
108
+ A tensor of shape `(M, R)`, given `M` ground-truth boxes and total
109
+ `R` anchor points from all feature levels, indicating the quality
110
+ of match between m-th box and r-th anchor. Higher value indicates
111
+ better match.
112
+ """
113
+ # Naming convention: (M = ground-truth boxes, R = anchor points)
114
+ # Anchor points are represented as square boxes of size = stride.
115
+ num_anchors_per_level = [len(x) for x in anchors]
116
+ anchors = Boxes.cat(anchors) # (R, 4)
117
+ anchor_centers = anchors.get_centers() # (R, 2)
118
+ anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0] # (R, )
119
+
120
+ lower_bound = anchor_sizes * 4
121
+ lower_bound[: num_anchors_per_level[0]] = 0
122
+ upper_bound = anchor_sizes * 8
123
+ upper_bound[-num_anchors_per_level[-1] :] = float("inf")
124
+
125
+ gt_centers = gt_boxes.get_centers()
126
+
127
+ # FCOS with center sampling: anchor point must be close enough to
128
+ # ground-truth box center.
129
+ center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_()
130
+ sampling_regions = self.center_sampling_radius * anchor_sizes[None, :]
131
+
132
+ match_quality_matrix = center_dists.max(dim=2).values < sampling_regions
133
+
134
+ pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes)
135
+ pairwise_dist = pairwise_dist.permute(1, 0, 2) # (M, R, 4)
136
+
137
+ # The original FCOS anchor matching rule: anchor point must be inside GT.
138
+ match_quality_matrix &= pairwise_dist.min(dim=2).values > 0
139
+
140
+ # Multilevel anchor matching in FCOS: each anchor is only responsible
141
+ # for certain scale range.
142
+ pairwise_dist = pairwise_dist.max(dim=2).values
143
+ match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & (
144
+ pairwise_dist < upper_bound[None, :]
145
+ )
146
+ # Match the GT box with minimum area, if there are multiple GT matches.
147
+ gt_areas = gt_boxes.area() # (M, )
148
+
149
+ match_quality_matrix = match_quality_matrix.to(torch.float32)
150
+ match_quality_matrix *= 1e8 - gt_areas[:, None]
151
+ return match_quality_matrix # (M, R)
152
+
153
+ @torch.no_grad()
154
+ def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
155
+ """
156
+ Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS
157
+ anchor matching rule.
158
+
159
+ Unlike RetinaNet, there are no ignored anchors.
160
+ """
161
+
162
+ gt_labels, matched_gt_boxes = [], []
163
+
164
+ for inst in gt_instances:
165
+ if len(inst) > 0:
166
+ match_quality_matrix = self._match_anchors(inst.gt_boxes, anchors)
167
+
168
+ # Find matched ground-truth box per anchor. Un-matched anchors are
169
+ # assigned -1. This is equivalent to using an anchor matcher as used
170
+ # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])`
171
+ match_quality, matched_idxs = match_quality_matrix.max(dim=0)
172
+ matched_idxs[match_quality < 1e-5] = -1
173
+
174
+ matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(min=0)]
175
+ gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)]
176
+
177
+ # Anchors with matched_idxs = -1 are labeled background.
178
+ gt_labels_i[matched_idxs < 0] = self.num_classes
179
+ else:
180
+ matched_gt_boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor)
181
+ gt_labels_i = torch.full(
182
+ (len(matched_gt_boxes_i),),
183
+ fill_value=self.num_classes,
184
+ dtype=torch.long,
185
+ device=matched_gt_boxes_i.device,
186
+ )
187
+
188
+ gt_labels.append(gt_labels_i)
189
+ matched_gt_boxes.append(matched_gt_boxes_i)
190
+
191
+ return gt_labels, matched_gt_boxes
192
+
193
+ def losses(
194
+ self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
195
+ ):
196
+ """
197
+ This method is almost identical to :meth:`RetinaNet.losses`, with an extra
198
+ "loss_centerness" in the returned dict.
199
+ """
200
+ num_images = len(gt_labels)
201
+ gt_labels = torch.stack(gt_labels) # (M, R)
202
+
203
+ pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
204
+ num_pos_anchors = pos_mask.sum().item()
205
+ get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
206
+ normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 300)
207
+
208
+ # classification and regression loss
209
+ gt_labels_target = F.one_hot(gt_labels, num_classes=self.num_classes + 1)[
210
+ :, :, :-1
211
+ ] # no loss for the last (background) class
212
+ loss_cls = sigmoid_focal_loss_jit(
213
+ torch.cat(pred_logits, dim=1),
214
+ gt_labels_target.to(pred_logits[0].dtype),
215
+ alpha=self.focal_loss_alpha,
216
+ gamma=self.focal_loss_gamma,
217
+ reduction="sum",
218
+ )
219
+
220
+ loss_box_reg = _dense_box_regression_loss(
221
+ anchors,
222
+ self.box2box_transform,
223
+ pred_anchor_deltas,
224
+ gt_boxes,
225
+ pos_mask,
226
+ box_reg_loss_type="giou",
227
+ )
228
+
229
+ ctrness_targets = self.compute_ctrness_targets(anchors, gt_boxes) # (M, R)
230
+ pred_centerness = torch.cat(pred_centerness, dim=1).squeeze(dim=2) # (M, R)
231
+ ctrness_loss = F.binary_cross_entropy_with_logits(
232
+ pred_centerness[pos_mask], ctrness_targets[pos_mask], reduction="sum"
233
+ )
234
+ return {
235
+ "loss_fcos_cls": loss_cls / normalizer,
236
+ "loss_fcos_loc": loss_box_reg / normalizer,
237
+ "loss_fcos_ctr": ctrness_loss / normalizer,
238
+ }
239
+
240
+ def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]):
241
+ anchors = Boxes.cat(anchors).tensor # Rx4
242
+ reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes]
243
+ reg_targets = torch.stack(reg_targets, dim=0) # NxRx4
244
+ if len(reg_targets) == 0:
245
+ return reg_targets.new_zeros(len(reg_targets))
246
+ left_right = reg_targets[:, :, [0, 2]]
247
+ top_bottom = reg_targets[:, :, [1, 3]]
248
+ ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
249
+ top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
250
+ )
251
+ return torch.sqrt(ctrness)
252
+
253
+ def forward_inference(
254
+ self,
255
+ images: ImageList,
256
+ features: List[torch.Tensor],
257
+ predictions: List[List[torch.Tensor]],
258
+ ):
259
+ pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
260
+ predictions, [self.num_classes, 4, 1]
261
+ )
262
+ anchors = self.anchor_generator(features)
263
+
264
+ results: List[Instances] = []
265
+ for img_idx, image_size in enumerate(images.image_sizes):
266
+ scores_per_image = [
267
+ # Multiply and sqrt centerness & classification scores
268
+ # (See eqn. 4 in https://arxiv.org/abs/2006.09214)
269
+ torch.sqrt(x[img_idx].sigmoid_() * y[img_idx].sigmoid_())
270
+ for x, y in zip(pred_logits, pred_centerness)
271
+ ]
272
+ deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
273
+ results_per_image = self.inference_single_image(
274
+ anchors, scores_per_image, deltas_per_image, image_size
275
+ )
276
+ results.append(results_per_image)
277
+ return results
278
+
279
+ def inference_single_image(
280
+ self,
281
+ anchors: List[Boxes],
282
+ box_cls: List[torch.Tensor],
283
+ box_delta: List[torch.Tensor],
284
+ image_size: Tuple[int, int],
285
+ ):
286
+ """
287
+ Identical to :meth:`RetinaNet.inference_single_image`.
288
+ """
289
+ pred = self._decode_multi_level_predictions(
290
+ anchors,
291
+ box_cls,
292
+ box_delta,
293
+ self.test_score_thresh,
294
+ self.test_topk_candidates,
295
+ image_size,
296
+ )
297
+ keep = batched_nms(
298
+ pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
299
+ )
300
+ return pred[keep[: self.max_detections_per_image]]
301
+
302
+
303
+ class FCOSHead(RetinaNetHead):
304
+ """
305
+ The head used in :paper:`fcos`. It adds an additional centerness
306
+ prediction branch on top of :class:`RetinaNetHead`.
307
+ """
308
+
309
+ def __init__(self, *, input_shape: List[ShapeSpec], conv_dims: List[int], **kwargs):
310
+ super().__init__(input_shape=input_shape, conv_dims=conv_dims, num_anchors=1, **kwargs)
311
+ # Unlike original FCOS, we do not add an additional learnable scale layer
312
+ # because it's found to have no benefits after normalizing regression targets by stride.
313
+ self._num_features = len(input_shape)
314
+ self.ctrness = nn.Conv2d(conv_dims[-1], 1, kernel_size=3, stride=1, padding=1)
315
+ torch.nn.init.normal_(self.ctrness.weight, std=0.01)
316
+ torch.nn.init.constant_(self.ctrness.bias, 0)
317
+
318
+ def forward(self, features):
319
+ assert len(features) == self._num_features
320
+ logits = []
321
+ bbox_reg = []
322
+ ctrness = []
323
+ for feature in features:
324
+ logits.append(self.cls_score(self.cls_subnet(feature)))
325
+ bbox_feature = self.bbox_subnet(feature)
326
+ bbox_reg.append(self.bbox_pred(bbox_feature))
327
+ ctrness.append(self.ctrness(bbox_feature))
328
+ return logits, bbox_reg, ctrness
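The centerness target used in `compute_ctrness_targets` above is sqrt((min(l,r)/max(l,r)) * (min(t,b)/max(t,b))); a tiny numeric sketch with hand-picked (l, t, r, b) regression targets:

import torch

reg = torch.tensor([[[2.0, 8.0, 6.0, 4.0]]])   # (N=1, R=1, 4): l=2, t=8, r=6, b=4
lr, tb = reg[:, :, [0, 2]], reg[:, :, [1, 3]]
ctrness = torch.sqrt((lr.min(dim=-1).values / lr.max(dim=-1).values)
                     * (tb.min(dim=-1).values / tb.max(dim=-1).values))
print(ctrness)   # tensor([[0.4082]]) == sqrt((2/6) * (4/8))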
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/panoptic_fpn.py ADDED
@@ -0,0 +1,269 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+
4
+ import logging
5
+ from typing import Dict, List
6
+ import torch
7
+ from torch import nn
8
+
9
+ from annotator.oneformer.detectron2.config import configurable
10
+ from annotator.oneformer.detectron2.structures import ImageList
11
+
12
+ from ..postprocessing import detector_postprocess, sem_seg_postprocess
13
+ from .build import META_ARCH_REGISTRY
14
+ from .rcnn import GeneralizedRCNN
15
+ from .semantic_seg import build_sem_seg_head
16
+
17
+ __all__ = ["PanopticFPN"]
18
+
19
+
20
+ @META_ARCH_REGISTRY.register()
21
+ class PanopticFPN(GeneralizedRCNN):
22
+ """
23
+ Implement the paper :paper:`PanopticFPN`.
24
+ """
25
+
26
+ @configurable
27
+ def __init__(
28
+ self,
29
+ *,
30
+ sem_seg_head: nn.Module,
31
+ combine_overlap_thresh: float = 0.5,
32
+ combine_stuff_area_thresh: float = 4096,
33
+ combine_instances_score_thresh: float = 0.5,
34
+ **kwargs,
35
+ ):
36
+ """
37
+ NOTE: this interface is experimental.
38
+
39
+ Args:
40
+ sem_seg_head: a module for the semantic segmentation head.
41
+ combine_overlap_thresh: combine masks into one instance if
42
+ they have enough overlap
43
+ combine_stuff_area_thresh: ignore stuff areas smaller than this threshold
44
+ combine_instances_score_thresh: ignore instances whose score is
45
+ smaller than this threshold
46
+
47
+ Other arguments are the same as :class:`GeneralizedRCNN`.
48
+ """
49
+ super().__init__(**kwargs)
50
+ self.sem_seg_head = sem_seg_head
51
+ # options when combining instance & semantic outputs
52
+ self.combine_overlap_thresh = combine_overlap_thresh
53
+ self.combine_stuff_area_thresh = combine_stuff_area_thresh
54
+ self.combine_instances_score_thresh = combine_instances_score_thresh
55
+
56
+ @classmethod
57
+ def from_config(cls, cfg):
58
+ ret = super().from_config(cfg)
59
+ ret.update(
60
+ {
61
+ "combine_overlap_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH,
62
+ "combine_stuff_area_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT,
63
+ "combine_instances_score_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH, # noqa
64
+ }
65
+ )
66
+ ret["sem_seg_head"] = build_sem_seg_head(cfg, ret["backbone"].output_shape())
67
+ logger = logging.getLogger(__name__)
68
+ if not cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED:
69
+ logger.warning(
70
+ "PANOPTIC_FPN.COMBINED.ENABLED is no longer used. "
71
+ " model.inference(do_postprocess=) should be used to toggle postprocessing."
72
+ )
73
+ if cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT != 1.0:
74
+ w = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
75
+ logger.warning(
76
+ "PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT should be replaced by weights on each ROI head."
77
+ )
78
+
79
+ def update_weight(x):
80
+ if isinstance(x, dict):
81
+ return {k: v * w for k, v in x.items()}
82
+ else:
83
+ return x * w
84
+
85
+ roi_heads = ret["roi_heads"]
86
+ roi_heads.box_predictor.loss_weight = update_weight(roi_heads.box_predictor.loss_weight)
87
+ roi_heads.mask_head.loss_weight = update_weight(roi_heads.mask_head.loss_weight)
88
+ return ret
89
+
90
+ def forward(self, batched_inputs):
91
+ """
92
+ Args:
93
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
94
+ Each item in the list contains the inputs for one image.
95
+
96
+ For now, each item in the list is a dict that contains:
97
+
98
+ * "image": Tensor, image in (C, H, W) format.
99
+ * "instances": Instances
100
+ * "sem_seg": semantic segmentation ground truth.
101
+ * Other information that's included in the original dicts, such as:
102
+ "height", "width" (int): the output resolution of the model, used in inference.
103
+ See :meth:`postprocess` for details.
104
+
105
+ Returns:
106
+ list[dict]:
107
+ each dict has the results for one image. The dict contains the following keys:
108
+
109
+ * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
110
+ * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
111
+ * "panoptic_seg": See the return value of
112
+ :func:`combine_semantic_and_instance_outputs` for its format.
113
+ """
114
+ if not self.training:
115
+ return self.inference(batched_inputs)
116
+ images = self.preprocess_image(batched_inputs)
117
+ features = self.backbone(images.tensor)
118
+
119
+ assert "sem_seg" in batched_inputs[0]
120
+ gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
121
+ gt_sem_seg = ImageList.from_tensors(
122
+ gt_sem_seg,
123
+ self.backbone.size_divisibility,
124
+ self.sem_seg_head.ignore_value,
125
+ self.backbone.padding_constraints,
126
+ ).tensor
127
+ sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)
128
+
129
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
130
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
131
+ detector_results, detector_losses = self.roi_heads(
132
+ images, features, proposals, gt_instances
133
+ )
134
+
135
+ losses = sem_seg_losses
136
+ losses.update(proposal_losses)
137
+ losses.update(detector_losses)
138
+ return losses
139
+
140
+ def inference(self, batched_inputs: List[Dict[str, torch.Tensor]], do_postprocess: bool = True):
141
+ """
142
+ Run inference on the given inputs.
143
+
144
+ Args:
145
+ batched_inputs (list[dict]): same as in :meth:`forward`
146
+ do_postprocess (bool): whether to apply post-processing on the outputs.
147
+
148
+ Returns:
149
+ When do_postprocess=True, see docs in :meth:`forward`.
150
+ Otherwise, returns a (list[Instances], list[Tensor]) that contains
151
+ the raw detector outputs, and raw semantic segmentation outputs.
152
+ """
153
+ images = self.preprocess_image(batched_inputs)
154
+ features = self.backbone(images.tensor)
155
+ sem_seg_results, sem_seg_losses = self.sem_seg_head(features, None)
156
+ proposals, _ = self.proposal_generator(images, features, None)
157
+ detector_results, _ = self.roi_heads(images, features, proposals, None)
158
+
159
+ if do_postprocess:
160
+ processed_results = []
161
+ for sem_seg_result, detector_result, input_per_image, image_size in zip(
162
+ sem_seg_results, detector_results, batched_inputs, images.image_sizes
163
+ ):
164
+ height = input_per_image.get("height", image_size[0])
165
+ width = input_per_image.get("width", image_size[1])
166
+ sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
167
+ detector_r = detector_postprocess(detector_result, height, width)
168
+
169
+ processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
170
+
171
+ panoptic_r = combine_semantic_and_instance_outputs(
172
+ detector_r,
173
+ sem_seg_r.argmax(dim=0),
174
+ self.combine_overlap_thresh,
175
+ self.combine_stuff_area_thresh,
176
+ self.combine_instances_score_thresh,
177
+ )
178
+ processed_results[-1]["panoptic_seg"] = panoptic_r
179
+ return processed_results
180
+ else:
181
+ return detector_results, sem_seg_results
182
+
183
+
184
+ def combine_semantic_and_instance_outputs(
185
+ instance_results,
186
+ semantic_results,
187
+ overlap_threshold,
188
+ stuff_area_thresh,
189
+ instances_score_thresh,
190
+ ):
191
+ """
192
+ Implement a simple combining logic following
193
+ "combine_semantic_and_instance_predictions.py" in panopticapi
194
+ to produce panoptic segmentation outputs.
195
+
196
+ Args:
197
+ instance_results: output of :func:`detector_postprocess`.
198
+ semantic_results: an (H, W) tensor, each element is the contiguous semantic
199
+ category id
200
+
201
+ Returns:
202
+ panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
203
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`.
204
+ Each dict contains keys "id", "category_id", "isthing".
205
+ """
206
+ panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)
207
+
208
+ # sort instance outputs by scores
209
+ sorted_inds = torch.argsort(-instance_results.scores)
210
+
211
+ current_segment_id = 0
212
+ segments_info = []
213
+
214
+ instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)
215
+
216
+ # Add instances one-by-one, check for overlaps with existing ones
217
+ for inst_id in sorted_inds:
218
+ score = instance_results.scores[inst_id].item()
219
+ if score < instances_score_thresh:
220
+ break
221
+ mask = instance_masks[inst_id] # H,W
222
+ mask_area = mask.sum().item()
223
+
224
+ if mask_area == 0:
225
+ continue
226
+
227
+ intersect = (mask > 0) & (panoptic_seg > 0)
228
+ intersect_area = intersect.sum().item()
229
+
230
+ if intersect_area * 1.0 / mask_area > overlap_threshold:
231
+ continue
232
+
233
+ if intersect_area > 0:
234
+ mask = mask & (panoptic_seg == 0)
235
+
236
+ current_segment_id += 1
237
+ panoptic_seg[mask] = current_segment_id
238
+ segments_info.append(
239
+ {
240
+ "id": current_segment_id,
241
+ "isthing": True,
242
+ "score": score,
243
+ "category_id": instance_results.pred_classes[inst_id].item(),
244
+ "instance_id": inst_id.item(),
245
+ }
246
+ )
247
+
248
+ # Add semantic results to remaining empty areas
249
+ semantic_labels = torch.unique(semantic_results).cpu().tolist()
250
+ for semantic_label in semantic_labels:
251
+ if semantic_label == 0: # 0 is a special "thing" class
252
+ continue
253
+ mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
254
+ mask_area = mask.sum().item()
255
+ if mask_area < stuff_area_thresh:
256
+ continue
257
+
258
+ current_segment_id += 1
259
+ panoptic_seg[mask] = current_segment_id
260
+ segments_info.append(
261
+ {
262
+ "id": current_segment_id,
263
+ "isthing": False,
264
+ "category_id": semantic_label,
265
+ "area": mask_area,
266
+ }
267
+ )
268
+
269
+ return panoptic_seg, segments_info
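A minimal sketch of how the combining step above can be exercised on its own, assuming `combine_semantic_and_instance_outputs` is imported from this panoptic_fpn module and `Instances` from `annotator.oneformer.detectron2.structures`; the tensors and threshold values below are illustrative, not config defaults:

    import torch
    from annotator.oneformer.detectron2.structures import Instances

    # Two made-up instances on a 4x4 image; masks, scores and classes are arbitrary.
    inst = Instances((4, 4))
    inst.scores = torch.tensor([0.9, 0.8])
    inst.pred_classes = torch.tensor([0, 1])
    inst.pred_masks = torch.zeros(2, 4, 4, dtype=torch.bool)
    inst.pred_masks[0, :2, :2] = True
    inst.pred_masks[1, 2:, 2:] = True
    sem_seg = torch.ones(4, 4, dtype=torch.long)  # every pixel labeled as stuff class 1

    panoptic_seg, segments_info = combine_semantic_and_instance_outputs(
        inst,
        sem_seg,
        overlap_threshold=0.5,
        stuff_area_thresh=1,
        instances_score_thresh=0.5,
    )
    print(panoptic_seg.shape, len(segments_info))  # torch.Size([4, 4]) 3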
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/rcnn.py ADDED
@@ -0,0 +1,341 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import numpy as np
4
+ from typing import Dict, List, Optional, Tuple
5
+ import torch
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.data.detection_utils import convert_image_to_rgb
10
+ from annotator.oneformer.detectron2.layers import move_device_like
11
+ from annotator.oneformer.detectron2.structures import ImageList, Instances
12
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
13
+ from annotator.oneformer.detectron2.utils.logger import log_first_n
14
+
15
+ from ..backbone import Backbone, build_backbone
16
+ from ..postprocessing import detector_postprocess
17
+ from ..proposal_generator import build_proposal_generator
18
+ from ..roi_heads import build_roi_heads
19
+ from .build import META_ARCH_REGISTRY
20
+
21
+ __all__ = ["GeneralizedRCNN", "ProposalNetwork"]
22
+
23
+
24
+ @META_ARCH_REGISTRY.register()
25
+ class GeneralizedRCNN(nn.Module):
26
+ """
27
+ Generalized R-CNN. Any model that contains the following three components:
28
+ 1. Per-image feature extraction (aka backbone)
29
+ 2. Region proposal generation
30
+ 3. Per-region feature extraction and prediction
31
+ """
32
+
33
+ @configurable
34
+ def __init__(
35
+ self,
36
+ *,
37
+ backbone: Backbone,
38
+ proposal_generator: nn.Module,
39
+ roi_heads: nn.Module,
40
+ pixel_mean: Tuple[float],
41
+ pixel_std: Tuple[float],
42
+ input_format: Optional[str] = None,
43
+ vis_period: int = 0,
44
+ ):
45
+ """
46
+ Args:
47
+ backbone: a backbone module, must follow detectron2's backbone interface
48
+ proposal_generator: a module that generates proposals using backbone features
49
+ roi_heads: a ROI head that performs per-region computation
50
+ pixel_mean, pixel_std: list or tuple with #channels elements, representing
51
+ the per-channel mean and std to be used to normalize the input image
52
+ input_format: describe the meaning of channels of input. Needed by visualization
53
+ vis_period: the period to run visualization. Set to 0 to disable.
54
+ """
55
+ super().__init__()
56
+ self.backbone = backbone
57
+ self.proposal_generator = proposal_generator
58
+ self.roi_heads = roi_heads
59
+
60
+ self.input_format = input_format
61
+ self.vis_period = vis_period
62
+ if vis_period > 0:
63
+ assert input_format is not None, "input_format is required for visualization!"
64
+
65
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
66
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
67
+ assert (
68
+ self.pixel_mean.shape == self.pixel_std.shape
69
+ ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
70
+
71
+ @classmethod
72
+ def from_config(cls, cfg):
73
+ backbone = build_backbone(cfg)
74
+ return {
75
+ "backbone": backbone,
76
+ "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
77
+ "roi_heads": build_roi_heads(cfg, backbone.output_shape()),
78
+ "input_format": cfg.INPUT.FORMAT,
79
+ "vis_period": cfg.VIS_PERIOD,
80
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
81
+ "pixel_std": cfg.MODEL.PIXEL_STD,
82
+ }
83
+
84
+ @property
85
+ def device(self):
86
+ return self.pixel_mean.device
87
+
88
+ def _move_to_current_device(self, x):
89
+ return move_device_like(x, self.pixel_mean)
90
+
91
+ def visualize_training(self, batched_inputs, proposals):
92
+ """
93
+ A function used to visualize images and proposals. It shows ground truth
94
+ bounding boxes on the original image and up to 20 top-scoring predicted
95
+ object proposals on the same image. Users can implement different
96
+ visualization functions for different models.
97
+
98
+ Args:
99
+ batched_inputs (list): a list that contains input to the model.
100
+ proposals (list): a list that contains predicted proposals. Both
101
+ batched_inputs and proposals should have the same length.
102
+ """
103
+ from annotator.oneformer.detectron2.utils.visualizer import Visualizer
104
+
105
+ storage = get_event_storage()
106
+ max_vis_prop = 20
107
+
108
+ for input, prop in zip(batched_inputs, proposals):
109
+ img = input["image"]
110
+ img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
111
+ v_gt = Visualizer(img, None)
112
+ v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
113
+ anno_img = v_gt.get_image()
114
+ box_size = min(len(prop.proposal_boxes), max_vis_prop)
115
+ v_pred = Visualizer(img, None)
116
+ v_pred = v_pred.overlay_instances(
117
+ boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
118
+ )
119
+ prop_img = v_pred.get_image()
120
+ vis_img = np.concatenate((anno_img, prop_img), axis=1)
121
+ vis_img = vis_img.transpose(2, 0, 1)
122
+ vis_name = "Left: GT bounding boxes; Right: Predicted proposals"
123
+ storage.put_image(vis_name, vis_img)
124
+ break # only visualize one image in a batch
125
+
126
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
127
+ """
128
+ Args:
129
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
130
+ Each item in the list contains the inputs for one image.
131
+ For now, each item in the list is a dict that contains:
132
+
133
+ * image: Tensor, image in (C, H, W) format.
134
+ * instances (optional): groundtruth :class:`Instances`
135
+ * proposals (optional): :class:`Instances`, precomputed proposals.
136
+
137
+ Other information that's included in the original dicts, such as:
138
+
139
+ * "height", "width" (int): the output resolution of the model, used in inference.
140
+ See :meth:`postprocess` for details.
141
+
142
+ Returns:
143
+ list[dict]:
144
+ Each dict is the output for one input image.
145
+ The dict contains one key "instances" whose value is a :class:`Instances`.
146
+ The :class:`Instances` object has the following keys:
147
+ "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
148
+ """
149
+ if not self.training:
150
+ return self.inference(batched_inputs)
151
+
152
+ images = self.preprocess_image(batched_inputs)
153
+ if "instances" in batched_inputs[0]:
154
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
155
+ else:
156
+ gt_instances = None
157
+
158
+ features = self.backbone(images.tensor)
159
+
160
+ if self.proposal_generator is not None:
161
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
162
+ else:
163
+ assert "proposals" in batched_inputs[0]
164
+ proposals = [x["proposals"].to(self.device) for x in batched_inputs]
165
+ proposal_losses = {}
166
+
167
+ _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
168
+ if self.vis_period > 0:
169
+ storage = get_event_storage()
170
+ if storage.iter % self.vis_period == 0:
171
+ self.visualize_training(batched_inputs, proposals)
172
+
173
+ losses = {}
174
+ losses.update(detector_losses)
175
+ losses.update(proposal_losses)
176
+ return losses
177
+
178
+ def inference(
179
+ self,
180
+ batched_inputs: List[Dict[str, torch.Tensor]],
181
+ detected_instances: Optional[List[Instances]] = None,
182
+ do_postprocess: bool = True,
183
+ ):
184
+ """
185
+ Run inference on the given inputs.
186
+
187
+ Args:
188
+ batched_inputs (list[dict]): same as in :meth:`forward`
189
+ detected_instances (None or list[Instances]): if not None, it
190
+ contains an `Instances` object per image. The `Instances`
191
+ object contains "pred_boxes" and "pred_classes" which are
192
+ known boxes in the image.
193
+ The inference will then skip the detection of bounding boxes,
194
+ and only predict other per-ROI outputs.
195
+ do_postprocess (bool): whether to apply post-processing on the outputs.
196
+
197
+ Returns:
198
+ When do_postprocess=True, same as in :meth:`forward`.
199
+ Otherwise, a list[Instances] containing raw network outputs.
200
+ """
201
+ assert not self.training
202
+
203
+ images = self.preprocess_image(batched_inputs)
204
+ features = self.backbone(images.tensor)
205
+
206
+ if detected_instances is None:
207
+ if self.proposal_generator is not None:
208
+ proposals, _ = self.proposal_generator(images, features, None)
209
+ else:
210
+ assert "proposals" in batched_inputs[0]
211
+ proposals = [x["proposals"].to(self.device) for x in batched_inputs]
212
+
213
+ results, _ = self.roi_heads(images, features, proposals, None)
214
+ else:
215
+ detected_instances = [x.to(self.device) for x in detected_instances]
216
+ results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
217
+
218
+ if do_postprocess:
219
+ assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
220
+ return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
221
+ return results
222
+
223
+ def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]):
224
+ """
225
+ Normalize, pad and batch the input images.
226
+ """
227
+ images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
228
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
229
+ images = ImageList.from_tensors(
230
+ images,
231
+ self.backbone.size_divisibility,
232
+ padding_constraints=self.backbone.padding_constraints,
233
+ )
234
+ return images
235
+
236
+ @staticmethod
237
+ def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes):
238
+ """
239
+ Rescale the output instances to the target size.
240
+ """
241
+ # note: private function; subject to changes
242
+ processed_results = []
243
+ for results_per_image, input_per_image, image_size in zip(
244
+ instances, batched_inputs, image_sizes
245
+ ):
246
+ height = input_per_image.get("height", image_size[0])
247
+ width = input_per_image.get("width", image_size[1])
248
+ r = detector_postprocess(results_per_image, height, width)
249
+ processed_results.append({"instances": r})
250
+ return processed_results
251
+
252
+
253
+ @META_ARCH_REGISTRY.register()
254
+ class ProposalNetwork(nn.Module):
255
+ """
256
+ A meta architecture that only predicts object proposals.
257
+ """
258
+
259
+ @configurable
260
+ def __init__(
261
+ self,
262
+ *,
263
+ backbone: Backbone,
264
+ proposal_generator: nn.Module,
265
+ pixel_mean: Tuple[float],
266
+ pixel_std: Tuple[float],
267
+ ):
268
+ """
269
+ Args:
270
+ backbone: a backbone module, must follow detectron2's backbone interface
271
+ proposal_generator: a module that generates proposals using backbone features
272
+ pixel_mean, pixel_std: list or tuple with #channels elements, representing
273
+ the per-channel mean and std to be used to normalize the input image
274
+ """
275
+ super().__init__()
276
+ self.backbone = backbone
277
+ self.proposal_generator = proposal_generator
278
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
279
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
280
+
281
+ @classmethod
282
+ def from_config(cls, cfg):
283
+ backbone = build_backbone(cfg)
284
+ return {
285
+ "backbone": backbone,
286
+ "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
287
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
288
+ "pixel_std": cfg.MODEL.PIXEL_STD,
289
+ }
290
+
291
+ @property
292
+ def device(self):
293
+ return self.pixel_mean.device
294
+
295
+ def _move_to_current_device(self, x):
296
+ return move_device_like(x, self.pixel_mean)
297
+
298
+ def forward(self, batched_inputs):
299
+ """
300
+ Args:
301
+ Same as in :class:`GeneralizedRCNN.forward`
302
+
303
+ Returns:
304
+ list[dict]:
305
+ Each dict is the output for one input image.
306
+ The dict contains one key "proposals" whose value is a
307
+ :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
308
+ """
309
+ images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
310
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
311
+ images = ImageList.from_tensors(
312
+ images,
313
+ self.backbone.size_divisibility,
314
+ padding_constraints=self.backbone.padding_constraints,
315
+ )
316
+ features = self.backbone(images.tensor)
317
+
318
+ if "instances" in batched_inputs[0]:
319
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
320
+ elif "targets" in batched_inputs[0]:
321
+ log_first_n(
322
+ logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
323
+ )
324
+ gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
325
+ else:
326
+ gt_instances = None
327
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
328
+ # In training, the proposals are not useful at all but we generate them anyway.
329
+ # This makes RPN-only models about 5% slower.
330
+ if self.training:
331
+ return proposal_losses
332
+
333
+ processed_results = []
334
+ for results_per_image, input_per_image, image_size in zip(
335
+ proposals, batched_inputs, images.image_sizes
336
+ ):
337
+ height = input_per_image.get("height", image_size[0])
338
+ width = input_per_image.get("width", image_size[1])
339
+ r = detector_postprocess(results_per_image, height, width)
340
+ processed_results.append({"proposals": r})
341
+ return processed_results
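The input contract described in `GeneralizedRCNN.forward` can be summarized with a short, hedged sketch; `model` here stands for a `GeneralizedRCNN` built elsewhere (for example with `build_model(cfg)`) and switched to eval mode, which is assumed rather than shown:

    import torch

    # One image in (C, H, W) layout; "height"/"width" request the output resolution.
    image = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
    batched_inputs = [{"image": image, "height": 480, "width": 640}]

    with torch.no_grad():
        outputs = model(batched_inputs)  # in eval mode this dispatches to model.inference(...)
    print(outputs[0]["instances"].pred_boxes)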
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/retinanet.py ADDED
@@ -0,0 +1,439 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import math
4
+ from typing import List, Tuple
5
+ import torch
6
+ from fvcore.nn import sigmoid_focal_loss_jit
7
+ from torch import Tensor, nn
8
+ from torch.nn import functional as F
9
+
10
+ from annotator.oneformer.detectron2.config import configurable
11
+ from annotator.oneformer.detectron2.layers import CycleBatchNormList, ShapeSpec, batched_nms, cat, get_norm
12
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
13
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
14
+
15
+ from ..anchor_generator import build_anchor_generator
16
+ from ..backbone import Backbone, build_backbone
17
+ from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
18
+ from ..matcher import Matcher
19
+ from .build import META_ARCH_REGISTRY
20
+ from .dense_detector import DenseDetector, permute_to_N_HWA_K # noqa
21
+
22
+ __all__ = ["RetinaNet"]
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ @META_ARCH_REGISTRY.register()
29
+ class RetinaNet(DenseDetector):
30
+ """
31
+ Implement RetinaNet in :paper:`RetinaNet`.
32
+ """
33
+
34
+ @configurable
35
+ def __init__(
36
+ self,
37
+ *,
38
+ backbone: Backbone,
39
+ head: nn.Module,
40
+ head_in_features,
41
+ anchor_generator,
42
+ box2box_transform,
43
+ anchor_matcher,
44
+ num_classes,
45
+ focal_loss_alpha=0.25,
46
+ focal_loss_gamma=2.0,
47
+ smooth_l1_beta=0.0,
48
+ box_reg_loss_type="smooth_l1",
49
+ test_score_thresh=0.05,
50
+ test_topk_candidates=1000,
51
+ test_nms_thresh=0.5,
52
+ max_detections_per_image=100,
53
+ pixel_mean,
54
+ pixel_std,
55
+ vis_period=0,
56
+ input_format="BGR",
57
+ ):
58
+ """
59
+ NOTE: this interface is experimental.
60
+
61
+ Args:
62
+ backbone: a backbone module, must follow detectron2's backbone interface
63
+ head (nn.Module): a module that predicts logits and regression deltas
64
+ for each level from a list of per-level features
65
+ head_in_features (Tuple[str]): Names of the input feature maps to be used in head
66
+ anchor_generator (nn.Module): a module that creates anchors from a
67
+ list of features. Usually an instance of :class:`AnchorGenerator`
68
+ box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to
69
+ instance boxes
70
+ anchor_matcher (Matcher): label the anchors by matching them with ground truth.
71
+ num_classes (int): number of classes. Used to label background proposals.
72
+
73
+ # Loss parameters:
74
+ focal_loss_alpha (float): focal_loss_alpha
75
+ focal_loss_gamma (float): focal_loss_gamma
76
+ smooth_l1_beta (float): smooth_l1_beta
77
+ box_reg_loss_type (str): Options are "smooth_l1", "giou", "diou", "ciou"
78
+
79
+ # Inference parameters:
80
+ test_score_thresh (float): Inference cls score threshold, only anchors with
81
+ score > INFERENCE_TH are considered for inference (to improve speed)
82
+ test_topk_candidates (int): Select topk candidates before NMS
83
+ test_nms_thresh (float): Overlap threshold used for non-maximum suppression
84
+ (suppress boxes with IoU >= this threshold)
85
+ max_detections_per_image (int):
86
+ Maximum number of detections to return per image during inference
87
+ (100 is based on the limit established for the COCO dataset).
88
+
89
+ pixel_mean, pixel_std: see :class:`DenseDetector`.
90
+ """
91
+ super().__init__(
92
+ backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
93
+ )
94
+ self.num_classes = num_classes
95
+
96
+ # Anchors
97
+ self.anchor_generator = anchor_generator
98
+ self.box2box_transform = box2box_transform
99
+ self.anchor_matcher = anchor_matcher
100
+
101
+ # Loss parameters:
102
+ self.focal_loss_alpha = focal_loss_alpha
103
+ self.focal_loss_gamma = focal_loss_gamma
104
+ self.smooth_l1_beta = smooth_l1_beta
105
+ self.box_reg_loss_type = box_reg_loss_type
106
+ # Inference parameters:
107
+ self.test_score_thresh = test_score_thresh
108
+ self.test_topk_candidates = test_topk_candidates
109
+ self.test_nms_thresh = test_nms_thresh
110
+ self.max_detections_per_image = max_detections_per_image
111
+ # Vis parameters
112
+ self.vis_period = vis_period
113
+ self.input_format = input_format
114
+
115
+ @classmethod
116
+ def from_config(cls, cfg):
117
+ backbone = build_backbone(cfg)
118
+ backbone_shape = backbone.output_shape()
119
+ feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES]
120
+ head = RetinaNetHead(cfg, feature_shapes)
121
+ anchor_generator = build_anchor_generator(cfg, feature_shapes)
122
+ return {
123
+ "backbone": backbone,
124
+ "head": head,
125
+ "anchor_generator": anchor_generator,
126
+ "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS),
127
+ "anchor_matcher": Matcher(
128
+ cfg.MODEL.RETINANET.IOU_THRESHOLDS,
129
+ cfg.MODEL.RETINANET.IOU_LABELS,
130
+ allow_low_quality_matches=True,
131
+ ),
132
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
133
+ "pixel_std": cfg.MODEL.PIXEL_STD,
134
+ "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
135
+ "head_in_features": cfg.MODEL.RETINANET.IN_FEATURES,
136
+ # Loss parameters:
137
+ "focal_loss_alpha": cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA,
138
+ "focal_loss_gamma": cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA,
139
+ "smooth_l1_beta": cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA,
140
+ "box_reg_loss_type": cfg.MODEL.RETINANET.BBOX_REG_LOSS_TYPE,
141
+ # Inference parameters:
142
+ "test_score_thresh": cfg.MODEL.RETINANET.SCORE_THRESH_TEST,
143
+ "test_topk_candidates": cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST,
144
+ "test_nms_thresh": cfg.MODEL.RETINANET.NMS_THRESH_TEST,
145
+ "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
146
+ # Vis parameters
147
+ "vis_period": cfg.VIS_PERIOD,
148
+ "input_format": cfg.INPUT.FORMAT,
149
+ }
150
+
151
+ def forward_training(self, images, features, predictions, gt_instances):
152
+ # Transpose the Hi*Wi*A dimension to the middle:
153
+ pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
154
+ predictions, [self.num_classes, 4]
155
+ )
156
+ anchors = self.anchor_generator(features)
157
+ gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
158
+ return self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes)
159
+
160
+ def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
161
+ """
162
+ Args:
163
+ anchors (list[Boxes]): a list of #feature level Boxes
164
+ gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
165
+ Their shapes are (N, R) and (N, R, 4), respectively, where R is
166
+ the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
167
+ pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
168
+ list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
169
+ Where K is the number of classes used in `pred_logits`.
170
+
171
+ Returns:
172
+ dict[str, Tensor]:
173
+ mapping from a named loss to a scalar tensor storing the loss.
174
+ Used during training only. The dict keys are: "loss_cls" and "loss_box_reg"
175
+ """
176
+ num_images = len(gt_labels)
177
+ gt_labels = torch.stack(gt_labels) # (N, R)
178
+
179
+ valid_mask = gt_labels >= 0
180
+ pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
181
+ num_pos_anchors = pos_mask.sum().item()
182
+ get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
183
+ normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 100)
184
+
185
+ # classification and regression loss
186
+ gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
187
+ :, :-1
188
+ ] # no loss for the last (background) class
189
+ loss_cls = sigmoid_focal_loss_jit(
190
+ cat(pred_logits, dim=1)[valid_mask],
191
+ gt_labels_target.to(pred_logits[0].dtype),
192
+ alpha=self.focal_loss_alpha,
193
+ gamma=self.focal_loss_gamma,
194
+ reduction="sum",
195
+ )
196
+
197
+ loss_box_reg = _dense_box_regression_loss(
198
+ anchors,
199
+ self.box2box_transform,
200
+ pred_anchor_deltas,
201
+ gt_boxes,
202
+ pos_mask,
203
+ box_reg_loss_type=self.box_reg_loss_type,
204
+ smooth_l1_beta=self.smooth_l1_beta,
205
+ )
206
+
207
+ return {
208
+ "loss_cls": loss_cls / normalizer,
209
+ "loss_box_reg": loss_box_reg / normalizer,
210
+ }
211
+
212
+ @torch.no_grad()
213
+ def label_anchors(self, anchors, gt_instances):
214
+ """
215
+ Args:
216
+ anchors (list[Boxes]): A list of #feature level Boxes.
217
+ The Boxes contains anchors of this image on the specific feature level.
218
+ gt_instances (list[Instances]): a list of N `Instances`s. The i-th
219
+ `Instances` contains the ground-truth per-instance annotations
220
+ for the i-th input image.
221
+
222
+ Returns:
223
+ list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is
224
+ the total number of anchors across all feature maps (sum(Hi * Wi * A)).
225
+ Label values are in {-1, 0, ..., K}, where -1 means ignore and K means background.
226
+
227
+ list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of anchors
228
+ across feature maps. The values are the matched gt boxes for each anchor.
229
+ Values are undefined for those anchors not labeled as foreground.
230
+ """
231
+ anchors = Boxes.cat(anchors) # Rx4
232
+
233
+ gt_labels = []
234
+ matched_gt_boxes = []
235
+ for gt_per_image in gt_instances:
236
+ match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
237
+ matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
238
+ del match_quality_matrix
239
+
240
+ if len(gt_per_image) > 0:
241
+ matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]
242
+
243
+ gt_labels_i = gt_per_image.gt_classes[matched_idxs]
244
+ # Anchors with label 0 are treated as background.
245
+ gt_labels_i[anchor_labels == 0] = self.num_classes
246
+ # Anchors with label -1 are ignored.
247
+ gt_labels_i[anchor_labels == -1] = -1
248
+ else:
249
+ matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
250
+ gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
251
+
252
+ gt_labels.append(gt_labels_i)
253
+ matched_gt_boxes.append(matched_gt_boxes_i)
254
+
255
+ return gt_labels, matched_gt_boxes
256
+
257
+ def forward_inference(
258
+ self, images: ImageList, features: List[Tensor], predictions: List[List[Tensor]]
259
+ ):
260
+ pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
261
+ predictions, [self.num_classes, 4]
262
+ )
263
+ anchors = self.anchor_generator(features)
264
+
265
+ results: List[Instances] = []
266
+ for img_idx, image_size in enumerate(images.image_sizes):
267
+ scores_per_image = [x[img_idx].sigmoid_() for x in pred_logits]
268
+ deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
269
+ results_per_image = self.inference_single_image(
270
+ anchors, scores_per_image, deltas_per_image, image_size
271
+ )
272
+ results.append(results_per_image)
273
+ return results
274
+
275
+ def inference_single_image(
276
+ self,
277
+ anchors: List[Boxes],
278
+ box_cls: List[Tensor],
279
+ box_delta: List[Tensor],
280
+ image_size: Tuple[int, int],
281
+ ):
282
+ """
283
+ Single-image inference. Return bounding-box detection results by thresholding
284
+ on scores and applying non-maximum suppression (NMS).
285
+
286
+ Arguments:
287
+ anchors (list[Boxes]): list of #feature levels. Each entry contains
288
+ a Boxes object, which contains all the anchors in that feature level.
289
+ box_cls (list[Tensor]): list of #feature levels. Each entry contains
290
+ tensor of size (H x W x A, K)
291
+ box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
292
+ image_size (tuple(H, W)): a tuple of the image height and width.
293
+
294
+ Returns:
295
+ Same as `inference`, but for only one image.
296
+ """
297
+ pred = self._decode_multi_level_predictions(
298
+ anchors,
299
+ box_cls,
300
+ box_delta,
301
+ self.test_score_thresh,
302
+ self.test_topk_candidates,
303
+ image_size,
304
+ )
305
+ keep = batched_nms( # per-class NMS
306
+ pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
307
+ )
308
+ return pred[keep[: self.max_detections_per_image]]
309
+
310
+
311
+ class RetinaNetHead(nn.Module):
312
+ """
313
+ The head used in RetinaNet for object classification and box regression.
314
+ It has two subnets for the two tasks, with a common structure but separate parameters.
315
+ """
316
+
317
+ @configurable
318
+ def __init__(
319
+ self,
320
+ *,
321
+ input_shape: List[ShapeSpec],
322
+ num_classes,
323
+ num_anchors,
324
+ conv_dims: List[int],
325
+ norm="",
326
+ prior_prob=0.01,
327
+ ):
328
+ """
329
+ NOTE: this interface is experimental.
330
+
331
+ Args:
332
+ input_shape (List[ShapeSpec]): input shape
333
+ num_classes (int): number of classes. Used to label background proposals.
334
+ num_anchors (int): number of generated anchors
335
+ conv_dims (List[int]): dimensions for each convolution layer
336
+ norm (str or callable):
337
+ Normalization for conv layers except for the two output layers.
338
+ See :func:`detectron2.layers.get_norm` for supported types.
339
+ prior_prob (float): Prior weight for computing bias
340
+ """
341
+ super().__init__()
342
+
343
+ self._num_features = len(input_shape)
344
+ if norm == "BN" or norm == "SyncBN":
345
+ logger.info(
346
+ f"Using domain-specific {norm} in RetinaNetHead with len={self._num_features}."
347
+ )
348
+ bn_class = nn.BatchNorm2d if norm == "BN" else nn.SyncBatchNorm
349
+
350
+ def norm(c):
351
+ return CycleBatchNormList(
352
+ length=self._num_features, bn_class=bn_class, num_features=c
353
+ )
354
+
355
+ else:
356
+ norm_name = str(type(get_norm(norm, 32)))
357
+ if "BN" in norm_name:
358
+ logger.warning(
359
+ f"Shared BatchNorm (type={norm_name}) may not work well in RetinaNetHead."
360
+ )
361
+
362
+ cls_subnet = []
363
+ bbox_subnet = []
364
+ for in_channels, out_channels in zip(
365
+ [input_shape[0].channels] + list(conv_dims), conv_dims
366
+ ):
367
+ cls_subnet.append(
368
+ nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
369
+ )
370
+ if norm:
371
+ cls_subnet.append(get_norm(norm, out_channels))
372
+ cls_subnet.append(nn.ReLU())
373
+ bbox_subnet.append(
374
+ nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
375
+ )
376
+ if norm:
377
+ bbox_subnet.append(get_norm(norm, out_channels))
378
+ bbox_subnet.append(nn.ReLU())
379
+
380
+ self.cls_subnet = nn.Sequential(*cls_subnet)
381
+ self.bbox_subnet = nn.Sequential(*bbox_subnet)
382
+ self.cls_score = nn.Conv2d(
383
+ conv_dims[-1], num_anchors * num_classes, kernel_size=3, stride=1, padding=1
384
+ )
385
+ self.bbox_pred = nn.Conv2d(
386
+ conv_dims[-1], num_anchors * 4, kernel_size=3, stride=1, padding=1
387
+ )
388
+
389
+ # Initialization
390
+ for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
391
+ for layer in modules.modules():
392
+ if isinstance(layer, nn.Conv2d):
393
+ torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
394
+ torch.nn.init.constant_(layer.bias, 0)
395
+
396
+ # Use prior in model initialization to improve stability
397
+ bias_value = -(math.log((1 - prior_prob) / prior_prob))
398
+ torch.nn.init.constant_(self.cls_score.bias, bias_value)
399
+
400
+ @classmethod
401
+ def from_config(cls, cfg, input_shape: List[ShapeSpec]):
402
+ num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
403
+ assert (
404
+ len(set(num_anchors)) == 1
405
+ ), "Using different number of anchors between levels is not currently supported!"
406
+ num_anchors = num_anchors[0]
407
+
408
+ return {
409
+ "input_shape": input_shape,
410
+ "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
411
+ "conv_dims": [input_shape[0].channels] * cfg.MODEL.RETINANET.NUM_CONVS,
412
+ "prior_prob": cfg.MODEL.RETINANET.PRIOR_PROB,
413
+ "norm": cfg.MODEL.RETINANET.NORM,
414
+ "num_anchors": num_anchors,
415
+ }
416
+
417
+ def forward(self, features: List[Tensor]):
418
+ """
419
+ Arguments:
420
+ features (list[Tensor]): FPN feature map tensors in high to low resolution.
421
+ Each tensor in the list correspond to different feature levels.
422
+
423
+ Returns:
424
+ logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
425
+ The tensor predicts the classification probability
426
+ at each spatial position for each of the A anchors and K object
427
+ classes.
428
+ bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
429
+ The tensor predicts 4-vector (dx,dy,dw,dh) box
430
+ regression values for every anchor. These values are the
431
+ relative offset between the anchor and the ground truth box.
432
+ """
433
+ assert len(features) == self._num_features
434
+ logits = []
435
+ bbox_reg = []
436
+ for feature in features:
437
+ logits.append(self.cls_score(self.cls_subnet(feature)))
438
+ bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
439
+ return logits, bbox_reg
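A self-contained shape check for the head defined above, assuming `RetinaNetHead` is importable from this module; the three feature levels, channel count, anchor count and class count are illustrative values rather than config defaults:

    import torch
    from annotator.oneformer.detectron2.layers import ShapeSpec

    # Three FPN-like levels with 256 channels each, 9 anchors per location, 80 classes.
    shapes = [ShapeSpec(channels=256, stride=s) for s in (8, 16, 32)]
    head = RetinaNetHead(
        input_shape=shapes,
        num_classes=80,
        num_anchors=9,
        conv_dims=[256, 256, 256, 256],
    )
    feats = [torch.randn(2, 256, hw, hw) for hw in (64, 32, 16)]
    logits, bbox_reg = head(feats)
    print(logits[0].shape, bbox_reg[0].shape)  # (2, 9*80, 64, 64) and (2, 9*4, 64, 64)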
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/semantic_seg.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import numpy as np
3
+ from typing import Callable, Dict, Optional, Tuple, Union
4
+ import fvcore.nn.weight_init as weight_init
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from annotator.oneformer.detectron2.config import configurable
10
+ from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
11
+ from annotator.oneformer.detectron2.structures import ImageList
12
+ from annotator.oneformer.detectron2.utils.registry import Registry
13
+
14
+ from ..backbone import Backbone, build_backbone
15
+ from ..postprocessing import sem_seg_postprocess
16
+ from .build import META_ARCH_REGISTRY
17
+
18
+ __all__ = [
19
+ "SemanticSegmentor",
20
+ "SEM_SEG_HEADS_REGISTRY",
21
+ "SemSegFPNHead",
22
+ "build_sem_seg_head",
23
+ ]
24
+
25
+
26
+ SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
27
+ SEM_SEG_HEADS_REGISTRY.__doc__ = """
28
+ Registry for semantic segmentation heads, which make semantic segmentation predictions
29
+ from feature maps.
30
+ """
31
+
32
+
33
+ @META_ARCH_REGISTRY.register()
34
+ class SemanticSegmentor(nn.Module):
35
+ """
36
+ Main class for semantic segmentation architectures.
37
+ """
38
+
39
+ @configurable
40
+ def __init__(
41
+ self,
42
+ *,
43
+ backbone: Backbone,
44
+ sem_seg_head: nn.Module,
45
+ pixel_mean: Tuple[float],
46
+ pixel_std: Tuple[float],
47
+ ):
48
+ """
49
+ Args:
50
+ backbone: a backbone module, must follow detectron2's backbone interface
51
+ sem_seg_head: a module that predicts semantic segmentation from backbone features
52
+ pixel_mean, pixel_std: list or tuple with #channels elements, representing
53
+ the per-channel mean and std to be used to normalize the input image
54
+ """
55
+ super().__init__()
56
+ self.backbone = backbone
57
+ self.sem_seg_head = sem_seg_head
58
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
59
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
60
+
61
+ @classmethod
62
+ def from_config(cls, cfg):
63
+ backbone = build_backbone(cfg)
64
+ sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
65
+ return {
66
+ "backbone": backbone,
67
+ "sem_seg_head": sem_seg_head,
68
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
69
+ "pixel_std": cfg.MODEL.PIXEL_STD,
70
+ }
71
+
72
+ @property
73
+ def device(self):
74
+ return self.pixel_mean.device
75
+
76
+ def forward(self, batched_inputs):
77
+ """
78
+ Args:
79
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
80
+ Each item in the list contains the inputs for one image.
81
+
82
+ For now, each item in the list is a dict that contains:
83
+
84
+ * "image": Tensor, image in (C, H, W) format.
85
+ * "sem_seg": semantic segmentation ground truth
86
+ * Other information that's included in the original dicts, such as:
87
+ "height", "width" (int): the output resolution of the model (may be different
88
+ from input resolution), used in inference.
89
+
90
+
91
+ Returns:
92
+ list[dict]:
93
+ Each dict is the output for one input image.
94
+ The dict contains one key "sem_seg" whose value is a
95
+ Tensor that represents the
96
+ per-pixel segmentation predicted by the head.
97
+ The prediction has shape KxHxW that represents the logits of
98
+ each class for each pixel.
99
+ """
100
+ images = [x["image"].to(self.device) for x in batched_inputs]
101
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
102
+ images = ImageList.from_tensors(
103
+ images,
104
+ self.backbone.size_divisibility,
105
+ padding_constraints=self.backbone.padding_constraints,
106
+ )
107
+
108
+ features = self.backbone(images.tensor)
109
+
110
+ if "sem_seg" in batched_inputs[0]:
111
+ targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
112
+ targets = ImageList.from_tensors(
113
+ targets,
114
+ self.backbone.size_divisibility,
115
+ self.sem_seg_head.ignore_value,
116
+ self.backbone.padding_constraints,
117
+ ).tensor
118
+ else:
119
+ targets = None
120
+ results, losses = self.sem_seg_head(features, targets)
121
+
122
+ if self.training:
123
+ return losses
124
+
125
+ processed_results = []
126
+ for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
127
+ height = input_per_image.get("height", image_size[0])
128
+ width = input_per_image.get("width", image_size[1])
129
+ r = sem_seg_postprocess(result, image_size, height, width)
130
+ processed_results.append({"sem_seg": r})
131
+ return processed_results
132
+
133
+
134
+ def build_sem_seg_head(cfg, input_shape):
135
+ """
136
+ Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.
137
+ """
138
+ name = cfg.MODEL.SEM_SEG_HEAD.NAME
139
+ return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
140
+
141
+
142
+ @SEM_SEG_HEADS_REGISTRY.register()
143
+ class SemSegFPNHead(nn.Module):
144
+ """
145
+ A semantic segmentation head described in :paper:`PanopticFPN`.
146
+ It takes a list of FPN features as input, and applies a sequence of
147
+ 3x3 convs and upsampling to scale all of them to the stride defined by
148
+ ``common_stride``. Then these features are added and used to make final
149
+ predictions by another 1x1 conv layer.
150
+ """
151
+
152
+ @configurable
153
+ def __init__(
154
+ self,
155
+ input_shape: Dict[str, ShapeSpec],
156
+ *,
157
+ num_classes: int,
158
+ conv_dims: int,
159
+ common_stride: int,
160
+ loss_weight: float = 1.0,
161
+ norm: Optional[Union[str, Callable]] = None,
162
+ ignore_value: int = -1,
163
+ ):
164
+ """
165
+ NOTE: this interface is experimental.
166
+
167
+ Args:
168
+ input_shape: shapes (channels and stride) of the input features
169
+ num_classes: number of classes to predict
170
+ conv_dims: number of output channels for the intermediate conv layers.
171
+ common_stride: the common stride that all features will be upscaled to
172
+ loss_weight: loss weight
173
+ norm (str or callable): normalization for all conv layers
174
+ ignore_value: category id to be ignored during training.
175
+ """
176
+ super().__init__()
177
+ input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
178
+ if not len(input_shape):
179
+ raise ValueError("SemSegFPNHead(input_shape=) cannot be empty!")
180
+ self.in_features = [k for k, v in input_shape]
181
+ feature_strides = [v.stride for k, v in input_shape]
182
+ feature_channels = [v.channels for k, v in input_shape]
183
+
184
+ self.ignore_value = ignore_value
185
+ self.common_stride = common_stride
186
+ self.loss_weight = loss_weight
187
+
188
+ self.scale_heads = []
189
+ for in_feature, stride, channels in zip(
190
+ self.in_features, feature_strides, feature_channels
191
+ ):
192
+ head_ops = []
193
+ head_length = max(1, int(np.log2(stride) - np.log2(self.common_stride)))
194
+ for k in range(head_length):
195
+ norm_module = get_norm(norm, conv_dims)
196
+ conv = Conv2d(
197
+ channels if k == 0 else conv_dims,
198
+ conv_dims,
199
+ kernel_size=3,
200
+ stride=1,
201
+ padding=1,
202
+ bias=not norm,
203
+ norm=norm_module,
204
+ activation=F.relu,
205
+ )
206
+ weight_init.c2_msra_fill(conv)
207
+ head_ops.append(conv)
208
+ if stride != self.common_stride:
209
+ head_ops.append(
210
+ nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
211
+ )
212
+ self.scale_heads.append(nn.Sequential(*head_ops))
213
+ self.add_module(in_feature, self.scale_heads[-1])
214
+ self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
215
+ weight_init.c2_msra_fill(self.predictor)
216
+
217
+ @classmethod
218
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
219
+ return {
220
+ "input_shape": {
221
+ k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
222
+ },
223
+ "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
224
+ "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
225
+ "conv_dims": cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM,
226
+ "common_stride": cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE,
227
+ "norm": cfg.MODEL.SEM_SEG_HEAD.NORM,
228
+ "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
229
+ }
230
+
231
+ def forward(self, features, targets=None):
232
+ """
233
+ Returns:
234
+ In training, returns (None, dict of losses)
235
+ In inference, returns (CxHxW logits, {})
236
+ """
237
+ x = self.layers(features)
238
+ if self.training:
239
+ return None, self.losses(x, targets)
240
+ else:
241
+ x = F.interpolate(
242
+ x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
243
+ )
244
+ return x, {}
245
+
246
+ def layers(self, features):
247
+ for i, f in enumerate(self.in_features):
248
+ if i == 0:
249
+ x = self.scale_heads[i](features[f])
250
+ else:
251
+ x = x + self.scale_heads[i](features[f])
252
+ x = self.predictor(x)
253
+ return x
254
+
255
+ def losses(self, predictions, targets):
256
+ predictions = predictions.float() # https://github.com/pytorch/pytorch/issues/48163
257
+ predictions = F.interpolate(
258
+ predictions,
259
+ scale_factor=self.common_stride,
260
+ mode="bilinear",
261
+ align_corners=False,
262
+ )
263
+ loss = F.cross_entropy(
264
+ predictions, targets, reduction="mean", ignore_index=self.ignore_value
265
+ )
266
+ losses = {"loss_sem_seg": loss * self.loss_weight}
267
+ return losses
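A small stand-alone check of `SemSegFPNHead`, assuming it is importable from this module; the feature pyramid, class count and conv width below are illustrative choices:

    import torch
    from annotator.oneformer.detectron2.layers import ShapeSpec

    # p2-p5 style features for a 256x256 input, 54 stuff classes, 128-wide convs;
    # predictions are made at 1/4 resolution and upsampled back to the input size.
    shapes = {f"p{i}": ShapeSpec(channels=256, stride=2 ** i) for i in range(2, 6)}
    head = SemSegFPNHead(shapes, num_classes=54, conv_dims=128, common_stride=4)
    head.eval()  # in training mode the head expects targets and returns losses

    feats = {f"p{i}": torch.randn(1, 256, 256 // 2 ** i, 256 // 2 ** i) for i in range(2, 6)}
    logits, _ = head(feats)
    print(logits.shape)  # torch.Size([1, 54, 256, 256])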
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/mmdet_wrapper.py ADDED
@@ -0,0 +1,273 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import itertools
3
+ import logging
4
+ import numpy as np
5
+ from collections import OrderedDict
6
+ from collections.abc import Mapping
7
+ from typing import Dict, List, Optional, Tuple, Union
8
+ import torch
9
+ from omegaconf import DictConfig, OmegaConf
10
+ from torch import Tensor, nn
11
+
12
+ from annotator.oneformer.detectron2.layers import ShapeSpec
13
+ from annotator.oneformer.detectron2.structures import BitMasks, Boxes, ImageList, Instances
14
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
15
+
16
+ from .backbone import Backbone
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def _to_container(cfg):
22
+ """
23
+ mmdet will assert the type of dict/list.
24
+ So convert omegaconf objects to dict/list.
25
+ """
26
+ if isinstance(cfg, DictConfig):
27
+ cfg = OmegaConf.to_container(cfg, resolve=True)
28
+ from mmcv.utils import ConfigDict
29
+
30
+ return ConfigDict(cfg)
31
+
32
+
33
+ class MMDetBackbone(Backbone):
34
+ """
35
+ Wrapper of mmdetection backbones to use in detectron2.
36
+
37
+ mmdet backbones produce list/tuple of tensors, while detectron2 backbones
38
+ produce a dict of tensors. This class wraps the given backbone to produce
39
+ output in detectron2's convention, so it can be used in place of detectron2
40
+ backbones.
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ backbone: Union[nn.Module, Mapping],
46
+ neck: Union[nn.Module, Mapping, None] = None,
47
+ *,
48
+ output_shapes: List[ShapeSpec],
49
+ output_names: Optional[List[str]] = None,
50
+ ):
51
+ """
52
+ Args:
53
+ backbone: either a backbone module or a mmdet config dict that defines a
54
+ backbone. The backbone takes a 4D image tensor and returns a
55
+ sequence of tensors.
56
+ neck: either a neck module or a mmdet config dict that defines a
57
+ neck. The neck takes outputs of backbone and returns a
58
+ sequence of tensors. If None, no neck is used.
59
+ output_shapes: shape for every output of the backbone (or neck, if given).
60
+ stride and channels are often needed.
61
+ output_names: names for every output of the backbone (or neck, if given).
62
+ By default, will use "out0", "out1", ...
63
+ """
64
+ super().__init__()
65
+ if isinstance(backbone, Mapping):
66
+ from mmdet.models import build_backbone
67
+
68
+ backbone = build_backbone(_to_container(backbone))
69
+ self.backbone = backbone
70
+
71
+ if isinstance(neck, Mapping):
72
+ from mmdet.models import build_neck
73
+
74
+ neck = build_neck(_to_container(neck))
75
+ self.neck = neck
76
+
77
+ # "Neck" weights, if any, are part of neck itself. This is the interface
78
+ # of mmdet so we follow it. Reference:
79
+ # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
80
+ logger.info("Initializing mmdet backbone weights...")
81
+ self.backbone.init_weights()
82
+ # train() in mmdet modules is non-trivial, and has to be explicitly
83
+ # called. Reference:
84
+ # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
85
+ self.backbone.train()
86
+ if self.neck is not None:
87
+ logger.info("Initializing mmdet neck weights ...")
88
+ if isinstance(self.neck, nn.Sequential):
89
+ for m in self.neck:
90
+ m.init_weights()
91
+ else:
92
+ self.neck.init_weights()
93
+ self.neck.train()
94
+
95
+ self._output_shapes = output_shapes
96
+ if not output_names:
97
+ output_names = [f"out{i}" for i in range(len(output_shapes))]
98
+ self._output_names = output_names
99
+
100
+ def forward(self, x) -> Dict[str, Tensor]:
101
+ outs = self.backbone(x)
102
+ if self.neck is not None:
103
+ outs = self.neck(outs)
104
+ assert isinstance(
105
+ outs, (list, tuple)
106
+ ), "mmdet backbone should return a list/tuple of tensors!"
107
+ if len(outs) != len(self._output_shapes):
108
+ raise ValueError(
109
+ "Length of output_shapes does not match outputs from the mmdet backbone: "
110
+ f"{len(outs)} != {len(self._output_shapes)}"
111
+ )
112
+ return {k: v for k, v in zip(self._output_names, outs)}
113
+
114
+ def output_shape(self) -> Dict[str, ShapeSpec]:
115
+ return {k: v for k, v in zip(self._output_names, self._output_shapes)}
116
+
117
+
118
+ class MMDetDetector(nn.Module):
119
+ """
120
+ Wrapper of a mmdetection detector model, for detection and instance segmentation.
121
+ Input/output formats of this class follow detectron2's convention, so a
122
+ mmdetection model can be trained and evaluated in detectron2.
123
+ """
124
+
125
+ def __init__(
126
+ self,
127
+ detector: Union[nn.Module, Mapping],
128
+ *,
129
+ # Default is 32 regardless of model:
130
+ # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
131
+ size_divisibility=32,
132
+ pixel_mean: Tuple[float],
133
+ pixel_std: Tuple[float],
134
+ ):
135
+ """
136
+ Args:
137
+ detector: a mmdet detector, or a mmdet config dict that defines a detector.
138
+ size_divisibility: pad input images to multiple of this number
139
+ pixel_mean: per-channel mean to normalize input image
140
+ pixel_std: per-channel stddev to normalize input image
141
+ """
142
+ super().__init__()
143
+ if isinstance(detector, Mapping):
144
+ from mmdet.models import build_detector
145
+
146
+ detector = build_detector(_to_container(detector))
147
+ self.detector = detector
148
+ self.detector.init_weights()
149
+ self.size_divisibility = size_divisibility
150
+
151
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
152
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
153
+ assert (
154
+ self.pixel_mean.shape == self.pixel_std.shape
155
+ ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
156
+
157
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
158
+ images = [x["image"].to(self.device) for x in batched_inputs]
159
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
160
+ images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor
161
+ metas = []
162
+ rescale = {"height" in x for x in batched_inputs}
163
+ if len(rescale) != 1:
164
+ raise ValueError("Some inputs have original height/width, but some don't!")
165
+ rescale = list(rescale)[0]
166
+ output_shapes = []
167
+ for input in batched_inputs:
168
+ meta = {}
169
+ c, h, w = input["image"].shape
170
+ meta["img_shape"] = meta["ori_shape"] = (h, w, c)
171
+ if rescale:
172
+ scale_factor = np.array(
173
+ [w / input["width"], h / input["height"]] * 2, dtype="float32"
174
+ )
175
+ ori_shape = (input["height"], input["width"])
176
+ output_shapes.append(ori_shape)
177
+ meta["ori_shape"] = ori_shape + (c,)
178
+ else:
179
+ scale_factor = 1.0
180
+ output_shapes.append((h, w))
181
+ meta["scale_factor"] = scale_factor
182
+ meta["flip"] = False
183
+ padh, padw = images.shape[-2:]
184
+ meta["pad_shape"] = (padh, padw, c)
185
+ metas.append(meta)
186
+
187
+ if self.training:
188
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
189
+ if gt_instances[0].has("gt_masks"):
190
+ from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks
191
+
192
+ def convert_mask(m, shape):
193
+ # mmdet mask format
194
+ if isinstance(m, BitMasks):
195
+ return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
196
+ else:
197
+ return mm_PolygonMasks(m.polygons, shape[0], shape[1])
198
+
199
+ gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances]
200
+ losses_and_metrics = self.detector.forward_train(
201
+ images,
202
+ metas,
203
+ [x.gt_boxes.tensor for x in gt_instances],
204
+ [x.gt_classes for x in gt_instances],
205
+ gt_masks=gt_masks,
206
+ )
207
+ else:
208
+ losses_and_metrics = self.detector.forward_train(
209
+ images,
210
+ metas,
211
+ [x.gt_boxes.tensor for x in gt_instances],
212
+ [x.gt_classes for x in gt_instances],
213
+ )
214
+ return _parse_losses(losses_and_metrics)
215
+ else:
216
+ results = self.detector.simple_test(images, metas, rescale=rescale)
217
+ results = [
218
+ {"instances": _convert_mmdet_result(r, shape)}
219
+ for r, shape in zip(results, output_shapes)
220
+ ]
221
+ return results
222
+
223
+ @property
224
+ def device(self):
225
+ return self.pixel_mean.device
226
+
227
+
228
+ # Reference: show_result() in
229
+ # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
230
+ def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
231
+ if isinstance(result, tuple):
232
+ bbox_result, segm_result = result
233
+ if isinstance(segm_result, tuple):
234
+ segm_result = segm_result[0]
235
+ else:
236
+ bbox_result, segm_result = result, None
237
+
238
+ bboxes = torch.from_numpy(np.vstack(bbox_result)) # Nx5
239
+ bboxes, scores = bboxes[:, :4], bboxes[:, -1]
240
+ labels = [
241
+ torch.full((bbox.shape[0],), i, dtype=torch.int32) for i, bbox in enumerate(bbox_result)
242
+ ]
243
+ labels = torch.cat(labels)
244
+ inst = Instances(shape)
245
+ inst.pred_boxes = Boxes(bboxes)
246
+ inst.scores = scores
247
+ inst.pred_classes = labels
248
+
249
+ if segm_result is not None and len(labels) > 0:
250
+ segm_result = list(itertools.chain(*segm_result))
251
+ segm_result = [torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in segm_result]
252
+ segm_result = torch.stack(segm_result, dim=0)
253
+ inst.pred_masks = segm_result
254
+ return inst
255
+
256
+
257
+ # reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
258
+ def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
259
+ log_vars = OrderedDict()
260
+ for loss_name, loss_value in losses.items():
261
+ if isinstance(loss_value, torch.Tensor):
262
+ log_vars[loss_name] = loss_value.mean()
263
+ elif isinstance(loss_value, list):
264
+ log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
265
+ else:
266
+ raise TypeError(f"{loss_name} is not a tensor or list of tensors")
267
+
268
+ if "loss" not in loss_name:
269
+ # put metrics to storage; don't return them
270
+ storage = get_event_storage()
271
+ value = log_vars.pop(loss_name).cpu().item()
272
+ storage.put_scalar(loss_name, value)
273
+ return log_vars
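The loss-parsing helper above can be illustrated with a toy dict, assuming `_parse_losses` is imported from this module; the key names are made up, and the `EventStorage` context is only needed because one key does not contain "loss" and is therefore logged as a metric:

    import torch
    from annotator.oneformer.detectron2.utils.events import EventStorage

    raw = {
        "loss_cls": torch.tensor([0.7, 0.5]),                  # tensor -> mean, 0.6
        "loss_bbox": [torch.tensor(0.2), torch.tensor(0.3)],   # list   -> sum of means, 0.5
        "acc": torch.tensor(0.9),                              # metric -> logged, not returned
    }
    with EventStorage():
        parsed = _parse_losses(raw)
    print(sorted(parsed))  # ['loss_bbox', 'loss_cls']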
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/poolers.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import math
3
+ from typing import List, Optional
4
+ import torch
5
+ from torch import nn
6
+ from torchvision.ops import RoIPool
7
+
8
+ from annotator.oneformer.detectron2.layers import ROIAlign, ROIAlignRotated, cat, nonzero_tuple, shapes_to_tensor
9
+ from annotator.oneformer.detectron2.structures import Boxes
10
+ from annotator.oneformer.detectron2.utils.tracing import assert_fx_safe, is_fx_tracing
11
+
12
+ """
13
+ To export ROIPooler to torchscript, in this file, variables that should be annotated with
14
+ `Union[List[Boxes], List[RotatedBoxes]]` are only annotated with `List[Boxes]`.
15
+
16
+ TODO: Correct these annotations when torchscript support `Union`.
17
+ https://github.com/pytorch/pytorch/issues/41412
18
+ """
19
+
20
+ __all__ = ["ROIPooler"]
21
+
22
+
23
+ def assign_boxes_to_levels(
24
+ box_lists: List[Boxes],
25
+ min_level: int,
26
+ max_level: int,
27
+ canonical_box_size: int,
28
+ canonical_level: int,
29
+ ):
30
+ """
31
+ Map each box in `box_lists` to a feature map level index and return the assignment
32
+ vector.
33
+
34
+ Args:
35
+ box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
36
+ where N is the number of images in the batch.
37
+ min_level (int): Smallest feature map level index. The input is considered index 0,
38
+ the output of stage 1 is index 1, and so on.
39
+ max_level (int): Largest feature map level index.
40
+ canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
41
+ canonical_level (int): The feature map level index on which a canonically-sized box
42
+ should be placed.
43
+
44
+ Returns:
45
+ A tensor of length M, where M is the total number of boxes aggregated over all
46
+ N batch images. The memory layout corresponds to the concatenation of boxes
47
+ from all images. Each element is the feature map index, as an offset from
48
+ `self.min_level`, for the corresponding box (so value i means the box is at
49
+ `self.min_level + i`).
50
+ """
51
+ box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
52
+ # Eqn.(1) in FPN paper
53
+ level_assignments = torch.floor(
54
+ canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)
55
+ )
56
+ # clamp level to (min, max), in case the box size is too large or too small
57
+ # for the available feature maps
58
+ level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
59
+ return level_assignments.to(torch.int64) - min_level
60
+
61
+
62
+ # script the module to avoid hardcoded device type
63
+ @torch.jit.script_if_tracing
64
+ def _convert_boxes_to_pooler_format(boxes: torch.Tensor, sizes: torch.Tensor) -> torch.Tensor:
65
+ sizes = sizes.to(device=boxes.device)
66
+ indices = torch.repeat_interleave(
67
+ torch.arange(len(sizes), dtype=boxes.dtype, device=boxes.device), sizes
68
+ )
69
+ return cat([indices[:, None], boxes], dim=1)
70
+
71
+
72
+ def convert_boxes_to_pooler_format(box_lists: List[Boxes]):
73
+ """
74
+ Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
75
+ (see description under Returns).
76
+
77
+ Args:
78
+ box_lists (list[Boxes] | list[RotatedBoxes]):
79
+ A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
80
+
81
+ Returns:
82
+ When input is list[Boxes]:
83
+ A tensor of shape (M, 5), where M is the total number of boxes aggregated over all
84
+ N batch images.
85
+ The 5 columns are (batch index, x0, y0, x1, y1), where batch index
86
+ is the index in [0, N) identifying which batch image the box with corners at
87
+ (x0, y0, x1, y1) comes from.
88
+ When input is list[RotatedBoxes]:
89
+ A tensor of shape (M, 6), where M is the total number of boxes aggregated over all
90
+ N batch images.
91
+ The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees),
92
+ where batch index is the index in [0, N) identifying which batch image the
93
+ rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from.
94
+ """
95
+ boxes = torch.cat([x.tensor for x in box_lists], dim=0)
96
+ # __len__ returns Tensor in tracing.
97
+ sizes = shapes_to_tensor([x.__len__() for x in box_lists])
98
+ return _convert_boxes_to_pooler_format(boxes, sizes)
99
+
100
+
101
+ @torch.jit.script_if_tracing
102
+ def _create_zeros(
103
+ batch_target: Optional[torch.Tensor],
104
+ channels: int,
105
+ height: int,
106
+ width: int,
107
+ like_tensor: torch.Tensor,
108
+ ) -> torch.Tensor:
109
+ batches = batch_target.shape[0] if batch_target is not None else 0
110
+ sizes = (batches, channels, height, width)
111
+ return torch.zeros(sizes, dtype=like_tensor.dtype, device=like_tensor.device)
112
+
113
+
114
+ class ROIPooler(nn.Module):
115
+ """
116
+ Region of interest feature map pooler that supports pooling from one or more
117
+ feature maps.
118
+ """
119
+
120
+ def __init__(
121
+ self,
122
+ output_size,
123
+ scales,
124
+ sampling_ratio,
125
+ pooler_type,
126
+ canonical_box_size=224,
127
+ canonical_level=4,
128
+ ):
129
+ """
130
+ Args:
131
+ output_size (int, tuple[int] or list[int]): output size of the pooled region,
132
+ e.g., 14 x 14. If tuple or list is given, the length must be 2.
133
+ scales (list[float]): The scale for each low-level pooling op relative to
134
+ the input image. For a feature map with stride s relative to the input
135
+ image, scale is defined as 1/s. The stride must be power of 2.
136
+ When there are multiple scales, they must form a pyramid, i.e. they must be
137
+ a monotonically decreasing geometric sequence with a factor of 1/2.
138
+ sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op.
139
+ pooler_type (string): Name of the type of pooling operation that should be applied.
140
+ For instance, "ROIPool" or "ROIAlignV2".
141
+ canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default
142
+ is heuristically defined as 224 pixels in the FPN paper (based on ImageNet
143
+ pre-training).
144
+ canonical_level (int): The feature map level index on which a canonically-sized box
145
+ should be placed. The default is defined as level 4 (stride=16) in the FPN paper,
146
+ i.e., a box of size 224x224 will be placed on the feature with stride=16.
147
+ The box placement for all boxes will be determined from their sizes w.r.t
148
+ canonical_box_size. For example, a box whose area is 4x that of a canonical box
149
+ should be used to pool features from feature level ``canonical_level+1``.
150
+
151
+ Note that the actual input feature maps given to this module may not have
152
+ sufficiently many levels for the input boxes. If the boxes are too large or too
153
+ small for the input feature maps, the closest level will be used.
154
+ """
155
+ super().__init__()
156
+
157
+ if isinstance(output_size, int):
158
+ output_size = (output_size, output_size)
159
+ assert len(output_size) == 2
160
+ assert isinstance(output_size[0], int) and isinstance(output_size[1], int)
161
+ self.output_size = output_size
162
+
163
+ if pooler_type == "ROIAlign":
164
+ self.level_poolers = nn.ModuleList(
165
+ ROIAlign(
166
+ output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False
167
+ )
168
+ for scale in scales
169
+ )
170
+ elif pooler_type == "ROIAlignV2":
171
+ self.level_poolers = nn.ModuleList(
172
+ ROIAlign(
173
+ output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True
174
+ )
175
+ for scale in scales
176
+ )
177
+ elif pooler_type == "ROIPool":
178
+ self.level_poolers = nn.ModuleList(
179
+ RoIPool(output_size, spatial_scale=scale) for scale in scales
180
+ )
181
+ elif pooler_type == "ROIAlignRotated":
182
+ self.level_poolers = nn.ModuleList(
183
+ ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio)
184
+ for scale in scales
185
+ )
186
+ else:
187
+ raise ValueError("Unknown pooler type: {}".format(pooler_type))
188
+
189
+ # Map scale (defined as 1 / stride) to its feature map level under the
190
+ # assumption that stride is a power of 2.
191
+ min_level = -(math.log2(scales[0]))
192
+ max_level = -(math.log2(scales[-1]))
193
+ assert math.isclose(min_level, int(min_level)) and math.isclose(
194
+ max_level, int(max_level)
195
+ ), "Featuremap stride is not power of 2!"
196
+ self.min_level = int(min_level)
197
+ self.max_level = int(max_level)
198
+ assert (
199
+ len(scales) == self.max_level - self.min_level + 1
200
+ ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!"
201
+ assert 0 <= self.min_level and self.min_level <= self.max_level
202
+ self.canonical_level = canonical_level
203
+ assert canonical_box_size > 0
204
+ self.canonical_box_size = canonical_box_size
205
+
206
+ def forward(self, x: List[torch.Tensor], box_lists: List[Boxes]):
207
+ """
208
+ Args:
209
+ x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
210
+ used to construct this module.
211
+ box_lists (list[Boxes] | list[RotatedBoxes]):
212
+ A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
213
+ The box coordinates are defined on the original image and
214
+ will be scaled by the `scales` argument of :class:`ROIPooler`.
215
+
216
+ Returns:
217
+ Tensor:
218
+ A tensor of shape (M, C, output_size, output_size) where M is the total number of
219
+ boxes aggregated over all N batch images and C is the number of channels in `x`.
220
+ """
221
+ num_level_assignments = len(self.level_poolers)
222
+
223
+ if not is_fx_tracing():
224
+ torch._assert(
225
+ isinstance(x, list) and isinstance(box_lists, list),
226
+ "Arguments to pooler must be lists",
227
+ )
228
+ assert_fx_safe(
229
+ len(x) == num_level_assignments,
230
+ "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
231
+ num_level_assignments, len(x)
232
+ ),
233
+ )
234
+ assert_fx_safe(
235
+ len(box_lists) == x[0].size(0),
236
+ "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
237
+ x[0].size(0), len(box_lists)
238
+ ),
239
+ )
240
+ if len(box_lists) == 0:
241
+ return _create_zeros(None, x[0].shape[1], *self.output_size, x[0])
242
+
243
+ pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)
244
+
245
+ if num_level_assignments == 1:
246
+ return self.level_poolers[0](x[0], pooler_fmt_boxes)
247
+
248
+ level_assignments = assign_boxes_to_levels(
249
+ box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level
250
+ )
251
+
252
+ num_channels = x[0].shape[1]
253
+ output_size = self.output_size[0]
254
+
255
+ output = _create_zeros(pooler_fmt_boxes, num_channels, output_size, output_size, x[0])
256
+
257
+ for level, pooler in enumerate(self.level_poolers):
258
+ inds = nonzero_tuple(level_assignments == level)[0]
259
+ pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
260
+ # Use index_put_ instead of advance indexing, to avoid pytorch/issues/49852
261
+ output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level))
262
+
263
+ return output
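For reference, a worked sketch (not part of the diff) of the FPN Eqn.(1) mapping that assign_boxes_to_levels implements: level = floor(canonical_level + log2(sqrt(area) / canonical_box_size)), clamped to the available pyramid levels and returned as an offset from min_level.

import torch

box_sizes = torch.tensor([112.0, 224.0, 448.0, 2000.0])   # sqrt(box area) in pixels
canonical_box_size, canonical_level = 224, 4               # defaults used by ROIPooler
min_level, max_level = 2, 5                                # e.g. an FPN with strides 4..32

levels = torch.floor(canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8))
levels = torch.clamp(levels, min=min_level, max=max_level).to(torch.int64) - min_level
print(levels.tolist())   # [1, 2, 3, 3] -> absolute levels 3, 4, 5, 5 (the 2000px box is clamped)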
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/postprocessing.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+ from annotator.oneformer.detectron2.structures import Instances, ROIMasks
6
+
7
+
8
+ # perhaps should rename to "resize_instance"
9
+ def detector_postprocess(
10
+ results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5
11
+ ):
12
+ """
13
+ Resize the output instances.
14
+ The input images are often resized when entering an object detector.
15
+ As a result, we often need the outputs of the detector in a different
16
+ resolution from its inputs.
17
+
18
+ This function will resize the raw outputs of an R-CNN detector
19
+ to produce outputs according to the desired output resolution.
20
+
21
+ Args:
22
+ results (Instances): the raw outputs from the detector.
23
+ `results.image_size` contains the input image resolution the detector sees.
24
+ This object might be modified in-place.
25
+ output_height, output_width: the desired output resolution.
26
+ Returns:
27
+ Instances: the resized output from the model, based on the output resolution
28
+ """
29
+ if isinstance(output_width, torch.Tensor):
30
+ # This shape might (but not necessarily) be tensors during tracing.
31
+ # Converts integer tensors to float temporaries to ensure true
32
+ # division is performed when computing scale_x and scale_y.
33
+ output_width_tmp = output_width.float()
34
+ output_height_tmp = output_height.float()
35
+ new_size = torch.stack([output_height, output_width])
36
+ else:
37
+ new_size = (output_height, output_width)
38
+ output_width_tmp = output_width
39
+ output_height_tmp = output_height
40
+
41
+ scale_x, scale_y = (
42
+ output_width_tmp / results.image_size[1],
43
+ output_height_tmp / results.image_size[0],
44
+ )
45
+ results = Instances(new_size, **results.get_fields())
46
+
47
+ if results.has("pred_boxes"):
48
+ output_boxes = results.pred_boxes
49
+ elif results.has("proposal_boxes"):
50
+ output_boxes = results.proposal_boxes
51
+ else:
52
+ output_boxes = None
53
+ assert output_boxes is not None, "Predictions must contain boxes!"
54
+
55
+ output_boxes.scale(scale_x, scale_y)
56
+ output_boxes.clip(results.image_size)
57
+
58
+ results = results[output_boxes.nonempty()]
59
+
60
+ if results.has("pred_masks"):
61
+ if isinstance(results.pred_masks, ROIMasks):
62
+ roi_masks = results.pred_masks
63
+ else:
64
+ # pred_masks is a tensor of shape (N, 1, M, M)
65
+ roi_masks = ROIMasks(results.pred_masks[:, 0, :, :])
66
+ results.pred_masks = roi_masks.to_bitmasks(
67
+ results.pred_boxes, output_height, output_width, mask_threshold
68
+ ).tensor # TODO return ROIMasks/BitMask object in the future
69
+
70
+ if results.has("pred_keypoints"):
71
+ results.pred_keypoints[:, :, 0] *= scale_x
72
+ results.pred_keypoints[:, :, 1] *= scale_y
73
+
74
+ return results
75
+
76
+
77
+ def sem_seg_postprocess(result, img_size, output_height, output_width):
78
+ """
79
+ Return semantic segmentation predictions in the original resolution.
80
+
81
+ The input images are often resized when entering the semantic segmentor. Moreover, in some
82
+ cases, they are also padded inside the segmentor to be divisible by the maximum network stride.
83
+ As a result, we often need the predictions of the segmentor in a different
84
+ resolution from its inputs.
85
+
86
+ Args:
87
+ result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
88
+ where C is the number of classes, and H, W are the height and width of the prediction.
89
+ img_size (tuple): image size that segmentor is taking as input.
90
+ output_height, output_width: the desired output resolution.
91
+
92
+ Returns:
93
+ semantic segmentation prediction (Tensor): A tensor of the shape
94
+ (C, output_height, output_width) that contains per-pixel soft predictions.
95
+ """
96
+ result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
97
+ result = F.interpolate(
98
+ result, size=(output_height, output_width), mode="bilinear", align_corners=False
99
+ )[0]
100
+ return result
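A small sketch (not part of the diff, dummy shapes) of what sem_seg_postprocess does: crop the logits back to the unpadded size the segmentor saw, then bilinearly resize them to the original image resolution.

import torch
import torch.nn.functional as F

result = torch.randn(19, 512, 704)         # (C, H_pad, W_pad) logits from the segmentor
img_size = (500, 700)                      # unpadded size the segmentor actually saw
output_height, output_width = 1000, 1400   # original image resolution

# The module itself uses expand(1, -1, -1, -1); unsqueeze(0) is equivalent for this purpose.
cropped = result[:, : img_size[0], : img_size[1]].unsqueeze(0)   # (1, C, 500, 700)
resized = F.interpolate(
    cropped, size=(output_height, output_width), mode="bilinear", align_corners=False
)[0]
print(resized.shape)   # torch.Size([19, 1000, 1400])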
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator
3
+ from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN, StandardRPNHead
4
+
5
+ __all__ = list(globals().keys())
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/build.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from annotator.oneformer.detectron2.utils.registry import Registry
3
+
4
+ PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR")
5
+ PROPOSAL_GENERATOR_REGISTRY.__doc__ = """
6
+ Registry for proposal generator, which produces object proposals from feature maps.
7
+
8
+ The registered object will be called with `obj(cfg, input_shape)`.
9
+ The call should return a `nn.Module` object.
10
+ """
11
+
12
+ from . import rpn, rrpn # noqa F401 isort:skip
13
+
14
+
15
+ def build_proposal_generator(cfg, input_shape):
16
+ """
17
+ Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`.
18
+ The name can be "PrecomputedProposals" to use no proposal generator.
19
+ """
20
+ name = cfg.MODEL.PROPOSAL_GENERATOR.NAME
21
+ if name == "PrecomputedProposals":
22
+ return None
23
+
24
+ return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape)
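The registry above is how new proposal generators get plugged in: register a module that accepts (cfg, input_shape), then select it by name in the config. A hedged sketch following the diff's own import paths (the class below is a hypothetical example, not part of the diff):

import torch.nn as nn
from annotator.oneformer.detectron2.modeling.proposal_generator.build import (
    PROPOSAL_GENERATOR_REGISTRY,
    build_proposal_generator,
)

@PROPOSAL_GENERATOR_REGISTRY.register()
class DummyProposalGenerator(nn.Module):
    """Hypothetical generator that returns no proposals and no losses."""

    def __init__(self, cfg, input_shape):
        super().__init__()

    def forward(self, images, features, gt_instances=None):
        return [], {}

# With cfg.MODEL.PROPOSAL_GENERATOR.NAME = "DummyProposalGenerator":
# proposal_generator = build_proposal_generator(cfg, input_shape)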
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/proposal_utils.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import math
4
+ from typing import List, Tuple, Union
5
+ import torch
6
+
7
+ from annotator.oneformer.detectron2.layers import batched_nms, cat, move_device_like
8
+ from annotator.oneformer.detectron2.structures import Boxes, Instances
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ def _is_tracing():
14
+ # (fixed in TORCH_VERSION >= 1.9)
15
+ if torch.jit.is_scripting():
16
+ # https://github.com/pytorch/pytorch/issues/47379
17
+ return False
18
+ else:
19
+ return torch.jit.is_tracing()
20
+
21
+
22
+ def find_top_rpn_proposals(
23
+ proposals: List[torch.Tensor],
24
+ pred_objectness_logits: List[torch.Tensor],
25
+ image_sizes: List[Tuple[int, int]],
26
+ nms_thresh: float,
27
+ pre_nms_topk: int,
28
+ post_nms_topk: int,
29
+ min_box_size: float,
30
+ training: bool,
31
+ ):
32
+ """
33
+ For each feature map, select the `pre_nms_topk` highest scoring proposals,
34
+ apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
35
+ highest scoring proposals among all the feature maps for each image.
36
+
37
+ Args:
38
+ proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
39
+ All proposal predictions on the feature maps.
40
+ pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
41
+ image_sizes (list[tuple]): sizes (h, w) for each image
42
+ nms_thresh (float): IoU threshold to use for NMS
43
+ pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
44
+ When RPN is run on multiple feature maps (as in FPN) this number is per
45
+ feature map.
46
+ post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
47
+ When RPN is run on multiple feature maps (as in FPN) this number is total,
48
+ over all feature maps.
49
+ min_box_size (float): minimum proposal box side length in pixels (absolute units
50
+ wrt input images).
51
+ training (bool): True if proposals are to be used in training, otherwise False.
52
+ This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
53
+ comment.
54
+
55
+ Returns:
56
+ list[Instances]: list of N Instances. The i-th Instances
57
+ stores post_nms_topk object proposals for image i, sorted by their
58
+ objectness score in descending order.
59
+ """
60
+ num_images = len(image_sizes)
61
+ device = (
62
+ proposals[0].device
63
+ if torch.jit.is_scripting()
64
+ else ("cpu" if torch.jit.is_tracing() else proposals[0].device)
65
+ )
66
+
67
+ # 1. Select top-k anchor for every level and every image
68
+ topk_scores = [] # #lvl Tensor, each of shape N x topk
69
+ topk_proposals = []
70
+ level_ids = [] # #lvl Tensor, each of shape (topk,)
71
+ batch_idx = move_device_like(torch.arange(num_images, device=device), proposals[0])
72
+ for level_id, (proposals_i, logits_i) in enumerate(zip(proposals, pred_objectness_logits)):
73
+ Hi_Wi_A = logits_i.shape[1]
74
+ if isinstance(Hi_Wi_A, torch.Tensor): # it's a tensor in tracing
75
+ num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
76
+ else:
77
+ num_proposals_i = min(Hi_Wi_A, pre_nms_topk)
78
+
79
+ topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
80
+
81
+ # each is N x topk
82
+ topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 4
83
+
84
+ topk_proposals.append(topk_proposals_i)
85
+ topk_scores.append(topk_scores_i)
86
+ level_ids.append(
87
+ move_device_like(
88
+ torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device),
89
+ proposals[0],
90
+ )
91
+ )
92
+
93
+ # 2. Concat all levels together
94
+ topk_scores = cat(topk_scores, dim=1)
95
+ topk_proposals = cat(topk_proposals, dim=1)
96
+ level_ids = cat(level_ids, dim=0)
97
+
98
+ # 3. For each image, run a per-level NMS, and choose topk results.
99
+ results: List[Instances] = []
100
+ for n, image_size in enumerate(image_sizes):
101
+ boxes = Boxes(topk_proposals[n])
102
+ scores_per_img = topk_scores[n]
103
+ lvl = level_ids
104
+
105
+ valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
106
+ if not valid_mask.all():
107
+ if training:
108
+ raise FloatingPointError(
109
+ "Predicted boxes or scores contain Inf/NaN. Training has diverged."
110
+ )
111
+ boxes = boxes[valid_mask]
112
+ scores_per_img = scores_per_img[valid_mask]
113
+ lvl = lvl[valid_mask]
114
+ boxes.clip(image_size)
115
+
116
+ # filter empty boxes
117
+ keep = boxes.nonempty(threshold=min_box_size)
118
+ if _is_tracing() or keep.sum().item() != len(boxes):
119
+ boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]
120
+
121
+ keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
122
+ # In Detectron1, there was different behavior during training vs. testing.
123
+ # (https://github.com/facebookresearch/Detectron/issues/459)
124
+ # During training, topk is over the proposals from *all* images in the training batch.
125
+ # During testing, it is over the proposals for each image separately.
126
+ # As a result, the training behavior becomes batch-dependent,
127
+ # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
128
+ # This bug is addressed in Detectron2 to make the behavior independent of batch size.
129
+ keep = keep[:post_nms_topk] # keep is already sorted
130
+
131
+ res = Instances(image_size)
132
+ res.proposal_boxes = boxes[keep]
133
+ res.objectness_logits = scores_per_img[keep]
134
+ results.append(res)
135
+ return results
136
+
137
+
138
+ def add_ground_truth_to_proposals(
139
+ gt: Union[List[Instances], List[Boxes]], proposals: List[Instances]
140
+ ) -> List[Instances]:
141
+ """
142
+ Call `add_ground_truth_to_proposals_single_image` for all images.
143
+
144
+ Args:
145
+ gt(Union[List[Instances], List[Boxes]): list of N elements. Element i is a Instances
146
+ representing the ground-truth for image i.
147
+ proposals (list[Instances]): list of N elements. Element i is a Instances
148
+ representing the proposals for image i.
149
+
150
+ Returns:
151
+ list[Instances]: list of N Instances. Each is the proposals for the image,
152
+ with field "proposal_boxes" and "objectness_logits".
153
+ """
154
+ assert gt is not None
155
+
156
+ if len(proposals) != len(gt):
157
+ raise ValueError("proposals and gt should have the same length as the number of images!")
158
+ if len(proposals) == 0:
159
+ return proposals
160
+
161
+ return [
162
+ add_ground_truth_to_proposals_single_image(gt_i, proposals_i)
163
+ for gt_i, proposals_i in zip(gt, proposals)
164
+ ]
165
+
166
+
167
+ def add_ground_truth_to_proposals_single_image(
168
+ gt: Union[Instances, Boxes], proposals: Instances
169
+ ) -> Instances:
170
+ """
171
+ Augment `proposals` with `gt`.
172
+
173
+ Args:
174
+ Same as `add_ground_truth_to_proposals`, but with gt and proposals
175
+ per image.
176
+
177
+ Returns:
178
+ Same as `add_ground_truth_to_proposals`, but for only one image.
179
+ """
180
+ if isinstance(gt, Boxes):
181
+ # convert Boxes to Instances
182
+ gt = Instances(proposals.image_size, gt_boxes=gt)
183
+
184
+ gt_boxes = gt.gt_boxes
185
+ device = proposals.objectness_logits.device
186
+ # Assign all ground-truth boxes an objectness logit corresponding to
187
+ # P(object) = sigmoid(logit) =~ 1.
188
+ gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10)))
189
+ gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device)
190
+
191
+ # Concatenating gt_boxes with proposals requires them to have the same fields
192
+ gt_proposal = Instances(proposals.image_size, **gt.get_fields())
193
+ gt_proposal.proposal_boxes = gt_boxes
194
+ gt_proposal.objectness_logits = gt_logits
195
+
196
+ for key in proposals.get_fields().keys():
197
+ assert gt_proposal.has(
198
+ key
199
+ ), "The attribute '{}' in `proposals` does not exist in `gt`".format(key)
200
+
201
+ # NOTE: Instances.cat only use fields from the first item. Extra fields in latter items
202
+ # will be thrown away.
203
+ new_proposals = Instances.cat([proposals, gt_proposal])
204
+
205
+ return new_proposals
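A quick numeric check (not part of the diff) of the objectness logit assigned to ground-truth boxes above: math.log((1 - 1e-10) / (1 - (1 - 1e-10))) is the inverse sigmoid (logit) of 1 - 1e-10, roughly 23.03, so applying the sigmoid to it gives a probability that is numerically ~1.

import math

p = 1.0 - 1e-10
gt_logit_value = math.log(p / (1.0 - p))
sigmoid = 1.0 / (1.0 + math.exp(-gt_logit_value))
print(round(gt_logit_value, 2), sigmoid)   # ~23.03  1.0 (up to float precision)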
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rpn.py ADDED
@@ -0,0 +1,533 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import Dict, List, Optional, Tuple, Union
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch import nn
6
+
7
+ from annotator.oneformer.detectron2.config import configurable
8
+ from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, cat
9
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
10
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
11
+ from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom
12
+ from annotator.oneformer.detectron2.utils.registry import Registry
13
+
14
+ from ..anchor_generator import build_anchor_generator
15
+ from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
16
+ from ..matcher import Matcher
17
+ from ..sampling import subsample_labels
18
+ from .build import PROPOSAL_GENERATOR_REGISTRY
19
+ from .proposal_utils import find_top_rpn_proposals
20
+
21
+ RPN_HEAD_REGISTRY = Registry("RPN_HEAD")
22
+ RPN_HEAD_REGISTRY.__doc__ = """
23
+ Registry for RPN heads, which take feature maps and perform
24
+ objectness classification and bounding box regression for anchors.
25
+
26
+ The registered object will be called with `obj(cfg, input_shape)`.
27
+ The call should return a `nn.Module` object.
28
+ """
29
+
30
+
31
+ """
32
+ Shape shorthand in this module:
33
+
34
+ N: number of images in the minibatch
35
+ L: number of feature maps per image on which RPN is run
36
+ A: number of cell anchors (must be the same for all feature maps)
37
+ Hi, Wi: height and width of the i-th feature map
38
+ B: size of the box parameterization
39
+
40
+ Naming convention:
41
+
42
+ objectness: refers to the binary classification of an anchor as object vs. not object.
43
+
44
+ deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
45
+ transform (see :class:`box_regression.Box2BoxTransform`), or 5d for rotated boxes.
46
+
47
+ pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use
48
+ sigmoid(pred_objectness_logits) to estimate P(object).
49
+
50
+ gt_labels: ground-truth binary classification labels for objectness
51
+
52
+ pred_anchor_deltas: predicted box2box transform deltas
53
+
54
+ gt_anchor_deltas: ground-truth box2box transform deltas
55
+ """
56
+
57
+
58
+ def build_rpn_head(cfg, input_shape):
59
+ """
60
+ Build an RPN head defined by `cfg.MODEL.RPN.HEAD_NAME`.
61
+ """
62
+ name = cfg.MODEL.RPN.HEAD_NAME
63
+ return RPN_HEAD_REGISTRY.get(name)(cfg, input_shape)
64
+
65
+
66
+ @RPN_HEAD_REGISTRY.register()
67
+ class StandardRPNHead(nn.Module):
68
+ """
69
+ Standard RPN classification and regression heads described in :paper:`Faster R-CNN`.
70
+ Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts
71
+ objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas
72
+ specifying how to deform each anchor into an object proposal.
73
+ """
74
+
75
+ @configurable
76
+ def __init__(
77
+ self, *, in_channels: int, num_anchors: int, box_dim: int = 4, conv_dims: List[int] = (-1,)
78
+ ):
79
+ """
80
+ NOTE: this interface is experimental.
81
+
82
+ Args:
83
+ in_channels (int): number of input feature channels. When using multiple
84
+ input features, they must have the same number of channels.
85
+ num_anchors (int): number of anchors to predict for *each spatial position*
86
+ on the feature map. The total number of anchors for each
87
+ feature map will be `num_anchors * H * W`.
88
+ box_dim (int): dimension of a box, which is also the number of box regression
89
+ predictions to make for each anchor. An axis aligned box has
90
+ box_dim=4, while a rotated box has box_dim=5.
91
+ conv_dims (list[int]): a list of integers representing the output channels
92
+ of N conv layers. Set it to -1 to use the same number of output channels
93
+ as input channels.
94
+ """
95
+ super().__init__()
96
+ cur_channels = in_channels
97
+ # Keeping the old variable names and structure for backwards compatibility.
98
+ # Otherwise the old checkpoints will fail to load.
99
+ if len(conv_dims) == 1:
100
+ out_channels = cur_channels if conv_dims[0] == -1 else conv_dims[0]
101
+ # 3x3 conv for the hidden representation
102
+ self.conv = self._get_rpn_conv(cur_channels, out_channels)
103
+ cur_channels = out_channels
104
+ else:
105
+ self.conv = nn.Sequential()
106
+ for k, conv_dim in enumerate(conv_dims):
107
+ out_channels = cur_channels if conv_dim == -1 else conv_dim
108
+ if out_channels <= 0:
109
+ raise ValueError(
110
+ f"Conv output channels should be greater than 0. Got {out_channels}"
111
+ )
112
+ conv = self._get_rpn_conv(cur_channels, out_channels)
113
+ self.conv.add_module(f"conv{k}", conv)
114
+ cur_channels = out_channels
115
+ # 1x1 conv for predicting objectness logits
116
+ self.objectness_logits = nn.Conv2d(cur_channels, num_anchors, kernel_size=1, stride=1)
117
+ # 1x1 conv for predicting box2box transform deltas
118
+ self.anchor_deltas = nn.Conv2d(cur_channels, num_anchors * box_dim, kernel_size=1, stride=1)
119
+
120
+ # Keeping the order of weight initialization the same for backwards compatibility.
121
+ for layer in self.modules():
122
+ if isinstance(layer, nn.Conv2d):
123
+ nn.init.normal_(layer.weight, std=0.01)
124
+ nn.init.constant_(layer.bias, 0)
125
+
126
+ def _get_rpn_conv(self, in_channels, out_channels):
127
+ return Conv2d(
128
+ in_channels,
129
+ out_channels,
130
+ kernel_size=3,
131
+ stride=1,
132
+ padding=1,
133
+ activation=nn.ReLU(),
134
+ )
135
+
136
+ @classmethod
137
+ def from_config(cls, cfg, input_shape):
138
+ # Standard RPN is shared across levels:
139
+ in_channels = [s.channels for s in input_shape]
140
+ assert len(set(in_channels)) == 1, "Each level must have the same channel!"
141
+ in_channels = in_channels[0]
142
+
143
+ # RPNHead should take the same input as anchor generator
144
+ # NOTE: it assumes that creating an anchor generator does not have unwanted side effect.
145
+ anchor_generator = build_anchor_generator(cfg, input_shape)
146
+ num_anchors = anchor_generator.num_anchors
147
+ box_dim = anchor_generator.box_dim
148
+ assert (
149
+ len(set(num_anchors)) == 1
150
+ ), "Each level must have the same number of anchors per spatial position"
151
+ return {
152
+ "in_channels": in_channels,
153
+ "num_anchors": num_anchors[0],
154
+ "box_dim": box_dim,
155
+ "conv_dims": cfg.MODEL.RPN.CONV_DIMS,
156
+ }
157
+
158
+ def forward(self, features: List[torch.Tensor]):
159
+ """
160
+ Args:
161
+ features (list[Tensor]): list of feature maps
162
+
163
+ Returns:
164
+ list[Tensor]: A list of L elements.
165
+ Element i is a tensor of shape (N, A, Hi, Wi) representing
166
+ the predicted objectness logits for all anchors. A is the number of cell anchors.
167
+ list[Tensor]: A list of L elements. Element i is a tensor of shape
168
+ (N, A*box_dim, Hi, Wi) representing the predicted "deltas" used to transform anchors
169
+ to proposals.
170
+ """
171
+ pred_objectness_logits = []
172
+ pred_anchor_deltas = []
173
+ for x in features:
174
+ t = self.conv(x)
175
+ pred_objectness_logits.append(self.objectness_logits(t))
176
+ pred_anchor_deltas.append(self.anchor_deltas(t))
177
+ return pred_objectness_logits, pred_anchor_deltas
178
+
179
+
180
+ @PROPOSAL_GENERATOR_REGISTRY.register()
181
+ class RPN(nn.Module):
182
+ """
183
+ Region Proposal Network, introduced by :paper:`Faster R-CNN`.
184
+ """
185
+
186
+ @configurable
187
+ def __init__(
188
+ self,
189
+ *,
190
+ in_features: List[str],
191
+ head: nn.Module,
192
+ anchor_generator: nn.Module,
193
+ anchor_matcher: Matcher,
194
+ box2box_transform: Box2BoxTransform,
195
+ batch_size_per_image: int,
196
+ positive_fraction: float,
197
+ pre_nms_topk: Tuple[float, float],
198
+ post_nms_topk: Tuple[float, float],
199
+ nms_thresh: float = 0.7,
200
+ min_box_size: float = 0.0,
201
+ anchor_boundary_thresh: float = -1.0,
202
+ loss_weight: Union[float, Dict[str, float]] = 1.0,
203
+ box_reg_loss_type: str = "smooth_l1",
204
+ smooth_l1_beta: float = 0.0,
205
+ ):
206
+ """
207
+ NOTE: this interface is experimental.
208
+
209
+ Args:
210
+ in_features (list[str]): list of names of input features to use
211
+ head (nn.Module): a module that predicts logits and regression deltas
212
+ for each level from a list of per-level features
213
+ anchor_generator (nn.Module): a module that creates anchors from a
214
+ list of features. Usually an instance of :class:`AnchorGenerator`
215
+ anchor_matcher (Matcher): label the anchors by matching them with ground truth.
216
+ box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to
217
+ instance boxes
218
+ batch_size_per_image (int): number of anchors per image to sample for training
219
+ positive_fraction (float): fraction of foreground anchors to sample for training
220
+ pre_nms_topk (tuple[float]): (train, test) that represents the
221
+ number of top k proposals to select before NMS, in
222
+ training and testing.
223
+ post_nms_topk (tuple[float]): (train, test) that represents the
224
+ number of top k proposals to select after NMS, in
225
+ training and testing.
226
+ nms_thresh (float): NMS threshold used to de-duplicate the predicted proposals
227
+ min_box_size (float): remove proposal boxes with any side smaller than this threshold,
228
+ in the unit of input image pixels
229
+ anchor_boundary_thresh (float): legacy option
230
+ loss_weight (float|dict): weights to use for losses. Can be single float for weighting
231
+ all rpn losses together, or a dict of individual weightings. Valid dict keys are:
232
+ "loss_rpn_cls" - applied to classification loss
233
+ "loss_rpn_loc" - applied to box regression loss
234
+ box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou".
235
+ smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
236
+ use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
237
+ """
238
+ super().__init__()
239
+ self.in_features = in_features
240
+ self.rpn_head = head
241
+ self.anchor_generator = anchor_generator
242
+ self.anchor_matcher = anchor_matcher
243
+ self.box2box_transform = box2box_transform
244
+ self.batch_size_per_image = batch_size_per_image
245
+ self.positive_fraction = positive_fraction
246
+ # Map from self.training state to train/test settings
247
+ self.pre_nms_topk = {True: pre_nms_topk[0], False: pre_nms_topk[1]}
248
+ self.post_nms_topk = {True: post_nms_topk[0], False: post_nms_topk[1]}
249
+ self.nms_thresh = nms_thresh
250
+ self.min_box_size = float(min_box_size)
251
+ self.anchor_boundary_thresh = anchor_boundary_thresh
252
+ if isinstance(loss_weight, float):
253
+ loss_weight = {"loss_rpn_cls": loss_weight, "loss_rpn_loc": loss_weight}
254
+ self.loss_weight = loss_weight
255
+ self.box_reg_loss_type = box_reg_loss_type
256
+ self.smooth_l1_beta = smooth_l1_beta
257
+
258
+ @classmethod
259
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
260
+ in_features = cfg.MODEL.RPN.IN_FEATURES
261
+ ret = {
262
+ "in_features": in_features,
263
+ "min_box_size": cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE,
264
+ "nms_thresh": cfg.MODEL.RPN.NMS_THRESH,
265
+ "batch_size_per_image": cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE,
266
+ "positive_fraction": cfg.MODEL.RPN.POSITIVE_FRACTION,
267
+ "loss_weight": {
268
+ "loss_rpn_cls": cfg.MODEL.RPN.LOSS_WEIGHT,
269
+ "loss_rpn_loc": cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT,
270
+ },
271
+ "anchor_boundary_thresh": cfg.MODEL.RPN.BOUNDARY_THRESH,
272
+ "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS),
273
+ "box_reg_loss_type": cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE,
274
+ "smooth_l1_beta": cfg.MODEL.RPN.SMOOTH_L1_BETA,
275
+ }
276
+
277
+ ret["pre_nms_topk"] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, cfg.MODEL.RPN.PRE_NMS_TOPK_TEST)
278
+ ret["post_nms_topk"] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, cfg.MODEL.RPN.POST_NMS_TOPK_TEST)
279
+
280
+ ret["anchor_generator"] = build_anchor_generator(cfg, [input_shape[f] for f in in_features])
281
+ ret["anchor_matcher"] = Matcher(
282
+ cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True
283
+ )
284
+ ret["head"] = build_rpn_head(cfg, [input_shape[f] for f in in_features])
285
+ return ret
286
+
287
+ def _subsample_labels(self, label):
288
+ """
289
+ Randomly sample a subset of positive and negative examples, and overwrite
290
+ the label vector to the ignore value (-1) for all elements that are not
291
+ included in the sample.
292
+
293
+ Args:
294
+ labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
295
+ """
296
+ pos_idx, neg_idx = subsample_labels(
297
+ label, self.batch_size_per_image, self.positive_fraction, 0
298
+ )
299
+ # Fill with the ignore label (-1), then set positive and negative labels
300
+ label.fill_(-1)
301
+ label.scatter_(0, pos_idx, 1)
302
+ label.scatter_(0, neg_idx, 0)
303
+ return label
304
+
305
+ @torch.jit.unused
306
+ @torch.no_grad()
307
+ def label_and_sample_anchors(
308
+ self, anchors: List[Boxes], gt_instances: List[Instances]
309
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
310
+ """
311
+ Args:
312
+ anchors (list[Boxes]): anchors for each feature map.
313
+ gt_instances: the ground-truth instances for each image.
314
+
315
+ Returns:
316
+ list[Tensor]:
317
+ List of #img tensors. i-th element is a vector of labels whose length is
318
+ the total number of anchors across all feature maps R = sum(Hi * Wi * A).
319
+ Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative
320
+ class; 1 = positive class.
321
+ list[Tensor]:
322
+ i-th element is a Rx4 tensor. The values are the matched gt boxes for each
323
+ anchor. Values are undefined for those anchors not labeled as 1.
324
+ """
325
+ anchors = Boxes.cat(anchors)
326
+
327
+ gt_boxes = [x.gt_boxes for x in gt_instances]
328
+ image_sizes = [x.image_size for x in gt_instances]
329
+ del gt_instances
330
+
331
+ gt_labels = []
332
+ matched_gt_boxes = []
333
+ for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes):
334
+ """
335
+ image_size_i: (h, w) for the i-th image
336
+ gt_boxes_i: ground-truth boxes for i-th image
337
+ """
338
+
339
+ match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors)
340
+ matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
341
+ # Matching is memory-expensive and may result in CPU tensors. But the result is small
342
+ gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
343
+ del match_quality_matrix
344
+
345
+ if self.anchor_boundary_thresh >= 0:
346
+ # Discard anchors that go out of the boundaries of the image
347
+ # NOTE: This is legacy functionality that is turned off by default in Detectron2
348
+ anchors_inside_image = anchors.inside_box(image_size_i, self.anchor_boundary_thresh)
349
+ gt_labels_i[~anchors_inside_image] = -1
350
+
351
+ # A vector of labels (-1, 0, 1) for each anchor
352
+ gt_labels_i = self._subsample_labels(gt_labels_i)
353
+
354
+ if len(gt_boxes_i) == 0:
355
+ # These values won't be used anyway since the anchor is labeled as background
356
+ matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
357
+ else:
358
+ # TODO wasted indexing computation for ignored boxes
359
+ matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
360
+
361
+ gt_labels.append(gt_labels_i) # N,AHW
362
+ matched_gt_boxes.append(matched_gt_boxes_i)
363
+ return gt_labels, matched_gt_boxes
364
+
365
+ @torch.jit.unused
366
+ def losses(
367
+ self,
368
+ anchors: List[Boxes],
369
+ pred_objectness_logits: List[torch.Tensor],
370
+ gt_labels: List[torch.Tensor],
371
+ pred_anchor_deltas: List[torch.Tensor],
372
+ gt_boxes: List[torch.Tensor],
373
+ ) -> Dict[str, torch.Tensor]:
374
+ """
375
+ Return the losses from a set of RPN predictions and their associated ground-truth.
376
+
377
+ Args:
378
+ anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
379
+ has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
380
+ pred_objectness_logits (list[Tensor]): A list of L elements.
381
+ Element i is a tensor of shape (N, Hi*Wi*A) representing
382
+ the predicted objectness logits for all anchors.
383
+ gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
384
+ pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
385
+ (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors
386
+ to proposals.
387
+ gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
388
+
389
+ Returns:
390
+ dict[loss name -> loss value]: A dict mapping from loss name to loss value.
391
+ Loss names are: `loss_rpn_cls` for objectness classification and
392
+ `loss_rpn_loc` for proposal localization.
393
+ """
394
+ num_images = len(gt_labels)
395
+ gt_labels = torch.stack(gt_labels) # (N, sum(Hi*Wi*Ai))
396
+
397
+ # Log the number of positive/negative anchors per-image that's used in training
398
+ pos_mask = gt_labels == 1
399
+ num_pos_anchors = pos_mask.sum().item()
400
+ num_neg_anchors = (gt_labels == 0).sum().item()
401
+ storage = get_event_storage()
402
+ storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
403
+ storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)
404
+
405
+ localization_loss = _dense_box_regression_loss(
406
+ anchors,
407
+ self.box2box_transform,
408
+ pred_anchor_deltas,
409
+ gt_boxes,
410
+ pos_mask,
411
+ box_reg_loss_type=self.box_reg_loss_type,
412
+ smooth_l1_beta=self.smooth_l1_beta,
413
+ )
414
+
415
+ valid_mask = gt_labels >= 0
416
+ objectness_loss = F.binary_cross_entropy_with_logits(
417
+ cat(pred_objectness_logits, dim=1)[valid_mask],
418
+ gt_labels[valid_mask].to(torch.float32),
419
+ reduction="sum",
420
+ )
421
+ normalizer = self.batch_size_per_image * num_images
422
+ losses = {
423
+ "loss_rpn_cls": objectness_loss / normalizer,
424
+ # The original Faster R-CNN paper uses a slightly different normalizer
425
+ # for loc loss. But it doesn't matter in practice
426
+ "loss_rpn_loc": localization_loss / normalizer,
427
+ }
428
+ losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
429
+ return losses
430
+
431
+ def forward(
432
+ self,
433
+ images: ImageList,
434
+ features: Dict[str, torch.Tensor],
435
+ gt_instances: Optional[List[Instances]] = None,
436
+ ):
437
+ """
438
+ Args:
439
+ images (ImageList): input images of length `N`
440
+ features (dict[str, Tensor]): input data as a mapping from feature
441
+ map name to tensor. Axis 0 represents the number of images `N` in
442
+ the input data; axes 1-3 are channels, height, and width, which may
443
+ vary between feature maps (e.g., if a feature pyramid is used).
444
+ gt_instances (list[Instances], optional): a length `N` list of `Instances`s.
445
+ Each `Instances` stores ground-truth instances for the corresponding image.
446
+
447
+ Returns:
448
+ proposals: list[Instances]: contains fields "proposal_boxes", "objectness_logits"
449
+ loss: dict[Tensor] or None
450
+ """
451
+ features = [features[f] for f in self.in_features]
452
+ anchors = self.anchor_generator(features)
453
+
454
+ pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features)
455
+ # Transpose the Hi*Wi*A dimension to the middle:
456
+ pred_objectness_logits = [
457
+ # (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N, Hi*Wi*A)
458
+ score.permute(0, 2, 3, 1).flatten(1)
459
+ for score in pred_objectness_logits
460
+ ]
461
+ pred_anchor_deltas = [
462
+ # (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B) -> (N, Hi*Wi*A, B)
463
+ x.view(x.shape[0], -1, self.anchor_generator.box_dim, x.shape[-2], x.shape[-1])
464
+ .permute(0, 3, 4, 1, 2)
465
+ .flatten(1, -2)
466
+ for x in pred_anchor_deltas
467
+ ]
468
+
469
+ if self.training:
470
+ assert gt_instances is not None, "RPN requires gt_instances in training!"
471
+ gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances)
472
+ losses = self.losses(
473
+ anchors, pred_objectness_logits, gt_labels, pred_anchor_deltas, gt_boxes
474
+ )
475
+ else:
476
+ losses = {}
477
+ proposals = self.predict_proposals(
478
+ anchors, pred_objectness_logits, pred_anchor_deltas, images.image_sizes
479
+ )
480
+ return proposals, losses
481
+
482
+ def predict_proposals(
483
+ self,
484
+ anchors: List[Boxes],
485
+ pred_objectness_logits: List[torch.Tensor],
486
+ pred_anchor_deltas: List[torch.Tensor],
487
+ image_sizes: List[Tuple[int, int]],
488
+ ):
489
+ """
490
+ Decode all the predicted box regression deltas to proposals. Find the top proposals
491
+ by applying NMS and removing boxes that are too small.
492
+
493
+ Returns:
494
+ proposals (list[Instances]): list of N Instances. The i-th Instances
495
+ stores post_nms_topk object proposals for image i, sorted by their
496
+ objectness score in descending order.
497
+ """
498
+ # The proposals are treated as fixed for joint training with roi heads.
499
+ # This approach ignores the derivative w.r.t. the proposal boxes' coordinates that
500
+ # are also network responses.
501
+ with torch.no_grad():
502
+ pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
503
+ return find_top_rpn_proposals(
504
+ pred_proposals,
505
+ pred_objectness_logits,
506
+ image_sizes,
507
+ self.nms_thresh,
508
+ self.pre_nms_topk[self.training],
509
+ self.post_nms_topk[self.training],
510
+ self.min_box_size,
511
+ self.training,
512
+ )
513
+
514
+ def _decode_proposals(self, anchors: List[Boxes], pred_anchor_deltas: List[torch.Tensor]):
515
+ """
516
+ Transform anchors into proposals by applying the predicted anchor deltas.
517
+
518
+ Returns:
519
+ proposals (list[Tensor]): A list of L tensors. Tensor i has shape
520
+ (N, Hi*Wi*A, B)
521
+ """
522
+ N = pred_anchor_deltas[0].shape[0]
523
+ proposals = []
524
+ # For each feature map
525
+ for anchors_i, pred_anchor_deltas_i in zip(anchors, pred_anchor_deltas):
526
+ B = anchors_i.tensor.size(1)
527
+ pred_anchor_deltas_i = pred_anchor_deltas_i.reshape(-1, B)
528
+ # Expand anchors to shape (N*Hi*Wi*A, B)
529
+ anchors_i = anchors_i.tensor.unsqueeze(0).expand(N, -1, -1).reshape(-1, B)
530
+ proposals_i = self.box2box_transform.apply_deltas(pred_anchor_deltas_i, anchors_i)
531
+ # Append feature map proposals with shape (N, Hi*Wi*A, B)
532
+ proposals.append(proposals_i.view(N, -1, B))
533
+ return proposals
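A shape-bookkeeping sketch (not part of the diff) for the reshapes in RPN.forward: objectness logits go from (N, A, Hi, Wi) to (N, Hi*Wi*A), and anchor deltas from (N, A*B, Hi, Wi) to (N, Hi*Wi*A, B), the layout expected by RPN.losses and find_top_rpn_proposals.

import torch

N, A, B, Hi, Wi = 2, 3, 4, 8, 10
score = torch.randn(N, A, Hi, Wi)
delta = torch.randn(N, A * B, Hi, Wi)

score_flat = score.permute(0, 2, 3, 1).flatten(1)   # (N, Hi*Wi*A)
delta_flat = (
    delta.view(N, A, B, Hi, Wi)   # the module uses -1 in place of A
    .permute(0, 3, 4, 1, 2)       # (N, Hi, Wi, A, B)
    .flatten(1, -2)               # (N, Hi*Wi*A, B)
)
print(score_flat.shape, delta_flat.shape)   # torch.Size([2, 240]) torch.Size([2, 240, 4])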
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rrpn.py ADDED
@@ -0,0 +1,209 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import itertools
3
+ import logging
4
+ from typing import Dict, List
5
+ import torch
6
+
7
+ from annotator.oneformer.detectron2.config import configurable
8
+ from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms_rotated, cat
9
+ from annotator.oneformer.detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
10
+ from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom
11
+
12
+ from ..box_regression import Box2BoxTransformRotated
13
+ from .build import PROPOSAL_GENERATOR_REGISTRY
14
+ from .proposal_utils import _is_tracing
15
+ from .rpn import RPN
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def find_top_rrpn_proposals(
21
+ proposals,
22
+ pred_objectness_logits,
23
+ image_sizes,
24
+ nms_thresh,
25
+ pre_nms_topk,
26
+ post_nms_topk,
27
+ min_box_size,
28
+ training,
29
+ ):
30
+ """
31
+ For each feature map, select the `pre_nms_topk` highest scoring proposals,
32
+ apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
33
+ highest scoring proposals among all the feature maps if `training` is True,
34
+ otherwise, returns the highest `post_nms_topk` scoring proposals for each
35
+ feature map.
36
+
37
+ Args:
38
+ proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
39
+ All proposal predictions on the feature maps.
40
+ pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
41
+ image_sizes (list[tuple]): sizes (h, w) for each image
42
+ nms_thresh (float): IoU threshold to use for NMS
43
+ pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
44
+ When RRPN is run on multiple feature maps (as in FPN) this number is per
45
+ feature map.
46
+ post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
47
+ When RRPN is run on multiple feature maps (as in FPN) this number is total,
48
+ over all feature maps.
49
+ min_box_size(float): minimum proposal box side length in pixels (absolute units wrt
50
+ input images).
51
+ training (bool): True if proposals are to be used in training, otherwise False.
52
+ This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
53
+ comment.
54
+
55
+ Returns:
56
+ proposals (list[Instances]): list of N Instances. The i-th Instances
57
+ stores post_nms_topk object proposals for image i.
58
+ """
59
+ num_images = len(image_sizes)
60
+ device = proposals[0].device
61
+
62
+ # 1. Select top-k anchor for every level and every image
63
+ topk_scores = [] # #lvl Tensor, each of shape N x topk
64
+ topk_proposals = []
65
+ level_ids = [] # #lvl Tensor, each of shape (topk,)
66
+ batch_idx = torch.arange(num_images, device=device)
67
+ for level_id, proposals_i, logits_i in zip(
68
+ itertools.count(), proposals, pred_objectness_logits
69
+ ):
70
+ Hi_Wi_A = logits_i.shape[1]
71
+ if isinstance(Hi_Wi_A, torch.Tensor): # it's a tensor in tracing
72
+ num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
73
+ else:
74
+ num_proposals_i = min(Hi_Wi_A, pre_nms_topk)
75
+
76
+ topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
77
+
78
+ # each is N x topk
79
+ topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 5
80
+
81
+ topk_proposals.append(topk_proposals_i)
82
+ topk_scores.append(topk_scores_i)
83
+ level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))
84
+
85
+ # 2. Concat all levels together
86
+ topk_scores = cat(topk_scores, dim=1)
87
+ topk_proposals = cat(topk_proposals, dim=1)
88
+ level_ids = cat(level_ids, dim=0)
89
+
90
+ # 3. For each image, run a per-level NMS, and choose topk results.
91
+ results = []
92
+ for n, image_size in enumerate(image_sizes):
93
+ boxes = RotatedBoxes(topk_proposals[n])
94
+ scores_per_img = topk_scores[n]
95
+ lvl = level_ids
96
+
97
+ valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
98
+ if not valid_mask.all():
99
+ if training:
100
+ raise FloatingPointError(
101
+ "Predicted boxes or scores contain Inf/NaN. Training has diverged."
102
+ )
103
+ boxes = boxes[valid_mask]
104
+ scores_per_img = scores_per_img[valid_mask]
105
+ lvl = lvl[valid_mask]
106
+ boxes.clip(image_size)
107
+
108
+ # filter empty boxes
109
+ keep = boxes.nonempty(threshold=min_box_size)
110
+ if _is_tracing() or keep.sum().item() != len(boxes):
111
+ boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], lvl[keep])
112
+
113
+ keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)
114
+ # In Detectron1, there was different behavior during training vs. testing.
115
+ # (https://github.com/facebookresearch/Detectron/issues/459)
116
+ # During training, topk is over the proposals from *all* images in the training batch.
117
+ # During testing, it is over the proposals for each image separately.
118
+ # As a result, the training behavior becomes batch-dependent,
119
+ # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
120
+ # This bug is addressed in Detectron2 to make the behavior independent of batch size.
121
+ keep = keep[:post_nms_topk]
122
+
123
+ res = Instances(image_size)
124
+ res.proposal_boxes = boxes[keep]
125
+ res.objectness_logits = scores_per_img[keep]
126
+ results.append(res)
127
+ return results
128
+
129
+
130
+ @PROPOSAL_GENERATOR_REGISTRY.register()
131
+ class RRPN(RPN):
132
+ """
133
+ Rotated Region Proposal Network described in :paper:`RRPN`.
134
+ """
135
+
136
+ @configurable
137
+ def __init__(self, *args, **kwargs):
138
+ super().__init__(*args, **kwargs)
139
+ if self.anchor_boundary_thresh >= 0:
140
+ raise NotImplementedError(
141
+ "anchor_boundary_thresh is a legacy option not implemented for RRPN."
142
+ )
143
+
144
+ @classmethod
145
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
146
+ ret = super().from_config(cfg, input_shape)
147
+ ret["box2box_transform"] = Box2BoxTransformRotated(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
148
+ return ret
149
+
150
+ @torch.no_grad()
151
+ def label_and_sample_anchors(self, anchors: List[RotatedBoxes], gt_instances: List[Instances]):
152
+ """
153
+ Args:
154
+ anchors (list[RotatedBoxes]): anchors for each feature map.
155
+ gt_instances: the ground-truth instances for each image.
156
+
157
+ Returns:
158
+ list[Tensor]:
159
+ List of #img tensors. i-th element is a vector of labels whose length is
160
+ the total number of anchors across feature maps. Label values are in {-1, 0, 1},
161
+ with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
162
+ list[Tensor]:
163
+ i-th element is a Nx5 tensor, where N is the total number of anchors across
164
+ feature maps. The values are the matched gt boxes for each anchor.
165
+ Values are undefined for those anchors not labeled as 1.
166
+ """
167
+ anchors = RotatedBoxes.cat(anchors)
168
+
169
+ gt_boxes = [x.gt_boxes for x in gt_instances]
170
+ del gt_instances
171
+
172
+ gt_labels = []
173
+ matched_gt_boxes = []
174
+ for gt_boxes_i in gt_boxes:
175
+ """
176
+ gt_boxes_i: ground-truth boxes for i-th image
177
+ """
178
+ match_quality_matrix = retry_if_cuda_oom(pairwise_iou_rotated)(gt_boxes_i, anchors)
179
+ matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
180
+ # Matching is memory-expensive and may result in CPU tensors. But the result is small
181
+ gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
182
+
183
+ # A vector of labels (-1, 0, 1) for each anchor
184
+ gt_labels_i = self._subsample_labels(gt_labels_i)
185
+
186
+ if len(gt_boxes_i) == 0:
187
+ # These values won't be used anyway since the anchor is labeled as background
188
+ matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
189
+ else:
190
+ # TODO wasted indexing computation for ignored boxes
191
+ matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
192
+
193
+ gt_labels.append(gt_labels_i) # N,AHW
194
+ matched_gt_boxes.append(matched_gt_boxes_i)
195
+ return gt_labels, matched_gt_boxes
196
+
197
+ @torch.no_grad()
198
+ def predict_proposals(self, anchors, pred_objectness_logits, pred_anchor_deltas, image_sizes):
199
+ pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
200
+ return find_top_rrpn_proposals(
201
+ pred_proposals,
202
+ pred_objectness_logits,
203
+ image_sizes,
204
+ self.nms_thresh,
205
+ self.pre_nms_topk[self.training],
206
+ self.post_nms_topk[self.training],
207
+ self.min_box_size,
208
+ self.training,
209
+ )
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/__init__.py ADDED
@@ -0,0 +1,29 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head, FastRCNNConvFCHead
+ from .keypoint_head import (
+ ROI_KEYPOINT_HEAD_REGISTRY,
+ build_keypoint_head,
+ BaseKeypointRCNNHead,
+ KRCNNConvDeconvUpsampleHead,
+ )
+ from .mask_head import (
+ ROI_MASK_HEAD_REGISTRY,
+ build_mask_head,
+ BaseMaskRCNNHead,
+ MaskRCNNConvUpsampleHead,
+ )
+ from .roi_heads import (
+ ROI_HEADS_REGISTRY,
+ ROIHeads,
+ Res5ROIHeads,
+ StandardROIHeads,
+ build_roi_heads,
+ select_foreground_proposals,
+ )
+ from .cascade_rcnn import CascadeROIHeads
+ from .rotated_fast_rcnn import RROIHeads
+ from .fast_rcnn import FastRCNNOutputLayers
+
+ from . import cascade_rcnn # isort:skip
+
+ __all__ = list(globals().keys())
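This package __init__ re-exports the ROI-head registry and builders, so a custom head can be selected by name through the config. A hypothetical sketch of that registry pattern, not part of this commit; the class name MyROIHeads and the commented cfg usage are illustrative assumptions:

from annotator.oneformer.detectron2.modeling.roi_heads import (
    ROI_HEADS_REGISTRY,
    StandardROIHeads,
    build_roi_heads,
)

@ROI_HEADS_REGISTRY.register()
class MyROIHeads(StandardROIHeads):
    # Hypothetical subclass; a real head would override its losses or forward logic.
    pass

# With a detectron2-style config, selecting the head is then just:
#   cfg.MODEL.ROI_HEADS.NAME = "MyROIHeads"
#   roi_heads = build_roi_heads(cfg, backbone.output_shape())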
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/box_head.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import numpy as np
3
+ from typing import List
4
+ import fvcore.nn.weight_init as weight_init
5
+ import torch
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
10
+ from annotator.oneformer.detectron2.utils.registry import Registry
11
+
12
+ __all__ = ["FastRCNNConvFCHead", "build_box_head", "ROI_BOX_HEAD_REGISTRY"]
13
+
14
+ ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD")
15
+ ROI_BOX_HEAD_REGISTRY.__doc__ = """
16
+ Registry for box heads, which make box predictions from per-region features.
17
+
18
+ The registered object will be called with `obj(cfg, input_shape)`.
19
+ """
20
+
21
+
22
+ # To get torchscript support, we make the head a subclass of `nn.Sequential`.
23
+ # Therefore, to add new layers in this head class, please make sure they are
24
+ # added in the order they will be used in forward().
25
+ @ROI_BOX_HEAD_REGISTRY.register()
26
+ class FastRCNNConvFCHead(nn.Sequential):
27
+ """
28
+ A head with several 3x3 conv layers (each followed by norm & relu) and then
29
+ several fc layers (each followed by relu).
30
+ """
31
+
32
+ @configurable
33
+ def __init__(
34
+ self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm=""
35
+ ):
36
+ """
37
+ NOTE: this interface is experimental.
38
+
39
+ Args:
40
+ input_shape (ShapeSpec): shape of the input feature.
41
+ conv_dims (list[int]): the output dimensions of the conv layers
42
+ fc_dims (list[int]): the output dimensions of the fc layers
43
+ conv_norm (str or callable): normalization for the conv layers.
44
+ See :func:`detectron2.layers.get_norm` for supported types.
45
+ """
46
+ super().__init__()
47
+ assert len(conv_dims) + len(fc_dims) > 0
48
+
49
+ self._output_size = (input_shape.channels, input_shape.height, input_shape.width)
50
+
51
+ self.conv_norm_relus = []
52
+ for k, conv_dim in enumerate(conv_dims):
53
+ conv = Conv2d(
54
+ self._output_size[0],
55
+ conv_dim,
56
+ kernel_size=3,
57
+ padding=1,
58
+ bias=not conv_norm,
59
+ norm=get_norm(conv_norm, conv_dim),
60
+ activation=nn.ReLU(),
61
+ )
62
+ self.add_module("conv{}".format(k + 1), conv)
63
+ self.conv_norm_relus.append(conv)
64
+ self._output_size = (conv_dim, self._output_size[1], self._output_size[2])
65
+
66
+ self.fcs = []
67
+ for k, fc_dim in enumerate(fc_dims):
68
+ if k == 0:
69
+ self.add_module("flatten", nn.Flatten())
70
+ fc = nn.Linear(int(np.prod(self._output_size)), fc_dim)
71
+ self.add_module("fc{}".format(k + 1), fc)
72
+ self.add_module("fc_relu{}".format(k + 1), nn.ReLU())
73
+ self.fcs.append(fc)
74
+ self._output_size = fc_dim
75
+
76
+ for layer in self.conv_norm_relus:
77
+ weight_init.c2_msra_fill(layer)
78
+ for layer in self.fcs:
79
+ weight_init.c2_xavier_fill(layer)
80
+
81
+ @classmethod
82
+ def from_config(cls, cfg, input_shape):
83
+ num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV
84
+ conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM
85
+ num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC
86
+ fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM
87
+ return {
88
+ "input_shape": input_shape,
89
+ "conv_dims": [conv_dim] * num_conv,
90
+ "fc_dims": [fc_dim] * num_fc,
91
+ "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM,
92
+ }
93
+
94
+ def forward(self, x):
95
+ for layer in self:
96
+ x = layer(x)
97
+ return x
98
+
99
+ @property
100
+ @torch.jit.unused
101
+ def output_shape(self):
102
+ """
103
+ Returns:
104
+ ShapeSpec: the output feature shape
105
+ """
106
+ o = self._output_size
107
+ if isinstance(o, int):
108
+ return ShapeSpec(channels=o)
109
+ else:
110
+ return ShapeSpec(channels=o[0], height=o[1], width=o[2])
111
+
112
+
113
+ def build_box_head(cfg, input_shape):
114
+ """
115
+ Build a box head defined by `cfg.MODEL.ROI_BOX_HEAD.NAME`.
116
+ """
117
+ name = cfg.MODEL.ROI_BOX_HEAD.NAME
118
+ return ROI_BOX_HEAD_REGISTRY.get(name)(cfg, input_shape)
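A small sanity-check sketch for FastRCNNConvFCHead, not part of this commit: the head can be built directly with explicit arguments (bypassing the cfg path), and the conv/fc dimensions below are illustrative values, not settings taken from this repository's configs.

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.box_head import FastRCNNConvFCHead

head = FastRCNNConvFCHead(
    ShapeSpec(channels=256, height=7, width=7),  # pooled RoI feature shape
    conv_dims=[256, 256],                        # two 3x3 conv layers
    fc_dims=[1024],                              # one fc layer
)
x = torch.randn(8, 256, 7, 7)                    # a batch of 8 RoIs
out = head(x)
print(out.shape)          # torch.Size([8, 1024])
print(head.output_shape)  # ShapeSpec(channels=1024, ...)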
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/cascade_rcnn.py ADDED
@@ -0,0 +1,299 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import List
3
+ import torch
4
+ from torch import nn
5
+ from torch.autograd.function import Function
6
+
7
+ from annotator.oneformer.detectron2.config import configurable
8
+ from annotator.oneformer.detectron2.layers import ShapeSpec
9
+ from annotator.oneformer.detectron2.structures import Boxes, Instances, pairwise_iou
10
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
11
+
12
+ from ..box_regression import Box2BoxTransform
13
+ from ..matcher import Matcher
14
+ from ..poolers import ROIPooler
15
+ from .box_head import build_box_head
16
+ from .fast_rcnn import FastRCNNOutputLayers, fast_rcnn_inference
17
+ from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
18
+
19
+
20
+ class _ScaleGradient(Function):
21
+ @staticmethod
22
+ def forward(ctx, input, scale):
23
+ ctx.scale = scale
24
+ return input
25
+
26
+ @staticmethod
27
+ def backward(ctx, grad_output):
28
+ return grad_output * ctx.scale, None
29
+
30
+
31
+ @ROI_HEADS_REGISTRY.register()
32
+ class CascadeROIHeads(StandardROIHeads):
33
+ """
34
+ The ROI heads that implement :paper:`Cascade R-CNN`.
35
+ """
36
+
37
+ @configurable
38
+ def __init__(
39
+ self,
40
+ *,
41
+ box_in_features: List[str],
42
+ box_pooler: ROIPooler,
43
+ box_heads: List[nn.Module],
44
+ box_predictors: List[nn.Module],
45
+ proposal_matchers: List[Matcher],
46
+ **kwargs,
47
+ ):
48
+ """
49
+ NOTE: this interface is experimental.
50
+
51
+ Args:
52
+ box_pooler (ROIPooler): pooler that extracts region features from given boxes
53
+ box_heads (list[nn.Module]): box head for each cascade stage
54
+ box_predictors (list[nn.Module]): box predictor for each cascade stage
55
+ proposal_matchers (list[Matcher]): matcher with different IoU thresholds to
56
+ match boxes with ground truth for each stage. The first matcher matches
57
+ RPN proposals with ground truth, the other matchers use boxes predicted
58
+ by the previous stage as proposals and match them with ground truth.
59
+ """
60
+ assert "proposal_matcher" not in kwargs, (
61
+ "CascadeROIHeads takes 'proposal_matchers=' for each stage instead "
62
+ "of one 'proposal_matcher='."
63
+ )
64
+ # The first matcher matches RPN proposals with ground truth, done in the base class
65
+ kwargs["proposal_matcher"] = proposal_matchers[0]
66
+ num_stages = self.num_cascade_stages = len(box_heads)
67
+ box_heads = nn.ModuleList(box_heads)
68
+ box_predictors = nn.ModuleList(box_predictors)
69
+ assert len(box_predictors) == num_stages, f"{len(box_predictors)} != {num_stages}!"
70
+ assert len(proposal_matchers) == num_stages, f"{len(proposal_matchers)} != {num_stages}!"
71
+ super().__init__(
72
+ box_in_features=box_in_features,
73
+ box_pooler=box_pooler,
74
+ box_head=box_heads,
75
+ box_predictor=box_predictors,
76
+ **kwargs,
77
+ )
78
+ self.proposal_matchers = proposal_matchers
79
+
80
+ @classmethod
81
+ def from_config(cls, cfg, input_shape):
82
+ ret = super().from_config(cfg, input_shape)
83
+ ret.pop("proposal_matcher")
84
+ return ret
85
+
86
+ @classmethod
87
+ def _init_box_head(cls, cfg, input_shape):
88
+ # fmt: off
89
+ in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
90
+ pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
91
+ pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features)
92
+ sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
93
+ pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
94
+ cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
95
+ cascade_ious = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
96
+ assert len(cascade_bbox_reg_weights) == len(cascade_ious)
97
+ assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG, \
98
+ "CascadeROIHeads only support class-agnostic regression now!"
99
+ assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
100
+ # fmt: on
101
+
102
+ in_channels = [input_shape[f].channels for f in in_features]
103
+ # Check all channel counts are equal
104
+ assert len(set(in_channels)) == 1, in_channels
105
+ in_channels = in_channels[0]
106
+
107
+ box_pooler = ROIPooler(
108
+ output_size=pooler_resolution,
109
+ scales=pooler_scales,
110
+ sampling_ratio=sampling_ratio,
111
+ pooler_type=pooler_type,
112
+ )
113
+ pooled_shape = ShapeSpec(
114
+ channels=in_channels, width=pooler_resolution, height=pooler_resolution
115
+ )
116
+
117
+ box_heads, box_predictors, proposal_matchers = [], [], []
118
+ for match_iou, bbox_reg_weights in zip(cascade_ious, cascade_bbox_reg_weights):
119
+ box_head = build_box_head(cfg, pooled_shape)
120
+ box_heads.append(box_head)
121
+ box_predictors.append(
122
+ FastRCNNOutputLayers(
123
+ cfg,
124
+ box_head.output_shape,
125
+ box2box_transform=Box2BoxTransform(weights=bbox_reg_weights),
126
+ )
127
+ )
128
+ proposal_matchers.append(Matcher([match_iou], [0, 1], allow_low_quality_matches=False))
129
+ return {
130
+ "box_in_features": in_features,
131
+ "box_pooler": box_pooler,
132
+ "box_heads": box_heads,
133
+ "box_predictors": box_predictors,
134
+ "proposal_matchers": proposal_matchers,
135
+ }
136
+
137
+ def forward(self, images, features, proposals, targets=None):
138
+ del images
139
+ if self.training:
140
+ proposals = self.label_and_sample_proposals(proposals, targets)
141
+
142
+ if self.training:
143
+ # Need targets to box head
144
+ losses = self._forward_box(features, proposals, targets)
145
+ losses.update(self._forward_mask(features, proposals))
146
+ losses.update(self._forward_keypoint(features, proposals))
147
+ return proposals, losses
148
+ else:
149
+ pred_instances = self._forward_box(features, proposals)
150
+ pred_instances = self.forward_with_given_boxes(features, pred_instances)
151
+ return pred_instances, {}
152
+
153
+ def _forward_box(self, features, proposals, targets=None):
154
+ """
155
+ Args:
156
+ features, targets: the same as in :meth:`ROIHeads.forward`.
158
+ proposals (list[Instances]): the per-image object proposals with
159
+ their matching ground truth.
160
+ Each has fields "proposal_boxes", and "objectness_logits",
161
+ "gt_classes", "gt_boxes".
162
+ """
163
+ features = [features[f] for f in self.box_in_features]
164
+ head_outputs = [] # (predictor, predictions, proposals)
165
+ prev_pred_boxes = None
166
+ image_sizes = [x.image_size for x in proposals]
167
+ for k in range(self.num_cascade_stages):
168
+ if k > 0:
169
+ # The output boxes of the previous stage are used to create the input
170
+ # proposals of the next stage.
171
+ proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes)
172
+ if self.training:
173
+ proposals = self._match_and_label_boxes(proposals, k, targets)
174
+ predictions = self._run_stage(features, proposals, k)
175
+ prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals)
176
+ head_outputs.append((self.box_predictor[k], predictions, proposals))
177
+
178
+ if self.training:
179
+ losses = {}
180
+ storage = get_event_storage()
181
+ for stage, (predictor, predictions, proposals) in enumerate(head_outputs):
182
+ with storage.name_scope("stage{}".format(stage)):
183
+ stage_losses = predictor.losses(predictions, proposals)
184
+ losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()})
185
+ return losses
186
+ else:
187
+ # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
188
+ scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs]
189
+
190
+ # Average the scores across heads
191
+ scores = [
192
+ sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
193
+ for scores_per_image in zip(*scores_per_stage)
194
+ ]
195
+ # Use the boxes of the last head
196
+ predictor, predictions, proposals = head_outputs[-1]
197
+ boxes = predictor.predict_boxes(predictions, proposals)
198
+ pred_instances, _ = fast_rcnn_inference(
199
+ boxes,
200
+ scores,
201
+ image_sizes,
202
+ predictor.test_score_thresh,
203
+ predictor.test_nms_thresh,
204
+ predictor.test_topk_per_image,
205
+ )
206
+ return pred_instances
207
+
208
+ @torch.no_grad()
209
+ def _match_and_label_boxes(self, proposals, stage, targets):
210
+ """
211
+ Match proposals with groundtruth using the matcher at the given stage.
212
+ Label the proposals as foreground or background based on the match.
213
+
214
+ Args:
215
+ proposals (list[Instances]): One Instances for each image, with
216
+ the field "proposal_boxes".
217
+ stage (int): the current stage
218
+ targets (list[Instances]): the ground truth instances
219
+
220
+ Returns:
221
+ list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes"
222
+ """
223
+ num_fg_samples, num_bg_samples = [], []
224
+ for proposals_per_image, targets_per_image in zip(proposals, targets):
225
+ match_quality_matrix = pairwise_iou(
226
+ targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
227
+ )
228
+ # proposal_labels are 0 or 1
229
+ matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
230
+ if len(targets_per_image) > 0:
231
+ gt_classes = targets_per_image.gt_classes[matched_idxs]
232
+ # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
233
+ gt_classes[proposal_labels == 0] = self.num_classes
234
+ gt_boxes = targets_per_image.gt_boxes[matched_idxs]
235
+ else:
236
+ gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
237
+ gt_boxes = Boxes(
238
+ targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
239
+ )
240
+ proposals_per_image.gt_classes = gt_classes
241
+ proposals_per_image.gt_boxes = gt_boxes
242
+
243
+ num_fg_samples.append((proposal_labels == 1).sum().item())
244
+ num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])
245
+
246
+ # Log the number of fg/bg samples in each stage
247
+ storage = get_event_storage()
248
+ storage.put_scalar(
249
+ "stage{}/roi_head/num_fg_samples".format(stage),
250
+ sum(num_fg_samples) / len(num_fg_samples),
251
+ )
252
+ storage.put_scalar(
253
+ "stage{}/roi_head/num_bg_samples".format(stage),
254
+ sum(num_bg_samples) / len(num_bg_samples),
255
+ )
256
+ return proposals
257
+
258
+ def _run_stage(self, features, proposals, stage):
259
+ """
260
+ Args:
261
+ features (list[Tensor]): #lvl input features to ROIHeads
262
+ proposals (list[Instances]): #image Instances, with the field "proposal_boxes"
263
+ stage (int): the current stage
264
+
265
+ Returns:
266
+ Same output as `FastRCNNOutputLayers.forward()`.
267
+ """
268
+ box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
269
+ # The original implementation averages the losses among heads,
270
+ # but scale up the parameter gradients of the heads.
271
+ # This is equivalent to adding the losses among heads,
272
+ # but scale down the gradients on features.
273
+ if self.training:
274
+ box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
275
+ box_features = self.box_head[stage](box_features)
276
+ return self.box_predictor[stage](box_features)
277
+
278
+ def _create_proposals_from_boxes(self, boxes, image_sizes):
279
+ """
280
+ Args:
281
+ boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4
282
+ image_sizes (list[tuple]): list of image shapes in (h, w)
283
+
284
+ Returns:
285
+ list[Instances]: per-image proposals with the given boxes.
286
+ """
287
+ # Just like RPN, the proposals should not have gradients
288
+ boxes = [Boxes(b.detach()) for b in boxes]
289
+ proposals = []
290
+ for boxes_per_image, image_size in zip(boxes, image_sizes):
291
+ boxes_per_image.clip(image_size)
292
+ if self.training:
293
+ # do not filter empty boxes at inference time,
294
+ # because the scores from each stage need to be aligned and added later
295
+ boxes_per_image = boxes_per_image[boxes_per_image.nonempty()]
296
+ prop = Instances(image_size)
297
+ prop.proposal_boxes = boxes_per_image
298
+ proposals.append(prop)
299
+ return proposals
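The _ScaleGradient trick used in _run_stage is easy to verify in isolation: the forward pass is the identity, while the gradient flowing back into the shared pooled features is scaled by 1 / num_cascade_stages. A minimal check, not part of this commit, assuming the vendored module path:

import torch
from annotator.oneformer.detectron2.modeling.roi_heads.cascade_rcnn import _ScaleGradient

x = torch.ones(3, requires_grad=True)
y = _ScaleGradient.apply(x, 1.0 / 3)  # e.g. three cascade stages
assert torch.equal(y, x)              # forward is the identity
y.sum().backward()
print(x.grad)                         # tensor([0.3333, 0.3333, 0.3333])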
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/fast_rcnn.py ADDED
@@ -0,0 +1,569 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ from typing import Callable, Dict, List, Optional, Tuple, Union
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.data.detection_utils import get_fed_loss_cls_weights
10
+ from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple
11
+ from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransform, _dense_box_regression_loss
12
+ from annotator.oneformer.detectron2.structures import Boxes, Instances
13
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
14
+
15
+ __all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"]
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ """
21
+ Shape shorthand in this module:
22
+
23
+ N: number of images in the minibatch
24
+ R: number of ROIs, combined over all images, in the minibatch
25
+ Ri: number of ROIs in image i
26
+ K: number of foreground classes. E.g.,there are 80 foreground classes in COCO.
27
+
28
+ Naming convention:
29
+
30
+ deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
31
+ transform (see :class:`box_regression.Box2BoxTransform`).
32
+
33
+ pred_class_logits: predicted class scores in [-inf, +inf]; use
34
+ softmax(pred_class_logits) to estimate P(class).
35
+
36
+ gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
37
+ foreground object classes and K represents the background class.
38
+
39
+ pred_proposal_deltas: predicted box2box transform deltas for transforming proposals
40
+ to detection box predictions.
41
+
42
+ gt_proposal_deltas: ground-truth box2box transform deltas
43
+ """
44
+
45
+
46
+ def fast_rcnn_inference(
47
+ boxes: List[torch.Tensor],
48
+ scores: List[torch.Tensor],
49
+ image_shapes: List[Tuple[int, int]],
50
+ score_thresh: float,
51
+ nms_thresh: float,
52
+ topk_per_image: int,
53
+ ):
54
+ """
55
+ Call `fast_rcnn_inference_single_image` for all images.
56
+
57
+ Args:
58
+ boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
59
+ boxes for each image. Element i has shape (Ri, K * 4) if doing
60
+ class-specific regression, or (Ri, 4) if doing class-agnostic
61
+ regression, where Ri is the number of predicted objects for image i.
62
+ This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
63
+ scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
64
+ Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
65
+ for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`.
66
+ image_shapes (list[tuple]): A list of (height, width) tuples for each image in the batch.
67
+ score_thresh (float): Only return detections with a confidence score exceeding this
68
+ threshold.
69
+ nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1].
70
+ topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
71
+ all detections.
72
+
73
+ Returns:
74
+ instances: (list[Instances]): A list of N instances, one for each image in the batch,
75
+ that stores the topk most confident detections.
76
+ kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates
77
+ the corresponding boxes/scores index in [0, Ri) from the input, for image i.
78
+ """
79
+ result_per_image = [
80
+ fast_rcnn_inference_single_image(
81
+ boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
82
+ )
83
+ for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
84
+ ]
85
+ return [x[0] for x in result_per_image], [x[1] for x in result_per_image]
86
+
87
+
88
+ def _log_classification_stats(pred_logits, gt_classes, prefix="fast_rcnn"):
89
+ """
90
+ Log the classification metrics to EventStorage.
91
+
92
+ Args:
93
+ pred_logits: Rx(K+1) logits. The last column is for background class.
94
+ gt_classes: R labels
95
+ """
96
+ num_instances = gt_classes.numel()
97
+ if num_instances == 0:
98
+ return
99
+ pred_classes = pred_logits.argmax(dim=1)
100
+ bg_class_ind = pred_logits.shape[1] - 1
101
+
102
+ fg_inds = (gt_classes >= 0) & (gt_classes < bg_class_ind)
103
+ num_fg = fg_inds.nonzero().numel()
104
+ fg_gt_classes = gt_classes[fg_inds]
105
+ fg_pred_classes = pred_classes[fg_inds]
106
+
107
+ num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel()
108
+ num_accurate = (pred_classes == gt_classes).nonzero().numel()
109
+ fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()
110
+
111
+ storage = get_event_storage()
112
+ storage.put_scalar(f"{prefix}/cls_accuracy", num_accurate / num_instances)
113
+ if num_fg > 0:
114
+ storage.put_scalar(f"{prefix}/fg_cls_accuracy", fg_num_accurate / num_fg)
115
+ storage.put_scalar(f"{prefix}/false_negative", num_false_negative / num_fg)
116
+
117
+
118
+ def fast_rcnn_inference_single_image(
119
+ boxes,
120
+ scores,
121
+ image_shape: Tuple[int, int],
122
+ score_thresh: float,
123
+ nms_thresh: float,
124
+ topk_per_image: int,
125
+ ):
126
+ """
127
+ Single-image inference. Return bounding-box detection results by thresholding
128
+ on scores and applying non-maximum suppression (NMS).
129
+
130
+ Args:
131
+ Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
132
+ per image.
133
+
134
+ Returns:
135
+ Same as `fast_rcnn_inference`, but for only one image.
136
+ """
137
+ valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
138
+ if not valid_mask.all():
139
+ boxes = boxes[valid_mask]
140
+ scores = scores[valid_mask]
141
+
142
+ scores = scores[:, :-1]
143
+ num_bbox_reg_classes = boxes.shape[1] // 4
144
+ # Convert to Boxes to use the `clip` function ...
145
+ boxes = Boxes(boxes.reshape(-1, 4))
146
+ boxes.clip(image_shape)
147
+ boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4
148
+
149
+ # 1. Filter results based on detection scores. It can make NMS more efficient
150
+ # by filtering out low-confidence detections.
151
+ filter_mask = scores > score_thresh # R x K
152
+ # R' x 2. First column contains indices of the R predictions;
153
+ # Second column contains indices of classes.
154
+ filter_inds = filter_mask.nonzero()
155
+ if num_bbox_reg_classes == 1:
156
+ boxes = boxes[filter_inds[:, 0], 0]
157
+ else:
158
+ boxes = boxes[filter_mask]
159
+ scores = scores[filter_mask]
160
+
161
+ # 2. Apply NMS for each class independently.
162
+ keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
163
+ if topk_per_image >= 0:
164
+ keep = keep[:topk_per_image]
165
+ boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
166
+
167
+ result = Instances(image_shape)
168
+ result.pred_boxes = Boxes(boxes)
169
+ result.scores = scores
170
+ result.pred_classes = filter_inds[:, 1]
171
+ return result, filter_inds[:, 0]
172
+
173
+
174
+ class FastRCNNOutputLayers(nn.Module):
175
+ """
176
+ Two linear layers for predicting Fast R-CNN outputs:
177
+
178
+ 1. proposal-to-detection box regression deltas
179
+ 2. classification scores
180
+ """
181
+
182
+ @configurable
183
+ def __init__(
184
+ self,
185
+ input_shape: ShapeSpec,
186
+ *,
187
+ box2box_transform,
188
+ num_classes: int,
189
+ test_score_thresh: float = 0.0,
190
+ test_nms_thresh: float = 0.5,
191
+ test_topk_per_image: int = 100,
192
+ cls_agnostic_bbox_reg: bool = False,
193
+ smooth_l1_beta: float = 0.0,
194
+ box_reg_loss_type: str = "smooth_l1",
195
+ loss_weight: Union[float, Dict[str, float]] = 1.0,
196
+ use_fed_loss: bool = False,
197
+ use_sigmoid_ce: bool = False,
198
+ get_fed_loss_cls_weights: Optional[Callable] = None,
199
+ fed_loss_num_classes: int = 50,
200
+ ):
201
+ """
202
+ NOTE: this interface is experimental.
203
+
204
+ Args:
205
+ input_shape (ShapeSpec): shape of the input feature to this module
206
+ box2box_transform (Box2BoxTransform or Box2BoxTransformRotated):
207
+ num_classes (int): number of foreground classes
208
+ test_score_thresh (float): threshold to filter predictions results.
209
+ test_nms_thresh (float): NMS threshold for prediction results.
210
+ test_topk_per_image (int): number of top predictions to produce per image.
211
+ cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
212
+ smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if
213
+ `box_reg_loss_type` is "smooth_l1"
214
+ box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou",
215
+ "diou", "ciou"
216
+ loss_weight (float|dict): weights to use for losses. Can be single float for weighting
217
+ all losses, or a dict of individual weightings. Valid dict keys are:
218
+ * "loss_cls": applied to classification loss
219
+ * "loss_box_reg": applied to box regression loss
220
+ use_fed_loss (bool): whether to use federated loss which samples additional negative
221
+ classes to calculate the loss
222
+ use_sigmoid_ce (bool): whether to calculate the loss using weighted average of binary
223
+ cross entropy with logits. This could be used together with federated loss
224
+ get_fed_loss_cls_weights (Callable): a callable which takes dataset name and frequency
225
+ weight power, and returns the probabilities to sample negative classes for
226
+ federated loss. The implementation can be found in
227
+ detectron2/data/detection_utils.py
228
+ fed_loss_num_classes (int): number of federated classes to keep in total
229
+ """
230
+ super().__init__()
231
+ if isinstance(input_shape, int): # some backward compatibility
232
+ input_shape = ShapeSpec(channels=input_shape)
233
+ self.num_classes = num_classes
234
+ input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
235
+ # prediction layer for num_classes foreground classes and one background class (hence + 1)
236
+ self.cls_score = nn.Linear(input_size, num_classes + 1)
237
+ num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
238
+ box_dim = len(box2box_transform.weights)
239
+ self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)
240
+
241
+ nn.init.normal_(self.cls_score.weight, std=0.01)
242
+ nn.init.normal_(self.bbox_pred.weight, std=0.001)
243
+ for l in [self.cls_score, self.bbox_pred]:
244
+ nn.init.constant_(l.bias, 0)
245
+
246
+ self.box2box_transform = box2box_transform
247
+ self.smooth_l1_beta = smooth_l1_beta
248
+ self.test_score_thresh = test_score_thresh
249
+ self.test_nms_thresh = test_nms_thresh
250
+ self.test_topk_per_image = test_topk_per_image
251
+ self.box_reg_loss_type = box_reg_loss_type
252
+ if isinstance(loss_weight, float):
253
+ loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight}
254
+ self.loss_weight = loss_weight
255
+ self.use_fed_loss = use_fed_loss
256
+ self.use_sigmoid_ce = use_sigmoid_ce
257
+ self.fed_loss_num_classes = fed_loss_num_classes
258
+
259
+ if self.use_fed_loss:
260
+ assert self.use_sigmoid_ce, "Please use sigmoid cross entropy loss with federated loss"
261
+ fed_loss_cls_weights = get_fed_loss_cls_weights()
262
+ assert (
263
+ len(fed_loss_cls_weights) == self.num_classes
264
+ ), "Please check the provided fed_loss_cls_weights. Their size should match num_classes"
265
+ self.register_buffer("fed_loss_cls_weights", fed_loss_cls_weights)
266
+
267
+ @classmethod
268
+ def from_config(cls, cfg, input_shape):
269
+ return {
270
+ "input_shape": input_shape,
271
+ "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS),
272
+ # fmt: off
273
+ "num_classes" : cfg.MODEL.ROI_HEADS.NUM_CLASSES,
274
+ "cls_agnostic_bbox_reg" : cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,
275
+ "smooth_l1_beta" : cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA,
276
+ "test_score_thresh" : cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
277
+ "test_nms_thresh" : cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
278
+ "test_topk_per_image" : cfg.TEST.DETECTIONS_PER_IMAGE,
279
+ "box_reg_loss_type" : cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE,
280
+ "loss_weight" : {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT}, # noqa
281
+ "use_fed_loss" : cfg.MODEL.ROI_BOX_HEAD.USE_FED_LOSS,
282
+ "use_sigmoid_ce" : cfg.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE,
283
+ "get_fed_loss_cls_weights" : lambda: get_fed_loss_cls_weights(dataset_names=cfg.DATASETS.TRAIN, freq_weight_power=cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER), # noqa
284
+ "fed_loss_num_classes" : cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES,
285
+ # fmt: on
286
+ }
287
+
288
+ def forward(self, x):
289
+ """
290
+ Args:
291
+ x: per-region features of shape (N, ...) for N bounding boxes to predict.
292
+
293
+ Returns:
294
+ (Tensor, Tensor):
295
+ First tensor: shape (N,K+1), scores for each of the N boxes. Each row contains the
296
+ scores for K object categories and 1 background class.
297
+
298
+ Second tensor: bounding box regression deltas for each box. Shape is shape (N,Kx4),
299
+ or (N,4) for class-agnostic regression.
300
+ """
301
+ if x.dim() > 2:
302
+ x = torch.flatten(x, start_dim=1)
303
+ scores = self.cls_score(x)
304
+ proposal_deltas = self.bbox_pred(x)
305
+ return scores, proposal_deltas
306
+
307
+ def losses(self, predictions, proposals):
308
+ """
309
+ Args:
310
+ predictions: return values of :meth:`forward()`.
311
+ proposals (list[Instances]): proposals that match the features that were used
312
+ to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
313
+ ``gt_classes`` are expected.
314
+
315
+ Returns:
316
+ Dict[str, Tensor]: dict of losses
317
+ """
318
+ scores, proposal_deltas = predictions
319
+
320
+ # parse classification outputs
321
+ gt_classes = (
322
+ cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)
323
+ )
324
+ _log_classification_stats(scores, gt_classes)
325
+
326
+ # parse box regression outputs
327
+ if len(proposals):
328
+ proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) # Nx4
329
+ assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
330
+ # If "gt_boxes" does not exist, the proposals must be all negative and
331
+ # should not be included in regression loss computation.
332
+ # Here we just use proposal_boxes as an arbitrary placeholder because its
333
+ # value won't be used in self.box_reg_loss().
334
+ gt_boxes = cat(
335
+ [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
336
+ dim=0,
337
+ )
338
+ else:
339
+ proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)
340
+
341
+ if self.use_sigmoid_ce:
342
+ loss_cls = self.sigmoid_cross_entropy_loss(scores, gt_classes)
343
+ else:
344
+ loss_cls = cross_entropy(scores, gt_classes, reduction="mean")
345
+
346
+ losses = {
347
+ "loss_cls": loss_cls,
348
+ "loss_box_reg": self.box_reg_loss(
349
+ proposal_boxes, gt_boxes, proposal_deltas, gt_classes
350
+ ),
351
+ }
352
+ return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
353
+
354
+ # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/fed_loss.py # noqa
355
+ # with slight modifications
356
+ def get_fed_loss_classes(self, gt_classes, num_fed_loss_classes, num_classes, weight):
357
+ """
358
+ Args:
359
+ gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
360
+ num_fed_loss_classes: minimum number of classes to keep when calculating federated loss.
361
+ Will sample negative classes if number of unique gt_classes is smaller than this value.
362
+ num_classes: number of foreground classes
363
+ weight: probabilities used to sample negative classes
364
+
365
+ Returns:
366
+ Tensor:
367
+ classes to keep when calculating the federated loss, including both unique gt
368
+ classes and sampled negative classes.
369
+ """
370
+ unique_gt_classes = torch.unique(gt_classes)
371
+ prob = unique_gt_classes.new_ones(num_classes + 1).float()
372
+ prob[-1] = 0
373
+ if len(unique_gt_classes) < num_fed_loss_classes:
374
+ prob[:num_classes] = weight.float().clone()
375
+ prob[unique_gt_classes] = 0
376
+ sampled_negative_classes = torch.multinomial(
377
+ prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False
378
+ )
379
+ fed_loss_classes = torch.cat([unique_gt_classes, sampled_negative_classes])
380
+ else:
381
+ fed_loss_classes = unique_gt_classes
382
+ return fed_loss_classes
383
+
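+ # A hedged, standalone illustration (not part of this commit) of what
+ # get_fed_loss_classes does: keep the unique ground-truth classes and, if there
+ # are fewer than num_fed_loss_classes of them, sample extra negative classes
+ # (background excluded) according to the provided weights. All values below are
+ # made up for illustration.
+ #
+ #     import torch
+ #     num_classes, num_fed_loss_classes = 6, 4
+ #     gt_classes = torch.tensor([1, 1, 3])          # unique gt classes: {1, 3}
+ #     weight = torch.ones(num_classes)              # uniform sampling weights
+ #
+ #     unique_gt_classes = torch.unique(gt_classes)
+ #     prob = unique_gt_classes.new_ones(num_classes + 1).float()
+ #     prob[-1] = 0                                  # never sample the background slot
+ #     prob[:num_classes] = weight.float().clone()
+ #     prob[unique_gt_classes] = 0                   # never re-sample the gt classes
+ #     sampled = torch.multinomial(
+ #         prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False
+ #     )
+ #     fed_loss_classes = torch.cat([unique_gt_classes, sampled])
+ #     print(fed_loss_classes)                       # {1, 3} plus two sampled negatives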
384
+ # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py#L113 # noqa
385
+ # with slight modifications
386
+ def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
387
+ """
388
+ Args:
389
+ pred_class_logits: shape (N, K+1), scores for each of the N boxes. Each row contains the
390
+ scores for K object categories and 1 background class
391
+ gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
392
+ """
393
+ if pred_class_logits.numel() == 0:
394
+ return pred_class_logits.new_zeros([1])[0]
395
+
396
+ N = pred_class_logits.shape[0]
397
+ K = pred_class_logits.shape[1] - 1
398
+
399
+ target = pred_class_logits.new_zeros(N, K + 1)
400
+ target[range(len(gt_classes)), gt_classes] = 1
401
+ target = target[:, :K]
402
+
403
+ cls_loss = F.binary_cross_entropy_with_logits(
404
+ pred_class_logits[:, :-1], target, reduction="none"
405
+ )
406
+
407
+ if self.use_fed_loss:
408
+ fed_loss_classes = self.get_fed_loss_classes(
409
+ gt_classes,
410
+ num_fed_loss_classes=self.fed_loss_num_classes,
411
+ num_classes=K,
412
+ weight=self.fed_loss_cls_weights,
413
+ )
414
+ fed_loss_classes_mask = fed_loss_classes.new_zeros(K + 1)
415
+ fed_loss_classes_mask[fed_loss_classes] = 1
416
+ fed_loss_classes_mask = fed_loss_classes_mask[:K]
417
+ weight = fed_loss_classes_mask.view(1, K).expand(N, K).float()
418
+ else:
419
+ weight = 1
420
+
421
+ loss = torch.sum(cls_loss * weight) / N
422
+ return loss
423
+
424
+ def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
425
+ """
426
+ Args:
427
+ proposal_boxes/gt_boxes are tensors with the same shape (R, 4 or 5).
428
+ pred_deltas has shape (R, 4 or 5), or (R, num_classes * (4 or 5)).
429
+ gt_classes is a long tensor of shape R, the gt class label of each proposal.
430
+ R shall be the number of proposals.
431
+ """
432
+ box_dim = proposal_boxes.shape[1] # 4 or 5
433
+ # Regression loss is only computed for foreground proposals (those matched to a GT)
434
+ fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
435
+ if pred_deltas.shape[1] == box_dim: # cls-agnostic regression
436
+ fg_pred_deltas = pred_deltas[fg_inds]
437
+ else:
438
+ fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
439
+ fg_inds, gt_classes[fg_inds]
440
+ ]
441
+
442
+ loss_box_reg = _dense_box_regression_loss(
443
+ [proposal_boxes[fg_inds]],
444
+ self.box2box_transform,
445
+ [fg_pred_deltas.unsqueeze(0)],
446
+ [gt_boxes[fg_inds]],
447
+ ...,
448
+ self.box_reg_loss_type,
449
+ self.smooth_l1_beta,
450
+ )
451
+
452
+ # The reg loss is normalized using the total number of regions (R), not the number
453
+ # of foreground regions even though the box regression loss is only defined on
454
+ # foreground regions. Why? Because doing so gives equal training influence to
455
+ # each foreground example. To see how, consider two different minibatches:
456
+ # (1) Contains a single foreground region
457
+ # (2) Contains 100 foreground regions
458
+ # If we normalize by the number of foreground regions, the single example in
459
+ # minibatch (1) will be given 100 times as much influence as each foreground
460
+ # example in minibatch (2). Normalizing by the total number of regions, R,
461
+ # means that the single example in minibatch (1) and each of the 100 examples
462
+ # in minibatch (2) are given equal influence.
463
+ return loss_box_reg / max(gt_classes.numel(), 1.0) # return 0 if empty
464
+
465
+ def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
466
+ """
467
+ Args:
468
+ predictions: return values of :meth:`forward()`.
469
+ proposals (list[Instances]): proposals that match the features that were
470
+ used to compute predictions. The ``proposal_boxes`` field is expected.
471
+
472
+ Returns:
473
+ list[Instances]: same as `fast_rcnn_inference`.
474
+ list[Tensor]: same as `fast_rcnn_inference`.
475
+ """
476
+ boxes = self.predict_boxes(predictions, proposals)
477
+ scores = self.predict_probs(predictions, proposals)
478
+ image_shapes = [x.image_size for x in proposals]
479
+ return fast_rcnn_inference(
480
+ boxes,
481
+ scores,
482
+ image_shapes,
483
+ self.test_score_thresh,
484
+ self.test_nms_thresh,
485
+ self.test_topk_per_image,
486
+ )
487
+
488
+ def predict_boxes_for_gt_classes(self, predictions, proposals):
489
+ """
490
+ Args:
491
+ predictions: return values of :meth:`forward()`.
492
+ proposals (list[Instances]): proposals that match the features that were used
493
+ to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected.
494
+
495
+ Returns:
496
+ list[Tensor]:
497
+ A list of Tensors of predicted boxes for GT classes in case of
498
+ class-specific box head. Element i of the list has shape (Ri, B), where Ri is
499
+ the number of proposals for image i and B is the box dimension (4 or 5)
500
+ """
501
+ if not len(proposals):
502
+ return []
503
+ scores, proposal_deltas = predictions
504
+ proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
505
+ N, B = proposal_boxes.shape
506
+ predict_boxes = self.box2box_transform.apply_deltas(
507
+ proposal_deltas, proposal_boxes
508
+ ) # Nx(KxB)
509
+
510
+ K = predict_boxes.shape[1] // B
511
+ if K > 1:
512
+ gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
513
+ # Some proposals are ignored or have a background class. Their gt_classes
514
+ # cannot be used as index.
515
+ gt_classes = gt_classes.clamp_(0, K - 1)
516
+
517
+ predict_boxes = predict_boxes.view(N, K, B)[
518
+ torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
519
+ ]
520
+ num_prop_per_image = [len(p) for p in proposals]
521
+ return predict_boxes.split(num_prop_per_image)
522
+
523
+ def predict_boxes(
524
+ self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
525
+ ):
526
+ """
527
+ Args:
528
+ predictions: return values of :meth:`forward()`.
529
+ proposals (list[Instances]): proposals that match the features that were
530
+ used to compute predictions. The ``proposal_boxes`` field is expected.
531
+
532
+ Returns:
533
+ list[Tensor]:
534
+ A list of Tensors of predicted class-specific or class-agnostic boxes
535
+ for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
536
+ the number of proposals for image i and B is the box dimension (4 or 5)
537
+ """
538
+ if not len(proposals):
539
+ return []
540
+ _, proposal_deltas = predictions
541
+ num_prop_per_image = [len(p) for p in proposals]
542
+ proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
543
+ predict_boxes = self.box2box_transform.apply_deltas(
544
+ proposal_deltas,
545
+ proposal_boxes,
546
+ ) # Nx(KxB)
547
+ return predict_boxes.split(num_prop_per_image)
548
+
549
+ def predict_probs(
550
+ self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
551
+ ):
552
+ """
553
+ Args:
554
+ predictions: return values of :meth:`forward()`.
555
+ proposals (list[Instances]): proposals that match the features that were
556
+ used to compute predictions.
557
+
558
+ Returns:
559
+ list[Tensor]:
560
+ A list of Tensors of predicted class probabilities for each image.
561
+ Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
562
+ """
563
+ scores, _ = predictions
564
+ num_inst_per_image = [len(p) for p in proposals]
565
+ if self.use_sigmoid_ce:
566
+ probs = scores.sigmoid()
567
+ else:
568
+ probs = F.softmax(scores, dim=-1)
569
+ return probs.split(num_inst_per_image, dim=0)
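A hedged usage sketch for FastRCNNOutputLayers, not part of this commit: constructing it with explicit, illustrative arguments and pushing a dummy batch of pooled box features through it shows the (K + 1)-way scores and the per-class regression deltas.

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransform
from annotator.oneformer.detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers

layers = FastRCNNOutputLayers(
    ShapeSpec(channels=1024),  # output shape of the box head
    box2box_transform=Box2BoxTransform(weights=(10.0, 10.0, 5.0, 5.0)),
    num_classes=80,
)
feats = torch.randn(16, 1024)      # 16 RoIs
scores, deltas = layers(feats)
print(scores.shape)                # torch.Size([16, 81]): 80 classes + background
print(deltas.shape)                # torch.Size([16, 320]): per-class (dx, dy, dw, dh)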
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/keypoint_head.py ADDED
@@ -0,0 +1,272 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import List
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from annotator.oneformer.detectron2.config import configurable
8
+ from annotator.oneformer.detectron2.layers import Conv2d, ConvTranspose2d, cat, interpolate
9
+ from annotator.oneformer.detectron2.structures import Instances, heatmaps_to_keypoints
10
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
11
+ from annotator.oneformer.detectron2.utils.registry import Registry
12
+
13
+ _TOTAL_SKIPPED = 0
14
+
15
+
16
+ __all__ = [
17
+ "ROI_KEYPOINT_HEAD_REGISTRY",
18
+ "build_keypoint_head",
19
+ "BaseKeypointRCNNHead",
20
+ "KRCNNConvDeconvUpsampleHead",
21
+ ]
22
+
23
+
24
+ ROI_KEYPOINT_HEAD_REGISTRY = Registry("ROI_KEYPOINT_HEAD")
25
+ ROI_KEYPOINT_HEAD_REGISTRY.__doc__ = """
26
+ Registry for keypoint heads, which make keypoint predictions from per-region features.
27
+
28
+ The registered object will be called with `obj(cfg, input_shape)`.
29
+ """
30
+
31
+
32
+ def build_keypoint_head(cfg, input_shape):
33
+ """
34
+ Build a keypoint head from `cfg.MODEL.ROI_KEYPOINT_HEAD.NAME`.
35
+ """
36
+ name = cfg.MODEL.ROI_KEYPOINT_HEAD.NAME
37
+ return ROI_KEYPOINT_HEAD_REGISTRY.get(name)(cfg, input_shape)
38
+
39
+
40
+ def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer):
41
+ """
42
+ Arguments:
43
+ pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
44
+ of instances in the batch, K is the number of keypoints, and S is the side length
45
+ of the keypoint heatmap. The values are spatial logits.
46
+ instances (list[Instances]): A list of M Instances, where M is the batch size.
47
+ These instances are predictions from the model
48
+ that are in 1:1 correspondence with pred_keypoint_logits.
49
+ Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint`
50
+ instance.
51
+ normalizer (float): Normalize the loss by this amount.
52
+ If not specified, we normalize by the number of visible keypoints in the minibatch.
53
+
54
+ Returns a scalar tensor containing the loss.
55
+ """
56
+ heatmaps = []
57
+ valid = []
58
+
59
+ keypoint_side_len = pred_keypoint_logits.shape[2]
60
+ for instances_per_image in instances:
61
+ if len(instances_per_image) == 0:
62
+ continue
63
+ keypoints = instances_per_image.gt_keypoints
64
+ heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
65
+ instances_per_image.proposal_boxes.tensor, keypoint_side_len
66
+ )
67
+ heatmaps.append(heatmaps_per_image.view(-1))
68
+ valid.append(valid_per_image.view(-1))
69
+
70
+ if len(heatmaps):
71
+ keypoint_targets = cat(heatmaps, dim=0)
72
+ valid = cat(valid, dim=0).to(dtype=torch.uint8)
73
+ valid = torch.nonzero(valid).squeeze(1)
74
+
75
+ # torch.mean (in binary_cross_entropy_with_logits) doesn't
76
+ # accept empty tensors, so handle it separately
77
+ if len(heatmaps) == 0 or valid.numel() == 0:
78
+ global _TOTAL_SKIPPED
79
+ _TOTAL_SKIPPED += 1
80
+ storage = get_event_storage()
81
+ storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False)
82
+ return pred_keypoint_logits.sum() * 0
83
+
84
+ N, K, H, W = pred_keypoint_logits.shape
85
+ pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)
86
+
87
+ keypoint_loss = F.cross_entropy(
88
+ pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum"
89
+ )
90
+
91
+ # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
92
+ if normalizer is None:
93
+ normalizer = valid.numel()
94
+ keypoint_loss /= normalizer
95
+
96
+ return keypoint_loss
97
+
98
+
99
+ def keypoint_rcnn_inference(pred_keypoint_logits: torch.Tensor, pred_instances: List[Instances]):
100
+ """
101
+ Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score)
102
+ and add it to the `pred_instances` as a `pred_keypoints` field.
103
+
104
+ Args:
105
+ pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number
106
+ of instances in the batch, K is the number of keypoints, and S is the side length of
107
+ the keypoint heatmap. The values are spatial logits.
108
+ pred_instances (list[Instances]): A list of N Instances, where N is the number of images.
109
+
110
+ Returns:
111
+ None. Each element in pred_instances will contain extra "pred_keypoints" and
112
+ "pred_keypoint_heatmaps" fields. "pred_keypoints" is a tensor of shape
113
+ (#instance, K, 3) where the last dimension corresponds to (x, y, score).
114
+ The scores are larger than 0. "pred_keypoint_heatmaps" contains the raw
115
+ keypoint logits as passed to this function.
116
+ """
117
+ # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor)
118
+ bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0)
119
+
120
+ pred_keypoint_logits = pred_keypoint_logits.detach()
121
+ keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits, bboxes_flat.detach())
122
+ num_instances_per_image = [len(i) for i in pred_instances]
123
+ keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0)
124
+ heatmap_results = pred_keypoint_logits.split(num_instances_per_image, dim=0)
125
+
126
+ for keypoint_results_per_image, heatmap_results_per_image, instances_per_image in zip(
127
+ keypoint_results, heatmap_results, pred_instances
128
+ ):
129
+ # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score)
130
+ # heatmap_results_per_image is (num instances)x(num keypoints)x(side)x(side)
131
+ instances_per_image.pred_keypoints = keypoint_results_per_image
132
+ instances_per_image.pred_keypoint_heatmaps = heatmap_results_per_image
133
+
134
+
135
+ class BaseKeypointRCNNHead(nn.Module):
136
+ """
137
+ Implement the basic Keypoint R-CNN losses and inference logic described in
138
+ Sec. 5 of :paper:`Mask R-CNN`.
139
+ """
140
+
141
+ @configurable
142
+ def __init__(self, *, num_keypoints, loss_weight=1.0, loss_normalizer=1.0):
143
+ """
144
+ NOTE: this interface is experimental.
145
+
146
+ Args:
147
+ num_keypoints (int): number of keypoints to predict
148
+ loss_weight (float): weight to multiply the keypoint loss by
149
+ loss_normalizer (float or str):
150
+ If float, divide the loss by `loss_normalizer * #images`.
151
+ If 'visible', the loss is normalized by the total number of
152
+ visible keypoints across images.
153
+ """
154
+ super().__init__()
155
+ self.num_keypoints = num_keypoints
156
+ self.loss_weight = loss_weight
157
+ assert loss_normalizer == "visible" or isinstance(loss_normalizer, float), loss_normalizer
158
+ self.loss_normalizer = loss_normalizer
159
+
160
+ @classmethod
161
+ def from_config(cls, cfg, input_shape):
162
+ ret = {
163
+ "loss_weight": cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT,
164
+ "num_keypoints": cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS,
165
+ }
166
+ normalize_by_visible = (
167
+ cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS
168
+ ) # noqa
169
+ if not normalize_by_visible:
170
+ batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
171
+ positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
172
+ ret["loss_normalizer"] = (
173
+ ret["num_keypoints"] * batch_size_per_image * positive_sample_fraction
174
+ )
175
+ else:
176
+ ret["loss_normalizer"] = "visible"
177
+ return ret
178
+
179
+ def forward(self, x, instances: List[Instances]):
180
+ """
181
+ Args:
182
+ x: input 4D region feature(s) provided by :class:`ROIHeads`.
183
+ instances (list[Instances]): contains the boxes & labels corresponding
184
+ to the input features.
185
+ Exact format is up to its caller to decide.
186
+ Typically, this is the foreground instances in training, with
187
+ "proposal_boxes" field and other gt annotations.
188
+ In inference, it contains boxes that are already predicted.
189
+
190
+ Returns:
191
+ A dict of losses if in training. The predicted "instances" if in inference.
192
+ """
193
+ x = self.layers(x)
194
+ if self.training:
195
+ num_images = len(instances)
196
+ normalizer = (
197
+ None if self.loss_normalizer == "visible" else num_images * self.loss_normalizer
198
+ )
199
+ return {
200
+ "loss_keypoint": keypoint_rcnn_loss(x, instances, normalizer=normalizer)
201
+ * self.loss_weight
202
+ }
203
+ else:
204
+ keypoint_rcnn_inference(x, instances)
205
+ return instances
206
+
207
+ def layers(self, x):
208
+ """
209
+ Neural network layers that makes predictions from regional input features.
210
+ """
211
+ raise NotImplementedError
212
+
213
+
214
+ # To get torchscript support, we make the head a subclass of `nn.Sequential`.
215
+ # Therefore, to add new layers in this head class, please make sure they are
216
+ # added in the order they will be used in forward().
217
+ @ROI_KEYPOINT_HEAD_REGISTRY.register()
218
+ class KRCNNConvDeconvUpsampleHead(BaseKeypointRCNNHead, nn.Sequential):
219
+ """
220
+ A standard keypoint head containing a series of 3x3 convs, followed by
221
+ a transpose convolution and bilinear interpolation for upsampling.
222
+ It is described in Sec. 5 of :paper:`Mask R-CNN`.
223
+ """
224
+
225
+ @configurable
226
+ def __init__(self, input_shape, *, num_keypoints, conv_dims, **kwargs):
227
+ """
228
+ NOTE: this interface is experimental.
229
+
230
+ Args:
231
+ input_shape (ShapeSpec): shape of the input feature
232
+ conv_dims: an iterable of output channel counts for each conv in the head
233
+ e.g. (512, 512, 512) for three convs outputting 512 channels.
234
+ """
235
+ super().__init__(num_keypoints=num_keypoints, **kwargs)
236
+
237
+ # default up_scale to 2.0 (this can be made an option)
238
+ up_scale = 2.0
239
+ in_channels = input_shape.channels
240
+
241
+ for idx, layer_channels in enumerate(conv_dims, 1):
242
+ module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1)
243
+ self.add_module("conv_fcn{}".format(idx), module)
244
+ self.add_module("conv_fcn_relu{}".format(idx), nn.ReLU())
245
+ in_channels = layer_channels
246
+
247
+ deconv_kernel = 4
248
+ self.score_lowres = ConvTranspose2d(
249
+ in_channels, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1
250
+ )
251
+ self.up_scale = up_scale
252
+
253
+ for name, param in self.named_parameters():
254
+ if "bias" in name:
255
+ nn.init.constant_(param, 0)
256
+ elif "weight" in name:
257
+ # Caffe2 implementation uses MSRAFill, which in fact
258
+ # corresponds to kaiming_normal_ in PyTorch
259
+ nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
260
+
261
+ @classmethod
262
+ def from_config(cls, cfg, input_shape):
263
+ ret = super().from_config(cfg, input_shape)
264
+ ret["input_shape"] = input_shape
265
+ ret["conv_dims"] = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS
266
+ return ret
267
+
268
+ def layers(self, x):
269
+ for layer in self:
270
+ x = layer(x)
271
+ x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False)
272
+ return x
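In KRCNNConvDeconvUpsampleHead, the stride-2 deconv followed by the 2x bilinear upsample turns a 14x14 pooled RoI feature into a 56x56 per-keypoint heatmap, which is what keypoint_rcnn_inference hands to heatmaps_to_keypoints. A hedged sketch checking that arithmetic, not part of this commit; the 17 keypoints and 8-conv tower are the usual COCO settings, assumed here only for illustration:

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.keypoint_head import (
    KRCNNConvDeconvUpsampleHead,
)

head = KRCNNConvDeconvUpsampleHead(
    ShapeSpec(channels=256, height=14, width=14),
    num_keypoints=17,        # COCO person keypoints
    conv_dims=[512] * 8,     # an 8-conv tower
)
x = torch.randn(4, 256, 14, 14)  # 4 RoIs
heatmaps = head.layers(x)
print(heatmaps.shape)            # torch.Size([4, 17, 56, 56])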
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/mask_head.py ADDED
@@ -0,0 +1,298 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import List
3
+ import fvcore.nn.weight_init as weight_init
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, get_norm
10
+ from annotator.oneformer.detectron2.layers.wrappers import move_device_like
11
+ from annotator.oneformer.detectron2.structures import Instances
12
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
13
+ from annotator.oneformer.detectron2.utils.registry import Registry
14
+
15
+ __all__ = [
16
+ "BaseMaskRCNNHead",
17
+ "MaskRCNNConvUpsampleHead",
18
+ "build_mask_head",
19
+ "ROI_MASK_HEAD_REGISTRY",
20
+ ]
21
+
22
+
23
+ ROI_MASK_HEAD_REGISTRY = Registry("ROI_MASK_HEAD")
24
+ ROI_MASK_HEAD_REGISTRY.__doc__ = """
25
+ Registry for mask heads, which predict instance masks given
26
+ per-region features.
27
+
28
+ The registered object will be called with `obj(cfg, input_shape)`.
29
+ """
30
+
31
+
32
+ @torch.jit.unused
33
+ def mask_rcnn_loss(pred_mask_logits: torch.Tensor, instances: List[Instances], vis_period: int = 0):
34
+ """
35
+ Compute the mask prediction loss defined in the Mask R-CNN paper.
36
+
37
+ Args:
38
+ pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
39
+ for class-specific or class-agnostic, where B is the total number of predicted masks
40
+ in all images, C is the number of foreground classes, and Hmask, Wmask are the height
41
+ and width of the mask predictions. The values are logits.
42
+ instances (list[Instances]): A list of N Instances, where N is the number of images
43
+ in the batch. These instances are in 1:1
44
+ correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask,
45
+ ...) associated with each instance are stored in fields.
46
+ vis_period (int): the period (in steps) to dump visualization.
47
+
48
+ Returns:
49
+ mask_loss (Tensor): A scalar tensor containing the loss.
50
+ """
51
+ cls_agnostic_mask = pred_mask_logits.size(1) == 1
52
+ total_num_masks = pred_mask_logits.size(0)
53
+ mask_side_len = pred_mask_logits.size(2)
54
+ assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!"
55
+
56
+ gt_classes = []
57
+ gt_masks = []
58
+ for instances_per_image in instances:
59
+ if len(instances_per_image) == 0:
60
+ continue
61
+ if not cls_agnostic_mask:
62
+ gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
63
+ gt_classes.append(gt_classes_per_image)
64
+
65
+ gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
66
+ instances_per_image.proposal_boxes.tensor, mask_side_len
67
+ ).to(device=pred_mask_logits.device)
68
+ # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
69
+ gt_masks.append(gt_masks_per_image)
70
+
71
+ if len(gt_masks) == 0:
72
+ return pred_mask_logits.sum() * 0
73
+
74
+ gt_masks = cat(gt_masks, dim=0)
75
+
76
+ if cls_agnostic_mask:
77
+ pred_mask_logits = pred_mask_logits[:, 0]
78
+ else:
79
+ indices = torch.arange(total_num_masks)
80
+ gt_classes = cat(gt_classes, dim=0)
81
+ pred_mask_logits = pred_mask_logits[indices, gt_classes]
82
+
83
+ if gt_masks.dtype == torch.bool:
84
+ gt_masks_bool = gt_masks
85
+ else:
86
+ # Here we allow gt_masks to be float as well (depend on the implementation of rasterize())
87
+ gt_masks_bool = gt_masks > 0.5
88
+ gt_masks = gt_masks.to(dtype=torch.float32)
89
+
90
+ # Log the training accuracy (using gt classes and 0.5 threshold)
91
+ mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
92
+ mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0))
93
+ num_positive = gt_masks_bool.sum().item()
94
+ false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
95
+ gt_masks_bool.numel() - num_positive, 1.0
96
+ )
97
+ false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0)
98
+
99
+ storage = get_event_storage()
100
+ storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
101
+ storage.put_scalar("mask_rcnn/false_positive", false_positive)
102
+ storage.put_scalar("mask_rcnn/false_negative", false_negative)
103
+ if vis_period > 0 and storage.iter % vis_period == 0:
104
+ pred_masks = pred_mask_logits.sigmoid()
105
+ vis_masks = torch.cat([pred_masks, gt_masks], axis=2)
106
+ name = "Left: mask prediction; Right: mask GT"
107
+ for idx, vis_mask in enumerate(vis_masks):
108
+ vis_mask = torch.stack([vis_mask] * 3, axis=0)
109
+ storage.put_image(name + f" ({idx})", vis_mask)
110
+
111
+ mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits, gt_masks, reduction="mean")
112
+ return mask_loss
113
+
114
+
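The key step in mask_rcnn_loss above is selecting, for every predicted mask, only the logit channel of its ground-truth class before applying binary cross-entropy. A minimal sketch of that indexing on dummy tensors (the shapes are hypothetical; the real function additionally rasterizes the ground-truth masks and logs metrics):

import torch
import torch.nn.functional as F

pred_mask_logits = torch.randn(6, 80, 28, 28)           # (B, C, Hmask, Wmask) logits
gt_classes = torch.randint(0, 80, (6,))                  # gt class id for each predicted mask
gt_masks = (torch.rand(6, 28, 28) > 0.5).float()         # dummy rasterized gt masks

# Keep only the channel of each mask's gt class, as the loss does.
logits = pred_mask_logits[torch.arange(6), gt_classes]   # (6, 28, 28)
loss = F.binary_cross_entropy_with_logits(logits, gt_masks, reduction="mean")
print(loss.item())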
+ def mask_rcnn_inference(pred_mask_logits: torch.Tensor, pred_instances: List[Instances]):
+     """
+     Convert pred_mask_logits to estimated foreground probability masks while also
+     extracting only the masks for the predicted classes in pred_instances. For each
+     predicted box, the mask of the same class is attached to the instance by adding a
+     new "pred_masks" field to pred_instances.
+
+     Args:
+         pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+             for class-specific or class-agnostic, where B is the total number of predicted masks
+             in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+             and width of the mask predictions. The values are logits.
+         pred_instances (list[Instances]): A list of N Instances, where N is the number of images
+             in the batch. Each Instances must have field "pred_classes".
+
+     Returns:
+         None. pred_instances will contain an extra "pred_masks" field storing a mask of size
+             (Hmask, Wmask) for each predicted class. Note that the masks are returned as soft
+             (non-quantized) masks at the resolution predicted by the network; post-processing
+             steps, such as resizing the predicted masks to the original image resolution and/or
+             binarizing them, are left to the caller.
+     """
+     cls_agnostic_mask = pred_mask_logits.size(1) == 1
+
+     if cls_agnostic_mask:
+         mask_probs_pred = pred_mask_logits.sigmoid()
+     else:
+         # Select masks corresponding to the predicted classes
+         num_masks = pred_mask_logits.shape[0]
+         class_pred = cat([i.pred_classes for i in pred_instances])
+         device = (
+             class_pred.device
+             if torch.jit.is_scripting()
+             else ("cpu" if torch.jit.is_tracing() else class_pred.device)
+         )
+         indices = move_device_like(torch.arange(num_masks, device=device), class_pred)
+         mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid()
+     # mask_probs_pred.shape: (B, 1, Hmask, Wmask)
+
+     num_boxes_per_image = [len(i) for i in pred_instances]
+     mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0)
+
+     for prob, instances in zip(mask_probs_pred, pred_instances):
+         instances.pred_masks = prob # (1, Hmask, Wmask)
+
+
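A hedged usage sketch for mask_rcnn_inference (it assumes the vendored import path used in this file resolves at runtime; the image size, class ids, and tensor shapes are made up for illustration):

import torch
from annotator.oneformer.detectron2.structures import Instances
from annotator.oneformer.detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference

inst = Instances((480, 640))               # hypothetical image size
inst.pred_classes = torch.tensor([3, 7])   # two detections with arbitrary class ids
logits = torch.randn(2, 80, 28, 28)        # class-specific mask logits for those two boxes
mask_rcnn_inference(logits, [inst])        # attaches soft masks to the instances in place
print(inst.pred_masks.shape)               # torch.Size([2, 1, 28, 28])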
+ class BaseMaskRCNNHead(nn.Module):
+     """
+     Implement the basic Mask R-CNN losses and inference logic described in :paper:`Mask R-CNN`
+     """
+
+     @configurable
+     def __init__(self, *, loss_weight: float = 1.0, vis_period: int = 0):
+         """
+         NOTE: this interface is experimental.
+
+         Args:
+             loss_weight (float): multiplier of the loss
+             vis_period (int): visualization period
+         """
+         super().__init__()
+         self.vis_period = vis_period
+         self.loss_weight = loss_weight
+
+     @classmethod
+     def from_config(cls, cfg, input_shape):
+         return {"vis_period": cfg.VIS_PERIOD}
+
+     def forward(self, x, instances: List[Instances]):
+         """
+         Args:
+             x: input region feature(s) provided by :class:`ROIHeads`.
+             instances (list[Instances]): contains the boxes & labels corresponding
+                 to the input features.
+                 Exact format is up to its caller to decide.
+                 Typically, this is the foreground instances in training, with
+                 "proposal_boxes" field and other gt annotations.
+                 In inference, it contains boxes that are already predicted.
+
+         Returns:
+             A dict of losses in training. The predicted "instances" in inference.
+         """
+         x = self.layers(x)
+         if self.training:
+             return {"loss_mask": mask_rcnn_loss(x, instances, self.vis_period) * self.loss_weight}
+         else:
+             mask_rcnn_inference(x, instances)
+             return instances
+
+     def layers(self, x):
+         """
+         Neural network layers that make predictions from input features.
+         """
+         raise NotImplementedError
+
+
+ # To get torchscript support, we make the head a subclass of `nn.Sequential`.
+ # Therefore, to add new layers in this head class, please make sure they are
+ # added in the order they will be used in forward().
+ @ROI_MASK_HEAD_REGISTRY.register()
+ class MaskRCNNConvUpsampleHead(BaseMaskRCNNHead, nn.Sequential):
+     """
+     A mask head with several conv layers, plus an upsample layer (with `ConvTranspose2d`).
+     Predictions are made with a final 1x1 conv layer.
+     """
+
+     @configurable
+     def __init__(self, input_shape: ShapeSpec, *, num_classes, conv_dims, conv_norm="", **kwargs):
+         """
+         NOTE: this interface is experimental.
+
+         Args:
+             input_shape (ShapeSpec): shape of the input feature
+             num_classes (int): the number of foreground classes (i.e. background is not
+                 included). 1 if using class agnostic prediction.
+             conv_dims (list[int]): a list of N>0 integers representing the output dimensions
+                 of N-1 conv layers and the last upsample layer.
+             conv_norm (str or callable): normalization for the conv layers.
+                 See :func:`detectron2.layers.get_norm` for supported types.
+         """
+         super().__init__(**kwargs)
+         assert len(conv_dims) >= 1, "conv_dims have to be non-empty!"
+
+         self.conv_norm_relus = []
+
+         cur_channels = input_shape.channels
+         for k, conv_dim in enumerate(conv_dims[:-1]):
+             conv = Conv2d(
+                 cur_channels,
+                 conv_dim,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 bias=not conv_norm,
+                 norm=get_norm(conv_norm, conv_dim),
+                 activation=nn.ReLU(),
+             )
+             self.add_module("mask_fcn{}".format(k + 1), conv)
+             self.conv_norm_relus.append(conv)
+             cur_channels = conv_dim
+
+         self.deconv = ConvTranspose2d(
+             cur_channels, conv_dims[-1], kernel_size=2, stride=2, padding=0
+         )
+         self.add_module("deconv_relu", nn.ReLU())
+         cur_channels = conv_dims[-1]
+
+         self.predictor = Conv2d(cur_channels, num_classes, kernel_size=1, stride=1, padding=0)
+
+         for layer in self.conv_norm_relus + [self.deconv]:
+             weight_init.c2_msra_fill(layer)
+         # use normal distribution initialization for mask prediction layer
+         nn.init.normal_(self.predictor.weight, std=0.001)
+         if self.predictor.bias is not None:
+             nn.init.constant_(self.predictor.bias, 0)
+
+     @classmethod
+     def from_config(cls, cfg, input_shape):
+         ret = super().from_config(cfg, input_shape)
+         conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
+         num_conv = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV
+         ret.update(
+             conv_dims=[conv_dim] * (num_conv + 1), # +1 for ConvTranspose
+             conv_norm=cfg.MODEL.ROI_MASK_HEAD.NORM,
+             input_shape=input_shape,
+         )
+         if cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK:
+             ret["num_classes"] = 1
+         else:
+             ret["num_classes"] = cfg.MODEL.ROI_HEADS.NUM_CLASSES
+         return ret
+
+     def layers(self, x):
+         for layer in self:
+             x = layer(x)
+         return x
+
+
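Because MaskRCNNConvUpsampleHead is @configurable, it can also be built without a cfg by passing the __init__ arguments directly. A sketch with made-up sizes (256 channels, four 3x3 convs, 80 classes), mirroring what from_config would assemble; the import path assumes this vendored layout:

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.mask_head import MaskRCNNConvUpsampleHead

head = MaskRCNNConvUpsampleHead(
    ShapeSpec(channels=256),
    num_classes=80,
    conv_dims=[256] * 5,  # 4 conv layers + the final ConvTranspose2d
)
x = torch.randn(2, 256, 14, 14)  # dummy pooled ROI features
print(head.layers(x).shape)      # torch.Size([2, 80, 28, 28]) -- the deconv doubles 14 -> 28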
+ def build_mask_head(cfg, input_shape):
+     """
+     Build a mask head defined by `cfg.MODEL.ROI_MASK_HEAD.NAME`.
+     """
+     name = cfg.MODEL.ROI_MASK_HEAD.NAME
+     return ROI_MASK_HEAD_REGISTRY.get(name)(cfg, input_shape)
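build_mask_head resolves cfg.MODEL.ROI_MASK_HEAD.NAME through the registry, so a custom head only needs to be registered and then named in the config. A sketch under those assumptions (TinyMaskHead is a hypothetical example, not part of this diff):

from torch import nn
from annotator.oneformer.detectron2.modeling.roi_heads.mask_head import (
    ROI_MASK_HEAD_REGISTRY,
    BaseMaskRCNNHead,
)

@ROI_MASK_HEAD_REGISTRY.register()
class TinyMaskHead(BaseMaskRCNNHead):
    """Single 1x1 conv mask predictor, registered so the config can select it by name."""

    def __init__(self, cfg, input_shape):
        super().__init__(vis_period=cfg.VIS_PERIOD)
        self.predictor = nn.Conv2d(input_shape.channels, cfg.MODEL.ROI_HEADS.NUM_CLASSES, 1)

    def layers(self, x):
        return self.predictor(x)

# Then set cfg.MODEL.ROI_MASK_HEAD.NAME = "TinyMaskHead" and call build_mask_head(cfg, input_shape).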