toto10 committed
Commit e7908a1
1 Parent(s): f5d05f8

af5f9ea25cd84b6c327d58a09e9ee787fc974290ff2d7d5dfe22b54aad11d08d

Files changed (50)
  1. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp +75 -0
  2. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu +145 -0
  3. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/vision.cpp +117 -0
  4. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/deform_conv.py +514 -0
  5. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/losses.py +133 -0
  6. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/mask_ops.py +275 -0
  7. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/nms.py +144 -0
  8. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align.py +74 -0
  9. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align_rotated.py +100 -0
  10. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/rotated_boxes.py +21 -0
  11. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/shape_spec.py +18 -0
  12. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/wrappers.py +162 -0
  13. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/__init__.py +10 -0
  14. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/model_zoo.py +213 -0
  15. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/__init__.py +64 -0
  16. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/anchor_generator.py +386 -0
  17. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/__init__.py +20 -0
  18. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/backbone.py +74 -0
  19. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/build.py +33 -0
  20. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/fpn.py +268 -0
  21. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/mvit.py +448 -0
  22. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/regnet.py +452 -0
  23. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/resnet.py +694 -0
  24. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/swin.py +695 -0
  25. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/utils.py +186 -0
  26. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/vit.py +524 -0
  27. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/box_regression.py +369 -0
  28. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/matcher.py +127 -0
  29. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/__init__.py +16 -0
  30. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/build.py +24 -0
  31. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/dense_detector.py +294 -0
  32. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/fcos.py +328 -0
  33. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/panoptic_fpn.py +269 -0
  34. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/rcnn.py +341 -0
  35. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/retinanet.py +439 -0
  36. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/semantic_seg.py +267 -0
  37. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/mmdet_wrapper.py +273 -0
  38. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/poolers.py +263 -0
  39. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/postprocessing.py +100 -0
  40. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/__init__.py +5 -0
  41. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/build.py +24 -0
  42. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/proposal_utils.py +205 -0
  43. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rpn.py +533 -0
  44. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rrpn.py +209 -0
  45. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/__init__.py +29 -0
  46. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/box_head.py +118 -0
  47. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/cascade_rcnn.py +299 -0
  48. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/fast_rcnn.py +569 -0
  49. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/keypoint_head.py +272 -0
  50. extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/mask_head.py +298 -0
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp ADDED
@@ -0,0 +1,75 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ #include "../box_iou_rotated/box_iou_rotated_utils.h"
+ #include "nms_rotated.h"
+
+ namespace detectron2 {
+
+ template <typename scalar_t>
+ at::Tensor nms_rotated_cpu_kernel(
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     const double iou_threshold) {
+   // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
+   // however, the code in this function is much shorter because
+   // we delegate the IoU computation for rotated boxes to
+   // the single_box_iou_rotated function in box_iou_rotated_utils.h
+   AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor");
+   AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor");
+   AT_ASSERTM(
+       dets.scalar_type() == scores.scalar_type(),
+       "dets should have the same type as scores");
+
+   if (dets.numel() == 0) {
+     return at::empty({0}, dets.options().dtype(at::kLong));
+   }
+
+   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+
+   auto ndets = dets.size(0);
+   at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
+   at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
+
+   auto suppressed = suppressed_t.data_ptr<uint8_t>();
+   auto keep = keep_t.data_ptr<int64_t>();
+   auto order = order_t.data_ptr<int64_t>();
+
+   int64_t num_to_keep = 0;
+
+   for (int64_t _i = 0; _i < ndets; _i++) {
+     auto i = order[_i];
+     if (suppressed[i] == 1) {
+       continue;
+     }
+
+     keep[num_to_keep++] = i;
+
+     for (int64_t _j = _i + 1; _j < ndets; _j++) {
+       auto j = order[_j];
+       if (suppressed[j] == 1) {
+         continue;
+       }
+
+       auto ovr = single_box_iou_rotated<scalar_t>(
+           dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>());
+       if (ovr >= iou_threshold) {
+         suppressed[j] = 1;
+       }
+     }
+   }
+   return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
+ }
+
+ at::Tensor nms_rotated_cpu(
+     // input must be contiguous
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     const double iou_threshold) {
+   auto result = at::empty({0}, dets.options());
+
+   AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
+     result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
+   });
+   return result;
+ }
+
+ } // namespace detectron2
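
The CPU kernel above is a straightforward greedy NMS over score-sorted boxes. For reference, a minimal pure-Python sketch of the same loop follows; `iou_rotated` is a hypothetical callable standing in for `single_box_iou_rotated`, which is not reimplemented here.

import torch

def nms_rotated_reference(dets, scores, iou_threshold, iou_rotated):
    # Greedy suppression in decreasing score order, mirroring nms_rotated_cpu_kernel.
    order = scores.sort(descending=True).indices.tolist()
    suppressed = [False] * dets.shape[0]
    keep = []
    for pos, i in enumerate(order):
        if suppressed[i]:
            continue
        keep.append(i)
        for j in order[pos + 1:]:
            if not suppressed[j] and iou_rotated(dets[i], dets[j]) >= iou_threshold:
                suppressed[j] = True
    return torch.tensor(keep, dtype=torch.int64)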
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu ADDED
@@ -0,0 +1,145 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+ #include <c10/cuda/CUDAGuard.h>
+ #include <ATen/cuda/CUDAApplyUtils.cuh>
+ #ifdef WITH_CUDA
+ #include "../box_iou_rotated/box_iou_rotated_utils.h"
+ #endif
+ // TODO avoid this when pytorch supports "same directory" hipification
+ #ifdef WITH_HIP
+ #include "box_iou_rotated/box_iou_rotated_utils.h"
+ #endif
+
+ using namespace detectron2;
+
+ namespace {
+ int const threadsPerBlock = sizeof(unsigned long long) * 8;
+ }
+
+ template <typename T>
+ __global__ void nms_rotated_cuda_kernel(
+     const int n_boxes,
+     const double iou_threshold,
+     const T* dev_boxes,
+     unsigned long long* dev_mask) {
+   // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
+
+   const int row_start = blockIdx.y;
+   const int col_start = blockIdx.x;
+
+   // if (row_start > col_start) return;
+
+   const int row_size =
+       min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+   const int col_size =
+       min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+   // Compared to nms_cuda_kernel, where each box is represented with 4 values
+   // (x1, y1, x2, y2), each rotated box is represented with 5 values
+   // (x_center, y_center, width, height, angle_degrees) here.
+   __shared__ T block_boxes[threadsPerBlock * 5];
+   if (threadIdx.x < col_size) {
+     block_boxes[threadIdx.x * 5 + 0] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+     block_boxes[threadIdx.x * 5 + 1] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+     block_boxes[threadIdx.x * 5 + 2] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+     block_boxes[threadIdx.x * 5 + 3] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+     block_boxes[threadIdx.x * 5 + 4] =
+         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+   }
+   __syncthreads();
+
+   if (threadIdx.x < row_size) {
+     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+     const T* cur_box = dev_boxes + cur_box_idx * 5;
+     int i = 0;
+     unsigned long long t = 0;
+     int start = 0;
+     if (row_start == col_start) {
+       start = threadIdx.x + 1;
+     }
+     for (i = start; i < col_size; i++) {
+       // Instead of devIoU used by original horizontal nms, here
+       // we use the single_box_iou_rotated function from box_iou_rotated_utils.h
+       if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >
+           iou_threshold) {
+         t |= 1ULL << i;
+       }
+     }
+     const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
+     dev_mask[cur_box_idx * col_blocks + col_start] = t;
+   }
+ }
+
+ namespace detectron2 {
+
+ at::Tensor nms_rotated_cuda(
+     // input must be contiguous
+     const at::Tensor& dets,
+     const at::Tensor& scores,
+     double iou_threshold) {
+   // using scalar_t = float;
+   AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
+   AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
+   at::cuda::CUDAGuard device_guard(dets.device());
+
+   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+   auto dets_sorted = dets.index_select(0, order_t);
+
+   auto dets_num = dets.size(0);
+
+   const int col_blocks =
+       at::cuda::ATenCeilDiv(static_cast<int>(dets_num), threadsPerBlock);
+
+   at::Tensor mask =
+       at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
+
+   dim3 blocks(col_blocks, col_blocks);
+   dim3 threads(threadsPerBlock);
+   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+   AT_DISPATCH_FLOATING_TYPES(
+       dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
+         nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
+             dets_num,
+             iou_threshold,
+             dets_sorted.data_ptr<scalar_t>(),
+             (unsigned long long*)mask.data_ptr<int64_t>());
+       });
+
+   at::Tensor mask_cpu = mask.to(at::kCPU);
+   unsigned long long* mask_host =
+       (unsigned long long*)mask_cpu.data_ptr<int64_t>();
+
+   std::vector<unsigned long long> remv(col_blocks);
+   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+   at::Tensor keep =
+       at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
+   int64_t* keep_out = keep.data_ptr<int64_t>();
+
+   int num_to_keep = 0;
+   for (int i = 0; i < dets_num; i++) {
+     int nblock = i / threadsPerBlock;
+     int inblock = i % threadsPerBlock;
+
+     if (!(remv[nblock] & (1ULL << inblock))) {
+       keep_out[num_to_keep++] = i;
+       unsigned long long* p = mask_host + i * col_blocks;
+       for (int j = nblock; j < col_blocks; j++) {
+         remv[j] |= p[j];
+       }
+     }
+   }
+
+   AT_CUDA_CHECK(cudaGetLastError());
+   return order_t.index(
+       {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
+            .to(order_t.device(), keep.scalar_type())});
+ }
+
+ } // namespace detectron2
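
The CUDA path splits the work into 64-box blocks: each thread writes one 64-bit word of a suppression matrix, and the final keep list is computed on the host by sweeping that matrix in score order. Below is a small Python sketch of the host-side reduction (the `remv`/`mask_host` loop above); the function name and the flat-list mask layout are illustrative only.

def reduce_suppression_mask(mask_host, dets_num, threads_per_block=64):
    # mask_host: flat list of 64-bit words, `col_blocks` words per box;
    # bit j of word b in row i is set when box i suppresses box (b * 64 + j).
    col_blocks = (dets_num + threads_per_block - 1) // threads_per_block
    remv = [0] * col_blocks  # suppression bits accumulated from already-kept boxes
    keep = []
    for i in range(dets_num):  # boxes are already sorted by decreasing score
        nblock, inblock = divmod(i, threads_per_block)
        if not (remv[nblock] >> inblock) & 1:
            keep.append(i)
            row = mask_host[i * col_blocks:(i + 1) * col_blocks]
            for j in range(nblock, col_blocks):
                remv[j] |= row[j]
    return keep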
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/csrc/vision.cpp ADDED
@@ -0,0 +1,117 @@
+ // Copyright (c) Facebook, Inc. and its affiliates.
+
+ #include <torch/extension.h>
+ #include "ROIAlignRotated/ROIAlignRotated.h"
+ #include "box_iou_rotated/box_iou_rotated.h"
+ #include "cocoeval/cocoeval.h"
+ #include "deformable/deform_conv.h"
+ #include "nms_rotated/nms_rotated.h"
+
+ namespace detectron2 {
+
+ #if defined(WITH_CUDA) || defined(WITH_HIP)
+ extern int get_cudart_version();
+ #endif
+
+ std::string get_cuda_version() {
+ #if defined(WITH_CUDA) || defined(WITH_HIP)
+   std::ostringstream oss;
+
+ #if defined(WITH_CUDA)
+   oss << "CUDA ";
+ #else
+   oss << "HIP ";
+ #endif
+
+   // copied from
+   // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
+   auto printCudaStyleVersion = [&](int v) {
+     oss << (v / 1000) << "." << (v / 10 % 100);
+     if (v % 10 != 0) {
+       oss << "." << (v % 10);
+     }
+   };
+   printCudaStyleVersion(get_cudart_version());
+   return oss.str();
+ #else // neither CUDA nor HIP
+   return std::string("not available");
+ #endif
+ }
+
+ bool has_cuda() {
+ #if defined(WITH_CUDA)
+   return true;
+ #else
+   return false;
+ #endif
+ }
+
+ // similar to
+ // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
+ std::string get_compiler_version() {
+   std::ostringstream ss;
+ #if defined(__GNUC__)
+ #ifndef __clang__
+
+ #if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8))
+ #error "GCC >= 4.9 is required!"
+ #endif
+
+   { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
+ #endif
+ #endif
+
+ #if defined(__clang_major__)
+   {
+     ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
+        << __clang_patchlevel__;
+   }
+ #endif
+
+ #if defined(_MSC_VER)
+   { ss << "MSVC " << _MSC_FULL_VER; }
+ #endif
+   return ss.str();
+ }
+
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+   m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
+   m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");
+   m.def("has_cuda", &has_cuda, "has_cuda");
+
+   m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
+   m.def(
+       "deform_conv_backward_input",
+       &deform_conv_backward_input,
+       "deform_conv_backward_input");
+   m.def(
+       "deform_conv_backward_filter",
+       &deform_conv_backward_filter,
+       "deform_conv_backward_filter");
+   m.def(
+       "modulated_deform_conv_forward",
+       &modulated_deform_conv_forward,
+       "modulated_deform_conv_forward");
+   m.def(
+       "modulated_deform_conv_backward",
+       &modulated_deform_conv_backward,
+       "modulated_deform_conv_backward");
+
+   m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
+   m.def(
+       "COCOevalEvaluateImages",
+       &COCOeval::EvaluateImages,
+       "COCOeval::EvaluateImages");
+   pybind11::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
+       .def(pybind11::init<uint64_t, double, double, bool, bool>());
+   pybind11::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation")
+       .def(pybind11::init<>());
+ }
+
+ TORCH_LIBRARY(detectron2, m) {
+   m.def("nms_rotated", &nms_rotated);
+   m.def("box_iou_rotated", &box_iou_rotated);
+   m.def("roi_align_rotated_forward", &ROIAlignRotated_forward);
+   m.def("roi_align_rotated_backward", &ROIAlignRotated_backward);
+ }
+ } // namespace detectron2
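
vision.cpp exposes two surfaces: a pybind11 module carrying build metadata plus the deformable-conv bindings, and a TORCH_LIBRARY registration that makes the rotated-box ops reachable through torch.ops. Assuming the extension has been compiled and is importable as `annotator.oneformer.detectron2._C` (the import path used by deform_conv.py below), usage looks roughly like this:

import torch
from annotator.oneformer.detectron2 import _C  # importing the extension registers the ops

print(_C.has_cuda(), _C.get_cuda_version(), _C.get_compiler_version())

boxes = torch.tensor([[10.0, 10.0, 8.0, 6.0, 0.0],
                      [10.0, 10.0, 8.0, 6.0, 5.0]])  # (x_ctr, y_ctr, w, h, angle)
scores = torch.tensor([0.9, 0.8])
keep = torch.ops.detectron2.nms_rotated(boxes, scores, 0.5)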
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/deform_conv.py ADDED
@@ -0,0 +1,514 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ import math
+ from functools import lru_cache
+ import torch
+ from torch import nn
+ from torch.autograd import Function
+ from torch.autograd.function import once_differentiable
+ from torch.nn.modules.utils import _pair
+ from torchvision.ops import deform_conv2d
+
+ from annotator.oneformer.detectron2.utils.develop import create_dummy_class, create_dummy_func
+
+ from .wrappers import _NewEmptyTensorOp
+
+
+ class _DeformConv(Function):
+     @staticmethod
+     def forward(
+         ctx,
+         input,
+         offset,
+         weight,
+         stride=1,
+         padding=0,
+         dilation=1,
+         groups=1,
+         deformable_groups=1,
+         im2col_step=64,
+     ):
+         if input is not None and input.dim() != 4:
+             raise ValueError(
+                 "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim())
+             )
+         ctx.stride = _pair(stride)
+         ctx.padding = _pair(padding)
+         ctx.dilation = _pair(dilation)
+         ctx.groups = groups
+         ctx.deformable_groups = deformable_groups
+         ctx.im2col_step = im2col_step
+
+         ctx.save_for_backward(input, offset, weight)
+
+         output = input.new_empty(
+             _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)
+         )
+
+         ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones
+
+         if not input.is_cuda:
+             # TODO: let torchvision support full features of our deformconv.
+             if deformable_groups != 1:
+                 raise NotImplementedError(
+                     "Deformable Conv with deformable_groups != 1 is not supported on CPUs!"
+                 )
+             return deform_conv2d(
+                 input, offset, weight, stride=stride, padding=padding, dilation=dilation
+             )
+         else:
+             cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+             assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+             _C.deform_conv_forward(
+                 input,
+                 weight,
+                 offset,
+                 output,
+                 ctx.bufs_[0],
+                 ctx.bufs_[1],
+                 weight.size(3),
+                 weight.size(2),
+                 ctx.stride[1],
+                 ctx.stride[0],
+                 ctx.padding[1],
+                 ctx.padding[0],
+                 ctx.dilation[1],
+                 ctx.dilation[0],
+                 ctx.groups,
+                 ctx.deformable_groups,
+                 cur_im2col_step,
+             )
+         return output
+
+     @staticmethod
+     @once_differentiable
+     def backward(ctx, grad_output):
+         input, offset, weight = ctx.saved_tensors
+
+         grad_input = grad_offset = grad_weight = None
+
+         if not grad_output.is_cuda:
+             raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+         else:
+             cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
+             assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
+
+             if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+                 grad_input = torch.zeros_like(input)
+                 grad_offset = torch.zeros_like(offset)
+                 _C.deform_conv_backward_input(
+                     input,
+                     offset,
+                     grad_output,
+                     grad_input,
+                     grad_offset,
+                     weight,
+                     ctx.bufs_[0],
+                     weight.size(3),
+                     weight.size(2),
+                     ctx.stride[1],
+                     ctx.stride[0],
+                     ctx.padding[1],
+                     ctx.padding[0],
+                     ctx.dilation[1],
+                     ctx.dilation[0],
+                     ctx.groups,
+                     ctx.deformable_groups,
+                     cur_im2col_step,
+                 )
+
+             if ctx.needs_input_grad[2]:
+                 grad_weight = torch.zeros_like(weight)
+                 _C.deform_conv_backward_filter(
+                     input,
+                     offset,
+                     grad_output,
+                     grad_weight,
+                     ctx.bufs_[0],
+                     ctx.bufs_[1],
+                     weight.size(3),
+                     weight.size(2),
+                     ctx.stride[1],
+                     ctx.stride[0],
+                     ctx.padding[1],
+                     ctx.padding[0],
+                     ctx.dilation[1],
+                     ctx.dilation[0],
+                     ctx.groups,
+                     ctx.deformable_groups,
+                     1,
+                     cur_im2col_step,
+                 )
+
+         return grad_input, grad_offset, grad_weight, None, None, None, None, None, None
+
+     @staticmethod
+     def _output_size(input, weight, padding, dilation, stride):
+         channels = weight.size(0)
+         output_size = (input.size(0), channels)
+         for d in range(input.dim() - 2):
+             in_size = input.size(d + 2)
+             pad = padding[d]
+             kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
+             stride_ = stride[d]
+             output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
+         if not all(map(lambda s: s > 0, output_size)):
+             raise ValueError(
+                 "convolution input is too small (output would be {})".format(
+                     "x".join(map(str, output_size))
+                 )
+             )
+         return output_size
+
+     @staticmethod
+     @lru_cache(maxsize=128)
+     def _cal_im2col_step(input_size, default_size):
+         """
+         Calculate proper im2col step size, which should be divisible by input_size and not larger
+         than prefer_size. Meanwhile the step size should be as large as possible to be more
+         efficient. So we choose the largest one among all divisors of input_size which are smaller
+         than prefer_size.
+         :param input_size: input batch size .
+         :param default_size: default preferred im2col step size.
+         :return: the largest proper step size.
+         """
+         if input_size <= default_size:
+             return input_size
+         best_step = 1
+         for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)):
+             if input_size % step == 0:
+                 if input_size // step <= default_size:
+                     return input_size // step
+                 best_step = step
+
+         return best_step
+
+
+ class _ModulatedDeformConv(Function):
+     @staticmethod
+     def forward(
+         ctx,
+         input,
+         offset,
+         mask,
+         weight,
+         bias=None,
+         stride=1,
+         padding=0,
+         dilation=1,
+         groups=1,
+         deformable_groups=1,
+     ):
+         ctx.stride = stride
+         ctx.padding = padding
+         ctx.dilation = dilation
+         ctx.groups = groups
+         ctx.deformable_groups = deformable_groups
+         ctx.with_bias = bias is not None
+         if not ctx.with_bias:
+             bias = input.new_empty(1)  # fake tensor
+         if not input.is_cuda:
+             raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+         if (
+             weight.requires_grad
+             or mask.requires_grad
+             or offset.requires_grad
+             or input.requires_grad
+         ):
+             ctx.save_for_backward(input, offset, mask, weight, bias)
+         output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight))
+         ctx._bufs = [input.new_empty(0), input.new_empty(0)]
+         _C.modulated_deform_conv_forward(
+             input,
+             weight,
+             bias,
+             ctx._bufs[0],
+             offset,
+             mask,
+             output,
+             ctx._bufs[1],
+             weight.shape[2],
+             weight.shape[3],
+             ctx.stride,
+             ctx.stride,
+             ctx.padding,
+             ctx.padding,
+             ctx.dilation,
+             ctx.dilation,
+             ctx.groups,
+             ctx.deformable_groups,
+             ctx.with_bias,
+         )
+         return output
+
+     @staticmethod
+     @once_differentiable
+     def backward(ctx, grad_output):
+         if not grad_output.is_cuda:
+             raise NotImplementedError("Deformable Conv is not supported on CPUs!")
+         input, offset, mask, weight, bias = ctx.saved_tensors
+         grad_input = torch.zeros_like(input)
+         grad_offset = torch.zeros_like(offset)
+         grad_mask = torch.zeros_like(mask)
+         grad_weight = torch.zeros_like(weight)
+         grad_bias = torch.zeros_like(bias)
+         _C.modulated_deform_conv_backward(
+             input,
+             weight,
+             bias,
+             ctx._bufs[0],
+             offset,
+             mask,
+             ctx._bufs[1],
+             grad_input,
+             grad_weight,
+             grad_bias,
+             grad_offset,
+             grad_mask,
+             grad_output,
+             weight.shape[2],
+             weight.shape[3],
+             ctx.stride,
+             ctx.stride,
+             ctx.padding,
+             ctx.padding,
+             ctx.dilation,
+             ctx.dilation,
+             ctx.groups,
+             ctx.deformable_groups,
+             ctx.with_bias,
+         )
+         if not ctx.with_bias:
+             grad_bias = None
+
+         return (
+             grad_input,
+             grad_offset,
+             grad_mask,
+             grad_weight,
+             grad_bias,
+             None,
+             None,
+             None,
+             None,
+             None,
+         )
+
+     @staticmethod
+     def _infer_shape(ctx, input, weight):
+         n = input.size(0)
+         channels_out = weight.size(0)
+         height, width = input.shape[2:4]
+         kernel_h, kernel_w = weight.shape[2:4]
+         height_out = (
+             height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)
+         ) // ctx.stride + 1
+         width_out = (
+             width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)
+         ) // ctx.stride + 1
+         return n, channels_out, height_out, width_out
+
+
+ deform_conv = _DeformConv.apply
+ modulated_deform_conv = _ModulatedDeformConv.apply
+
+
+ class DeformConv(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         out_channels,
+         kernel_size,
+         stride=1,
+         padding=0,
+         dilation=1,
+         groups=1,
+         deformable_groups=1,
+         bias=False,
+         norm=None,
+         activation=None,
+     ):
+         """
+         Deformable convolution from :paper:`deformconv`.
+
+         Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+         Args:
+             deformable_groups (int): number of groups used in deformable convolution.
+             norm (nn.Module, optional): a normalization layer
+             activation (callable(Tensor) -> Tensor): a callable activation function
+         """
+         super(DeformConv, self).__init__()
+
+         assert not bias
+         assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format(
+             in_channels, groups
+         )
+         assert (
+             out_channels % groups == 0
+         ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups)
+
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.kernel_size = _pair(kernel_size)
+         self.stride = _pair(stride)
+         self.padding = _pair(padding)
+         self.dilation = _pair(dilation)
+         self.groups = groups
+         self.deformable_groups = deformable_groups
+         self.norm = norm
+         self.activation = activation
+
+         self.weight = nn.Parameter(
+             torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)
+         )
+         self.bias = None
+
+         nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+
+     def forward(self, x, offset):
+         if x.numel() == 0:
+             # When input is empty, we want to return a empty tensor with "correct" shape,
+             # So that the following operations will not panic
+             # if they check for the shape of the tensor.
+             # This computes the height and width of the output tensor
+             output_shape = [
+                 (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                 for i, p, di, k, s in zip(
+                     x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                 )
+             ]
+             output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+             return _NewEmptyTensorOp.apply(x, output_shape)
+
+         x = deform_conv(
+             x,
+             offset,
+             self.weight,
+             self.stride,
+             self.padding,
+             self.dilation,
+             self.groups,
+             self.deformable_groups,
+         )
+         if self.norm is not None:
+             x = self.norm(x)
+         if self.activation is not None:
+             x = self.activation(x)
+         return x
+
+     def extra_repr(self):
+         tmpstr = "in_channels=" + str(self.in_channels)
+         tmpstr += ", out_channels=" + str(self.out_channels)
+         tmpstr += ", kernel_size=" + str(self.kernel_size)
+         tmpstr += ", stride=" + str(self.stride)
+         tmpstr += ", padding=" + str(self.padding)
+         tmpstr += ", dilation=" + str(self.dilation)
+         tmpstr += ", groups=" + str(self.groups)
+         tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+         tmpstr += ", bias=False"
+         return tmpstr
+
+
+ class ModulatedDeformConv(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         out_channels,
+         kernel_size,
+         stride=1,
+         padding=0,
+         dilation=1,
+         groups=1,
+         deformable_groups=1,
+         bias=True,
+         norm=None,
+         activation=None,
+     ):
+         """
+         Modulated deformable convolution from :paper:`deformconv2`.
+
+         Arguments are similar to :class:`Conv2D`. Extra arguments:
+
+         Args:
+             deformable_groups (int): number of groups used in deformable convolution.
+             norm (nn.Module, optional): a normalization layer
+             activation (callable(Tensor) -> Tensor): a callable activation function
+         """
+         super(ModulatedDeformConv, self).__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.kernel_size = _pair(kernel_size)
+         self.stride = stride
+         self.padding = padding
+         self.dilation = dilation
+         self.groups = groups
+         self.deformable_groups = deformable_groups
+         self.with_bias = bias
+         self.norm = norm
+         self.activation = activation
+
+         self.weight = nn.Parameter(
+             torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
+         )
+         if bias:
+             self.bias = nn.Parameter(torch.Tensor(out_channels))
+         else:
+             self.bias = None
+
+         nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
+         if self.bias is not None:
+             nn.init.constant_(self.bias, 0)
+
+     def forward(self, x, offset, mask):
+         if x.numel() == 0:
+             output_shape = [
+                 (i + 2 * p - (di * (k - 1) + 1)) // s + 1
+                 for i, p, di, k, s in zip(
+                     x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
+                 )
+             ]
+             output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
+             return _NewEmptyTensorOp.apply(x, output_shape)
+
+         x = modulated_deform_conv(
+             x,
+             offset,
+             mask,
+             self.weight,
+             self.bias,
+             self.stride,
+             self.padding,
+             self.dilation,
+             self.groups,
+             self.deformable_groups,
+         )
+         if self.norm is not None:
+             x = self.norm(x)
+         if self.activation is not None:
+             x = self.activation(x)
+         return x
+
+     def extra_repr(self):
+         tmpstr = "in_channels=" + str(self.in_channels)
+         tmpstr += ", out_channels=" + str(self.out_channels)
+         tmpstr += ", kernel_size=" + str(self.kernel_size)
+         tmpstr += ", stride=" + str(self.stride)
+         tmpstr += ", padding=" + str(self.padding)
+         tmpstr += ", dilation=" + str(self.dilation)
+         tmpstr += ", groups=" + str(self.groups)
+         tmpstr += ", deformable_groups=" + str(self.deformable_groups)
+         tmpstr += ", bias=" + str(self.with_bias)
+         return tmpstr
+
+
+ try:
+     from annotator.oneformer.detectron2 import _C
+ except ImportError:
+     # TODO: register ops natively so there is no need to import _C.
+     _msg = "detectron2 is not compiled successfully, please build following the instructions!"
+     _args = ("detectron2._C", _msg)
+     DeformConv = create_dummy_class("DeformConv", *_args)
+     ModulatedDeformConv = create_dummy_class("ModulatedDeformConv", *_args)
+     deform_conv = create_dummy_func("deform_conv", *_args)
+     modulated_deform_conv = create_dummy_func("modulated_deform_conv", *_args)
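
A hedged usage sketch for the DeformConv module above: the offset tensor is normally predicted by an ordinary convolution with 2 * deformable_groups * kH * kW output channels, and the custom kernels need the compiled _C extension on a CUDA device (the CPU fallback only covers deformable_groups == 1). The channel and spatial sizes below are arbitrary.

import torch
from torch import nn
from annotator.oneformer.detectron2.layers.deform_conv import DeformConv

deform = DeformConv(64, 64, kernel_size=3, padding=1, deformable_groups=1).cuda()
offset_conv = nn.Conv2d(64, 2 * 1 * 3 * 3, kernel_size=3, padding=1).cuda()

x = torch.randn(2, 64, 32, 32, device="cuda")
offset = offset_conv(x)   # (2, 18, 32, 32): two offset coordinates per kernel tap
out = deform(x, offset)   # (2, 64, 32, 32)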
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/losses.py ADDED
@@ -0,0 +1,133 @@
+ import math
+ import torch
+
+
+ def diou_loss(
+     boxes1: torch.Tensor,
+     boxes2: torch.Tensor,
+     reduction: str = "none",
+     eps: float = 1e-7,
+ ) -> torch.Tensor:
+     """
+     Distance Intersection over Union Loss (Zhaohui Zheng et. al)
+     https://arxiv.org/abs/1911.08287
+     Args:
+         boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
+         reduction: 'none' | 'mean' | 'sum'
+             'none': No reduction will be applied to the output.
+             'mean': The output will be averaged.
+             'sum': The output will be summed.
+         eps (float): small number to prevent division by zero
+     """
+
+     x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+     x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+     # TODO: use torch._assert_async() when pytorch 1.8 support is dropped
+     assert (x2 >= x1).all(), "bad box: x1 larger than x2"
+     assert (y2 >= y1).all(), "bad box: y1 larger than y2"
+
+     # Intersection keypoints
+     xkis1 = torch.max(x1, x1g)
+     ykis1 = torch.max(y1, y1g)
+     xkis2 = torch.min(x2, x2g)
+     ykis2 = torch.min(y2, y2g)
+
+     intsct = torch.zeros_like(x1)
+     mask = (ykis2 > ykis1) & (xkis2 > xkis1)
+     intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
+     union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
+     iou = intsct / union
+
+     # smallest enclosing box
+     xc1 = torch.min(x1, x1g)
+     yc1 = torch.min(y1, y1g)
+     xc2 = torch.max(x2, x2g)
+     yc2 = torch.max(y2, y2g)
+     diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
+
+     # centers of boxes
+     x_p = (x2 + x1) / 2
+     y_p = (y2 + y1) / 2
+     x_g = (x1g + x2g) / 2
+     y_g = (y1g + y2g) / 2
+     distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
+
+     # Eqn. (7)
+     loss = 1 - iou + (distance / diag_len)
+     if reduction == "mean":
+         loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+     elif reduction == "sum":
+         loss = loss.sum()
+
+     return loss
+
+
+ def ciou_loss(
+     boxes1: torch.Tensor,
+     boxes2: torch.Tensor,
+     reduction: str = "none",
+     eps: float = 1e-7,
+ ) -> torch.Tensor:
+     """
+     Complete Intersection over Union Loss (Zhaohui Zheng et. al)
+     https://arxiv.org/abs/1911.08287
+     Args:
+         boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
+         reduction: 'none' | 'mean' | 'sum'
+             'none': No reduction will be applied to the output.
+             'mean': The output will be averaged.
+             'sum': The output will be summed.
+         eps (float): small number to prevent division by zero
+     """
+
+     x1, y1, x2, y2 = boxes1.unbind(dim=-1)
+     x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
+
+     # TODO: use torch._assert_async() when pytorch 1.8 support is dropped
+     assert (x2 >= x1).all(), "bad box: x1 larger than x2"
+     assert (y2 >= y1).all(), "bad box: y1 larger than y2"
+
+     # Intersection keypoints
+     xkis1 = torch.max(x1, x1g)
+     ykis1 = torch.max(y1, y1g)
+     xkis2 = torch.min(x2, x2g)
+     ykis2 = torch.min(y2, y2g)
+
+     intsct = torch.zeros_like(x1)
+     mask = (ykis2 > ykis1) & (xkis2 > xkis1)
+     intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
+     union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
+     iou = intsct / union
+
+     # smallest enclosing box
+     xc1 = torch.min(x1, x1g)
+     yc1 = torch.min(y1, y1g)
+     xc2 = torch.max(x2, x2g)
+     yc2 = torch.max(y2, y2g)
+     diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
+
+     # centers of boxes
+     x_p = (x2 + x1) / 2
+     y_p = (y2 + y1) / 2
+     x_g = (x1g + x2g) / 2
+     y_g = (y1g + y2g) / 2
+     distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
+
+     # width and height of boxes
+     w_pred = x2 - x1
+     h_pred = y2 - y1
+     w_gt = x2g - x1g
+     h_gt = y2g - y1g
+     v = (4 / (math.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2)
+     with torch.no_grad():
+         alpha = v / (1 - iou + v + eps)
+
+     # Eqn. (10)
+     loss = 1 - iou + (distance / diag_len) + alpha * v
+     if reduction == "mean":
+         loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
+     elif reduction == "sum":
+         loss = loss.sum()
+
+     return loss
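
A quick sanity check of the two losses on a pair of axis-aligned XYXY boxes (a minimal sketch; the import path follows this file's location in the package):

import torch
from annotator.oneformer.detectron2.layers.losses import ciou_loss, diou_loss

pred = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
gt = torch.tensor([[2.0, 2.0, 12.0, 12.0]])

print(diou_loss(pred, gt, reduction="mean"))  # 1 - IoU + normalized center distance
print(ciou_loss(pred, gt, reduction="mean"))  # adds the aspect-ratio term alpha * v
print(diou_loss(gt, gt, reduction="mean"))    # ~0 for identical boxes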
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/mask_ops.py ADDED
@@ -0,0 +1,275 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ import numpy as np
+ from typing import Tuple
+ import torch
+ from PIL import Image
+ from torch.nn import functional as F
+
+ __all__ = ["paste_masks_in_image"]
+
+
+ BYTES_PER_FLOAT = 4
+ # TODO: This memory limit may be too much or too little. It would be better to
+ # determine it based on available resources.
+ GPU_MEM_LIMIT = 1024**3  # 1 GB memory limit
+
+
+ def _do_paste_mask(masks, boxes, img_h: int, img_w: int, skip_empty: bool = True):
+     """
+     Args:
+         masks: N, 1, H, W
+         boxes: N, 4
+         img_h, img_w (int):
+         skip_empty (bool): only paste masks within the region that
+             tightly bound all boxes, and returns the results this region only.
+             An important optimization for CPU.
+
+     Returns:
+         if skip_empty == False, a mask of shape (N, img_h, img_w)
+         if skip_empty == True, a mask of shape (N, h', w'), and the slice
+             object for the corresponding region.
+     """
+     # On GPU, paste all masks together (up to chunk size)
+     # by using the entire image to sample the masks
+     # Compared to pasting them one by one,
+     # this has more operations but is faster on COCO-scale dataset.
+     device = masks.device
+
+     if skip_empty and not torch.jit.is_scripting():
+         x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to(
+             dtype=torch.int32
+         )
+         x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+         y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+     else:
+         x0_int, y0_int = 0, 0
+         x1_int, y1_int = img_w, img_h
+     x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+     N = masks.shape[0]
+
+     img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
+     img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
+     img_y = (img_y - y0) / (y1 - y0) * 2 - 1
+     img_x = (img_x - x0) / (x1 - x0) * 2 - 1
+     # img_x, img_y have shapes (N, w), (N, h)
+
+     gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
+     gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
+     grid = torch.stack([gx, gy], dim=3)
+
+     if not torch.jit.is_scripting():
+         if not masks.dtype.is_floating_point:
+             masks = masks.float()
+     img_masks = F.grid_sample(masks, grid.to(masks.dtype), align_corners=False)
+
+     if skip_empty and not torch.jit.is_scripting():
+         return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
+     else:
+         return img_masks[:, 0], ()
+
+
+ # Annotate boxes as Tensor (but not Boxes) in order to use scripting
+ @torch.jit.script_if_tracing
+ def paste_masks_in_image(
+     masks: torch.Tensor, boxes: torch.Tensor, image_shape: Tuple[int, int], threshold: float = 0.5
+ ):
+     """
+     Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image.
+     The location, height, and width for pasting each mask is determined by their
+     corresponding bounding boxes in boxes.
+
+     Note:
+         This is a complicated but more accurate implementation. In actual deployment, it is
+         often enough to use a faster but less accurate implementation.
+         See :func:`paste_mask_in_image_old` in this file for an alternative implementation.
+
+     Args:
+         masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
+             detected object instances in the image and Hmask, Wmask are the mask width and mask
+             height of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1].
+         boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4).
+             boxes[i] and masks[i] correspond to the same object instance.
+         image_shape (tuple): height, width
+         threshold (float): A threshold in [0, 1] for converting the (soft) masks to
+             binary masks.
+
+     Returns:
+         img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
+             number of detected object instances and Himage, Wimage are the image width
+             and height. img_masks[i] is a binary mask for object instance i.
+     """
+
+     assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
+     N = len(masks)
+     if N == 0:
+         return masks.new_empty((0,) + image_shape, dtype=torch.uint8)
+     if not isinstance(boxes, torch.Tensor):
+         boxes = boxes.tensor
+     device = boxes.device
+     assert len(boxes) == N, boxes.shape
+
+     img_h, img_w = image_shape
+
+     # The actual implementation split the input into chunks,
+     # and paste them chunk by chunk.
+     if device.type == "cpu" or torch.jit.is_scripting():
+         # CPU is most efficient when they are pasted one by one with skip_empty=True
+         # so that it performs minimal number of operations.
+         num_chunks = N
+     else:
+         # GPU benefits from parallelism for larger chunks, but may have memory issue
+         # int(img_h) because shape may be tensors in tracing
+         num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
+         assert (
+             num_chunks <= N
+         ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it"
+     chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
+
+     img_masks = torch.zeros(
+         N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8
+     )
+     for inds in chunks:
+         masks_chunk, spatial_inds = _do_paste_mask(
+             masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu"
+         )
+
+         if threshold >= 0:
+             masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
+         else:
+             # for visualization and debugging
+             masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
+
+         if torch.jit.is_scripting():  # Scripting does not use the optimized codepath
+             img_masks[inds] = masks_chunk
+         else:
+             img_masks[(inds,) + spatial_inds] = masks_chunk
+     return img_masks
+
+
+ # The below are the original paste function (from Detectron1) which has
+ # larger quantization error.
+ # It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample.
+
+
+ def paste_mask_in_image_old(mask, box, img_h, img_w, threshold):
+     """
+     Paste a single mask in an image.
+     This is a per-box implementation of :func:`paste_masks_in_image`.
+     This function has larger quantization error due to incorrect pixel
+     modeling and is not used any more.
+
+     Args:
+         mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single
+             object instance. Values are in [0, 1].
+         box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners
+             of the object instance.
+         img_h, img_w (int): Image height and width.
+         threshold (float): Mask binarization threshold in [0, 1].
+
+     Returns:
+         im_mask (Tensor):
+             The resized and binarized object mask pasted into the original
+             image plane (a tensor of shape (img_h, img_w)).
+     """
+     # Conversion from continuous box coordinates to discrete pixel coordinates
+     # via truncation (cast to int32). This determines which pixels to paste the
+     # mask onto.
+     box = box.to(dtype=torch.int32)  # Continuous to discrete coordinate conversion
+     # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to
+     # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1
+     # pixels (not x1 - x0 pixels).
+     samples_w = box[2] - box[0] + 1  # Number of pixel samples, *not* geometric width
+     samples_h = box[3] - box[1] + 1  # Number of pixel samples, *not* geometric height
+
+     # Resample the mask from it's original grid to the new samples_w x samples_h grid
+     mask = Image.fromarray(mask.cpu().numpy())
+     mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR)
+     mask = np.array(mask, copy=False)
+
+     if threshold >= 0:
+         mask = np.array(mask > threshold, dtype=np.uint8)
+         mask = torch.from_numpy(mask)
+     else:
+         # for visualization and debugging, we also
+         # allow it to return an unmodified mask
+         mask = torch.from_numpy(mask * 255).to(torch.uint8)
+
+     im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8)
+     x_0 = max(box[0], 0)
+     x_1 = min(box[2] + 1, img_w)
+     y_0 = max(box[1], 0)
+     y_1 = min(box[3] + 1, img_h)
+
+     im_mask[y_0:y_1, x_0:x_1] = mask[
+         (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
+     ]
+     return im_mask
+
+
+ # Our pixel modeling requires extrapolation for any continuous
+ # coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks,
+ # we would like this extrapolation to be an interpolation between boundary values and zero,
+ # instead of using absolute zero or boundary values.
+ # Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this:
+ # masks, scale = pad_masks(masks[:, 0, :, :], 1)
+ # boxes = scale_boxes(boxes.tensor, scale)
+
+
+ def pad_masks(masks, padding):
+     """
+     Args:
+         masks (tensor): A tensor of shape (B, M, M) representing B masks.
+         padding (int): Number of cells to pad on all sides.
+
+     Returns:
+         The padded masks and the scale factor of the padding size / original size.
+     """
+     B = masks.shape[0]
+     M = masks.shape[-1]
+     pad2 = 2 * padding
+     scale = float(M + pad2) / M
+     padded_masks = masks.new_zeros((B, M + pad2, M + pad2))
+     padded_masks[:, padding:-padding, padding:-padding] = masks
+     return padded_masks, scale
+
+
+ def scale_boxes(boxes, scale):
+     """
+     Args:
+         boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4
+             coords representing the corners x0, y0, x1, y1,
+         scale (float): The box scaling factor.
+
+     Returns:
+         Scaled boxes.
+     """
+     w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
+     h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
+     x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
+     y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
+
+     w_half *= scale
+     h_half *= scale
+
+     scaled_boxes = torch.zeros_like(boxes)
+     scaled_boxes[:, 0] = x_c - w_half
+     scaled_boxes[:, 2] = x_c + w_half
+     scaled_boxes[:, 1] = y_c - h_half
+     scaled_boxes[:, 3] = y_c + h_half
+     return scaled_boxes
+
+
+ @torch.jit.script_if_tracing
+ def _paste_masks_tensor_shape(
+     masks: torch.Tensor,
+     boxes: torch.Tensor,
+     image_shape: Tuple[torch.Tensor, torch.Tensor],
+     threshold: float = 0.5,
+ ):
+     """
+     A wrapper of paste_masks_in_image where image_shape is Tensor.
+     During tracing, shapes might be tensors instead of ints. The Tensor->int
+     conversion should be scripted rather than traced.
+     """
+     return paste_masks_in_image(masks, boxes, (int(image_shape[0]), int(image_shape[1])), threshold)
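
A hedged usage sketch for paste_masks_in_image: two 28x28 soft masks are pasted into a 480x640 image plane at their predicted boxes and binarized at 0.5 (the tensor values below are arbitrary).

import torch
from annotator.oneformer.detectron2.layers.mask_ops import paste_masks_in_image

masks = torch.rand(2, 28, 28)                        # per-instance soft masks in [0, 1]
boxes = torch.tensor([[10.0, 20.0, 110.0, 220.0],
                      [300.0, 50.0, 400.0, 150.0]])  # XYXY, one box per mask
img_masks = paste_masks_in_image(masks, boxes, (480, 640), threshold=0.5)
print(img_masks.shape, img_masks.dtype)              # torch.Size([2, 480, 640]) torch.bool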
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/nms.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+
4
+ import torch
5
+ from torchvision.ops import boxes as box_ops
6
+ from torchvision.ops import nms # noqa . for compatibility
7
+
8
+
9
+ def batched_nms(
10
+ boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
11
+ ):
12
+ """
13
+ Same as torchvision.ops.boxes.batched_nms, but with float().
14
+ """
15
+ assert boxes.shape[-1] == 4
16
+ # Note: Torchvision already has a strategy (https://github.com/pytorch/vision/issues/1311)
17
+ # to decide whether to use coordinate trick or for loop to implement batched_nms. So we
18
+ # just call it directly.
19
+ # Fp16 does not have enough range for batched NMS, so adding float().
20
+ return box_ops.batched_nms(boxes.float(), scores, idxs, iou_threshold)
21
+
22
+
23
+ # Note: this function (nms_rotated) might be moved into
24
+ # torchvision/ops/boxes.py in the future
25
+ def nms_rotated(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float):
26
+ """
27
+ Performs non-maximum suppression (NMS) on the rotated boxes according
28
+ to their intersection-over-union (IoU).
29
+
30
+ Rotated NMS iteratively removes lower scoring rotated boxes which have an
31
+ IoU greater than iou_threshold with another (higher scoring) rotated box.
32
+
33
+ Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as
34
+ RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they
35
+ can be representing completely different objects in certain tasks, e.g., OCR.
36
+
37
+ As for the question of whether rotated-NMS should treat them as faraway boxes
38
+ even though their IOU is 1, it depends on the application and/or ground truth annotation.
39
+
40
+ As an extreme example, consider a single character v and the square box around it.
41
+
42
+ If the angle is 0 degree, the object (text) would be read as 'v';
43
+
44
+ If the angle is 90 degrees, the object (text) would become '>';
45
+
46
+ If the angle is 180 degrees, the object (text) would become '^';
47
+
48
+ If the angle is 270/-90 degrees, the object (text) would become '<'
49
+
50
+ All of these cases have IoU of 1 to each other, and rotated NMS that only
51
+ uses IoU as criterion would only keep one of them with the highest score -
52
+ which, practically, still makes sense in most cases because typically
53
+ only one of theses orientations is the correct one. Also, it does not matter
54
+ as much if the box is only used to classify the object (instead of transcribing
55
+ them with a sequential OCR recognition model) later.
56
+
57
+ On the other hand, when we use IoU to filter proposals that are close to the
58
+ ground truth during training, we should definitely take the angle into account if
59
+ we know the ground truth is labeled with the strictly correct orientation (as in,
60
+ upside-down words are annotated with -180 degrees even though they can be covered
61
+ with a 0/90/-90 degree box, etc.)
62
+
63
+ The way the original dataset is annotated also matters. For example, if the dataset
64
+ is a 4-point polygon dataset that does not enforce ordering of vertices/orientation,
65
+ we can estimate a minimum rotated bounding box to this polygon, but there's no way
66
+ we can tell the correct angle with 100% confidence (as shown above, there could be 4 different
67
+ rotated boxes, with angles differed by 90 degrees to each other, covering the exactly
68
+ same region). In that case we have to just use IoU to determine the box
69
+ proximity (as many detection benchmarks (even for text) do) unless there're other
70
+ assumptions we can make (like width is always larger than height, or the object is not
71
+ rotated by more than 90 degrees CCW/CW, etc.)
72
+
73
+ In summary, not considering angles in rotated NMS seems to be a good option for now,
74
+ but we should be aware of its implications.
75
+
76
+ Args:
77
+ boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in
78
+ (x_center, y_center, width, height, angle_degrees) format.
79
+ scores (Tensor[N]): Scores for each one of the rotated boxes
80
+ iou_threshold (float): Discards all overlapping rotated boxes with IoU < iou_threshold
81
+
82
+ Returns:
83
+ keep (Tensor): int64 tensor with the indices of the elements that have been kept
84
+ by Rotated NMS, sorted in decreasing order of scores
85
+ """
86
+ return torch.ops.detectron2.nms_rotated(boxes, scores, iou_threshold)
87
+
88
+
89
+ # Note: this function (batched_nms_rotated) might be moved into
90
+ # torchvision/ops/boxes.py in the future
91
+
92
+
93
+ @torch.jit.script_if_tracing
94
+ def batched_nms_rotated(
95
+ boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
96
+ ):
97
+ """
98
+ Performs non-maximum suppression in a batched fashion.
99
+
100
+ Each index value correspond to a category, and NMS
101
+ will not be applied between elements of different categories.
102
+
103
+ Args:
104
+ boxes (Tensor[N, 5]):
105
+ boxes where NMS will be performed. They
106
+ are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
107
+ scores (Tensor[N]):
108
+ scores for each one of the boxes
109
+ idxs (Tensor[N]):
110
+ indices of the categories for each one of the boxes.
111
+ iou_threshold (float):
112
+ discards all overlapping boxes
113
+ with IoU < iou_threshold
114
+
115
+ Returns:
116
+ Tensor:
117
+ int64 tensor with the indices of the elements that have been kept
118
+ by NMS, sorted in decreasing order of scores
119
+ """
120
+ assert boxes.shape[-1] == 5
121
+
122
+ if boxes.numel() == 0:
123
+ return torch.empty((0,), dtype=torch.int64, device=boxes.device)
124
+ boxes = boxes.float() # fp16 does not have enough range for batched NMS
125
+ # Strategy: in order to perform NMS independently per class,
126
+ # we add an offset to all the boxes. The offset is dependent
127
+ # only on the class idx, and is large enough so that boxes
128
+ # from different classes do not overlap
129
+
130
+ # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate,
131
+ # which won't handle negative coordinates correctly.
132
+ # Here by using min_coordinate we can make sure the negative coordinates are
133
+ # correctly handled.
134
+ max_coordinate = (
135
+ torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2
136
+ ).max()
137
+ min_coordinate = (
138
+ torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2
139
+ ).min()
140
+ offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1)
141
+ boxes_for_nms = boxes.clone() # avoid modifying the original values in boxes
142
+ boxes_for_nms[:, :2] += offsets[:, None]
143
+ keep = nms_rotated(boxes_for_nms, scores, iou_threshold)
144
+ return keep
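
As a quick illustration of the interface above, here is a minimal usage sketch of batched_nms_rotated (assumptions: the detectron2 C++/CUDA extension that provides torch.ops.detectron2.nms_rotated has been built, the module is importable under the repository layout shown in the file path, and the tensor values are made up):

import torch

from annotator.oneformer.detectron2.layers.nms import batched_nms_rotated

# Rotated boxes in (x_center, y_center, width, height, angle_degrees) format.
# The first two boxes overlap heavily and share a category, so one is suppressed;
# the third box belongs to a different category and is never compared against them.
boxes = torch.tensor(
    [
        [50.0, 50.0, 40.0, 20.0, 0.0],
        [51.0, 50.0, 40.0, 20.0, 5.0],
        [150.0, 150.0, 30.0, 30.0, 45.0],
    ]
)
scores = torch.tensor([0.9, 0.8, 0.7])
idxs = torch.tensor([0, 0, 1])  # per-box category indices

keep = batched_nms_rotated(boxes, scores, idxs, iou_threshold=0.5)
# keep -> int64 indices of the surviving boxes, sorted by decreasing score, e.g. tensor([0, 2])
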
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from torch import nn
3
+ from torchvision.ops import roi_align
4
+
5
+
6
+ # NOTE: torchvision's RoIAlign has a different default aligned=False
7
+ class ROIAlign(nn.Module):
8
+ def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True):
9
+ """
10
+ Args:
11
+ output_size (tuple): h, w
12
+ spatial_scale (float): scale the input boxes by this number
13
+ sampling_ratio (int): number of input samples to take for each output
14
+ sample. 0 to take samples densely.
15
+ aligned (bool): if False, use the legacy implementation in
16
+ Detectron. If True, align the results more precisely.
17
+
18
+ Note:
19
+ The meaning of aligned=True:
20
+
21
+ Given a continuous coordinate c, its two neighboring pixel indices (in our
22
+ pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
23
+ c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
24
+ from the underlying signal at continuous coordinates 0.5 and 1.5). But the original
25
+ roi_align (aligned=False) does not subtract the 0.5 when computing neighboring
26
+ pixel indices and therefore it uses pixels with a slightly incorrect alignment
27
+ (relative to our pixel model) when performing bilinear interpolation.
28
+
29
+ With `aligned=True`,
30
+ we first appropriately scale the ROI and then shift it by -0.5
31
+ prior to calling roi_align. This produces the correct neighbors; see
32
+ detectron2/tests/test_roi_align.py for verification.
33
+
34
+ The difference does not affect the model's performance if
35
+ ROIAlign is used together with conv layers.
36
+ """
37
+ super().__init__()
38
+ self.output_size = output_size
39
+ self.spatial_scale = spatial_scale
40
+ self.sampling_ratio = sampling_ratio
41
+ self.aligned = aligned
42
+
43
+ from torchvision import __version__
44
+
45
+ version = tuple(int(x) for x in __version__.split(".")[:2])
46
+ # https://github.com/pytorch/vision/pull/2438
47
+ assert version >= (0, 7), "Require torchvision >= 0.7"
48
+
49
+ def forward(self, input, rois):
50
+ """
51
+ Args:
52
+ input: NCHW images
53
+ rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
54
+ """
55
+ assert rois.dim() == 2 and rois.size(1) == 5
56
+ if input.is_quantized:
57
+ input = input.dequantize()
58
+ return roi_align(
59
+ input,
60
+ rois.to(dtype=input.dtype),
61
+ self.output_size,
62
+ self.spatial_scale,
63
+ self.sampling_ratio,
64
+ self.aligned,
65
+ )
66
+
67
+ def __repr__(self):
68
+ tmpstr = self.__class__.__name__ + "("
69
+ tmpstr += "output_size=" + str(self.output_size)
70
+ tmpstr += ", spatial_scale=" + str(self.spatial_scale)
71
+ tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
72
+ tmpstr += ", aligned=" + str(self.aligned)
73
+ tmpstr += ")"
74
+ return tmpstr
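
For orientation, a minimal sketch of how the ROIAlign wrapper above might be used (assumptions: torchvision >= 0.7 is installed, the import path matches this repository layout, and the feature-map and box values are made up):

import torch

from annotator.oneformer.detectron2.layers.roi_align import ROIAlign

# Map every box to a fixed 7x7 patch; spatial_scale converts box coordinates
# given in input-image space to this stride-16 feature map.
pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=0, aligned=True)

features = torch.randn(2, 256, 50, 50)  # NCHW feature maps for 2 images
rois = torch.tensor(
    [
        [0.0, 16.0, 16.0, 240.0, 240.0],  # (batch_index, x1, y1, x2, y2)
        [1.0, 32.0, 48.0, 400.0, 320.0],
    ]
)
pooled = pooler(features, rois)  # shape (2, 256, 7, 7)
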
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/roi_align_rotated.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+ from torch import nn
4
+ from torch.autograd import Function
5
+ from torch.autograd.function import once_differentiable
6
+ from torch.nn.modules.utils import _pair
7
+
8
+
9
+ class _ROIAlignRotated(Function):
10
+ @staticmethod
11
+ def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
12
+ ctx.save_for_backward(roi)
13
+ ctx.output_size = _pair(output_size)
14
+ ctx.spatial_scale = spatial_scale
15
+ ctx.sampling_ratio = sampling_ratio
16
+ ctx.input_shape = input.size()
17
+ output = torch.ops.detectron2.roi_align_rotated_forward(
18
+ input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
19
+ )
20
+ return output
21
+
22
+ @staticmethod
23
+ @once_differentiable
24
+ def backward(ctx, grad_output):
25
+ (rois,) = ctx.saved_tensors
26
+ output_size = ctx.output_size
27
+ spatial_scale = ctx.spatial_scale
28
+ sampling_ratio = ctx.sampling_ratio
29
+ bs, ch, h, w = ctx.input_shape
30
+ grad_input = torch.ops.detectron2.roi_align_rotated_backward(
31
+ grad_output,
32
+ rois,
33
+ spatial_scale,
34
+ output_size[0],
35
+ output_size[1],
36
+ bs,
37
+ ch,
38
+ h,
39
+ w,
40
+ sampling_ratio,
41
+ )
42
+ return grad_input, None, None, None, None, None
43
+
44
+
45
+ roi_align_rotated = _ROIAlignRotated.apply
46
+
47
+
48
+ class ROIAlignRotated(nn.Module):
49
+ def __init__(self, output_size, spatial_scale, sampling_ratio):
50
+ """
51
+ Args:
52
+ output_size (tuple): h, w
53
+ spatial_scale (float): scale the input boxes by this number
54
+ sampling_ratio (int): number of input samples to take for each output
55
+ sample. 0 to take samples densely.
56
+
57
+ Note:
58
+ ROIAlignRotated supports continuous coordinates by default:
59
+ Given a continuous coordinate c, its two neighboring pixel indices (in our
60
+ pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
61
+ c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
62
+ from the underlying signal at continuous coordinates 0.5 and 1.5).
63
+ """
64
+ super(ROIAlignRotated, self).__init__()
65
+ self.output_size = output_size
66
+ self.spatial_scale = spatial_scale
67
+ self.sampling_ratio = sampling_ratio
68
+
69
+ def forward(self, input, rois):
70
+ """
71
+ Args:
72
+ input: NCHW images
73
+ rois: Bx6 boxes. First column is the index into N.
74
+ The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees).
75
+ """
76
+ assert rois.dim() == 2 and rois.size(1) == 6
77
+ orig_dtype = input.dtype
78
+ if orig_dtype == torch.float16:
79
+ input = input.float()
80
+ rois = rois.float()
81
+ output_size = _pair(self.output_size)
82
+
83
+ # Scripting for Autograd is currently unsupported.
84
+ # This is a quick fix without having to rewrite code on the C++ side
85
+ if torch.jit.is_scripting() or torch.jit.is_tracing():
86
+ return torch.ops.detectron2.roi_align_rotated_forward(
87
+ input, rois, self.spatial_scale, output_size[0], output_size[1], self.sampling_ratio
88
+ ).to(dtype=orig_dtype)
89
+
90
+ return roi_align_rotated(
91
+ input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
92
+ ).to(dtype=orig_dtype)
93
+
94
+ def __repr__(self):
95
+ tmpstr = self.__class__.__name__ + "("
96
+ tmpstr += "output_size=" + str(self.output_size)
97
+ tmpstr += ", spatial_scale=" + str(self.spatial_scale)
98
+ tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
99
+ tmpstr += ")"
100
+ return tmpstr
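
The rotated variant expects one extra column per RoI. A minimal sketch (assumptions: the compiled detectron2 op behind torch.ops.detectron2.roi_align_rotated_forward is available, the import path matches this repository layout, and the values are made up):

import torch

from annotator.oneformer.detectron2.layers.roi_align_rotated import ROIAlignRotated

pooler = ROIAlignRotated(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=0)

features = torch.randn(1, 256, 50, 50)
# Each row: (batch_index, x_ctr, y_ctr, width, height, angle_degrees)
rois = torch.tensor([[0.0, 200.0, 200.0, 120.0, 40.0, 30.0]])
pooled = pooler(features, rois)  # shape (1, 256, 7, 7)
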
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/rotated_boxes.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from __future__ import absolute_import, division, print_function, unicode_literals
3
+ import torch
4
+
5
+
6
+ def pairwise_iou_rotated(boxes1, boxes2):
7
+ """
8
+ Return intersection-over-union (Jaccard index) of boxes.
9
+
10
+ Both sets of boxes are expected to be in
11
+ (x_center, y_center, width, height, angle) format.
12
+
13
+ Arguments:
14
+ boxes1 (Tensor[N, 5])
15
+ boxes2 (Tensor[M, 5])
16
+
17
+ Returns:
18
+ iou (Tensor[N, M]): the NxM matrix containing the pairwise
19
+ IoU values for every element in boxes1 and boxes2
20
+ """
21
+ return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2)
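
A small sketch of the IoU matrix this returns (assumptions: the compiled detectron2 op box_iou_rotated is available and the import path matches this repository layout). A box and its 90-degree rotation about the same center intersect in a 4x4 square here, so their IoU is 16 / 48 = 1/3:

import torch

from annotator.oneformer.detectron2.layers.rotated_boxes import pairwise_iou_rotated

boxes1 = torch.tensor([[10.0, 10.0, 8.0, 4.0, 0.0]])
boxes2 = torch.tensor(
    [
        [10.0, 10.0, 8.0, 4.0, 90.0],  # same center and size, rotated by 90 degrees
        [30.0, 30.0, 8.0, 4.0, 0.0],   # far away, no overlap
    ]
)
iou = pairwise_iou_rotated(boxes1, boxes2)  # shape (1, 2), roughly [[0.3333, 0.0]]
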
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/shape_spec.py ADDED
@@ -0,0 +1,18 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+
7
+ @dataclass
8
+ class ShapeSpec:
9
+ """
10
+ A simple structure that contains basic shape specification about a tensor.
11
+ It is often used as the auxiliary inputs/outputs of models,
12
+ to complement the lack of shape inference ability among PyTorch modules.
13
+ """
14
+
15
+ channels: Optional[int] = None
16
+ height: Optional[int] = None
17
+ width: Optional[int] = None
18
+ stride: Optional[int] = None
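
Since ShapeSpec is a plain dataclass, using it requires nothing beyond constructing it; a minimal sketch (the import path is assumed from this repository layout, and the numbers describe a hypothetical stride-16 feature map):

from annotator.oneformer.detectron2.layers.shape_spec import ShapeSpec

spec = ShapeSpec(channels=1024, stride=16)      # e.g., a "res4"-like feature map
print(spec.channels, spec.stride, spec.height)  # 1024 16 None (unset fields stay None)
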
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/layers/wrappers.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ """
3
+ Wrappers around some nn functions, mainly to support empty tensors.
4
+
5
+ Ideally, support for empty tensors should be added directly to PyTorch in those functions.
6
+
7
+ These can be removed once https://github.com/pytorch/pytorch/issues/12013
8
+ is implemented
9
+ """
10
+
11
+ import warnings
12
+ from typing import List, Optional
13
+ import torch
14
+ from torch.nn import functional as F
15
+
16
+ from annotator.oneformer.detectron2.utils.env import TORCH_VERSION
17
+
18
+
19
+ def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor:
20
+ """
21
+ Turn a list of integer scalars or integer Tensor scalars into a vector,
22
+ in a way that's both traceable and scriptable.
23
+
24
+ In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs.
25
+ In scripting or eager, `x` should be a list of int.
26
+ """
27
+ if torch.jit.is_scripting():
28
+ return torch.as_tensor(x, device=device)
29
+ if torch.jit.is_tracing():
30
+ assert all(
31
+ [isinstance(t, torch.Tensor) for t in x]
32
+ ), "Shape should be tensor during tracing!"
33
+ # as_tensor should not be used in tracing because it records a constant
34
+ ret = torch.stack(x)
35
+ if ret.device != device: # avoid recording a hard-coded device if not necessary
36
+ ret = ret.to(device=device)
37
+ return ret
38
+ return torch.as_tensor(x, device=device)
39
+
40
+
41
+ def check_if_dynamo_compiling():
42
+ if TORCH_VERSION >= (1, 14):
43
+ from torch._dynamo import is_compiling
44
+
45
+ return is_compiling()
46
+ else:
47
+ return False
48
+
49
+
50
+ def cat(tensors: List[torch.Tensor], dim: int = 0):
51
+ """
52
+ Efficient version of torch.cat that avoids a copy if there is only a single element in a list
53
+ """
54
+ assert isinstance(tensors, (list, tuple))
55
+ if len(tensors) == 1:
56
+ return tensors[0]
57
+ return torch.cat(tensors, dim)
58
+
59
+
60
+ def empty_input_loss_func_wrapper(loss_func):
61
+ def wrapped_loss_func(input, target, *, reduction="mean", **kwargs):
62
+ """
63
+ Same as `loss_func`, but returns 0 (instead of nan) for empty inputs.
64
+ """
65
+ if target.numel() == 0 and reduction == "mean":
66
+ return input.sum() * 0.0 # connect the gradient
67
+ return loss_func(input, target, reduction=reduction, **kwargs)
68
+
69
+ return wrapped_loss_func
70
+
71
+
72
+ cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy)
73
+
74
+
75
+ class _NewEmptyTensorOp(torch.autograd.Function):
76
+ @staticmethod
77
+ def forward(ctx, x, new_shape):
78
+ ctx.shape = x.shape
79
+ return x.new_empty(new_shape)
80
+
81
+ @staticmethod
82
+ def backward(ctx, grad):
83
+ shape = ctx.shape
84
+ return _NewEmptyTensorOp.apply(grad, shape), None
85
+
86
+
87
+ class Conv2d(torch.nn.Conv2d):
88
+ """
89
+ A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
90
+ """
91
+
92
+ def __init__(self, *args, **kwargs):
93
+ """
94
+ Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
95
+
96
+ Args:
97
+ norm (nn.Module, optional): a normalization layer
98
+ activation (callable(Tensor) -> Tensor): a callable activation function
99
+
100
+ It assumes that norm layer is used before activation.
101
+ """
102
+ norm = kwargs.pop("norm", None)
103
+ activation = kwargs.pop("activation", None)
104
+ super().__init__(*args, **kwargs)
105
+
106
+ self.norm = norm
107
+ self.activation = activation
108
+
109
+ def forward(self, x):
110
+ # torchscript does not support SyncBatchNorm yet
111
+ # https://github.com/pytorch/pytorch/issues/40507
112
+ # and we skip these codes in torchscript since:
113
+ # 1. currently we only support torchscript in evaluation mode
114
+ # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
115
+ # later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
116
+ if not torch.jit.is_scripting():
117
+ # Dynamo doesn't support context managers yet
118
+ is_dynamo_compiling = check_if_dynamo_compiling()
119
+ if not is_dynamo_compiling:
120
+ with warnings.catch_warnings(record=True):
121
+ if x.numel() == 0 and self.training:
122
+ # https://github.com/pytorch/pytorch/issues/12013
123
+ assert not isinstance(
124
+ self.norm, torch.nn.SyncBatchNorm
125
+ ), "SyncBatchNorm does not support empty inputs!"
126
+
127
+ x = F.conv2d(
128
+ x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
129
+ )
130
+ if self.norm is not None:
131
+ x = self.norm(x)
132
+ if self.activation is not None:
133
+ x = self.activation(x)
134
+ return x
135
+
136
+
137
+ ConvTranspose2d = torch.nn.ConvTranspose2d
138
+ BatchNorm2d = torch.nn.BatchNorm2d
139
+ interpolate = F.interpolate
140
+ Linear = torch.nn.Linear
141
+
142
+
143
+ def nonzero_tuple(x):
144
+ """
145
+ An 'as_tuple=True' version of torch.nonzero to support torchscript,
146
+ because of https://github.com/pytorch/pytorch/issues/38718
147
+ """
148
+ if torch.jit.is_scripting():
149
+ if x.dim() == 0:
150
+ return x.unsqueeze(0).nonzero().unbind(1)
151
+ return x.nonzero().unbind(1)
152
+ else:
153
+ return x.nonzero(as_tuple=True)
154
+
155
+
156
+ @torch.jit.script_if_tracing
157
+ def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor:
158
+ """
159
+ Tracing-friendly way to cast a tensor to another tensor's device. The device will be treated
160
+ as a constant during tracing; scripting the casting process as a whole can work around this issue.
161
+ """
162
+ return src.to(dst.device)
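
A minimal sketch of the Conv2d wrapper and the cat helper above (assumptions: the import path matches this repository layout; the norm and activation choices are arbitrary examples):

import torch
import torch.nn.functional as F

from annotator.oneformer.detectron2.layers.wrappers import Conv2d, cat

# Unlike torch.nn.Conv2d, this wrapper also applies the given norm and activation.
conv = Conv2d(
    16, 32, kernel_size=3, padding=1, bias=False,
    norm=torch.nn.BatchNorm2d(32), activation=F.relu,
)

x = torch.randn(4, 16, 32, 32)
y = conv(x)            # shape (4, 32, 32, 32), with BN and ReLU already applied
z = cat([y], dim=0)    # a single-element list is returned as-is, avoiding a copy
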
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ """
3
+ Model Zoo API for Detectron2: a collection of functions to create common model architectures
4
+ listed in `MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md>`_,
5
+ and optionally load their pre-trained weights.
6
+ """
7
+
8
+ from .model_zoo import get, get_config_file, get_checkpoint_url, get_config
9
+
10
+ __all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"]
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/model_zoo/model_zoo.py ADDED
@@ -0,0 +1,213 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import os
3
+ from typing import Optional
4
+ import pkg_resources
5
+ import torch
6
+
7
+ from annotator.oneformer.detectron2.checkpoint import DetectionCheckpointer
8
+ from annotator.oneformer.detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
9
+ from annotator.oneformer.detectron2.modeling import build_model
10
+
11
+
12
+ class _ModelZooUrls(object):
13
+ """
14
+ Mapping from names to officially released Detectron2 pre-trained models.
15
+ """
16
+
17
+ S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/"
18
+
19
+ # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl
20
+ CONFIG_PATH_TO_URL_SUFFIX = {
21
+ # COCO Detection with Faster R-CNN
22
+ "COCO-Detection/faster_rcnn_R_50_C4_1x": "137257644/model_final_721ade.pkl",
23
+ "COCO-Detection/faster_rcnn_R_50_DC5_1x": "137847829/model_final_51d356.pkl",
24
+ "COCO-Detection/faster_rcnn_R_50_FPN_1x": "137257794/model_final_b275ba.pkl",
25
+ "COCO-Detection/faster_rcnn_R_50_C4_3x": "137849393/model_final_f97cb7.pkl",
26
+ "COCO-Detection/faster_rcnn_R_50_DC5_3x": "137849425/model_final_68d202.pkl",
27
+ "COCO-Detection/faster_rcnn_R_50_FPN_3x": "137849458/model_final_280758.pkl",
28
+ "COCO-Detection/faster_rcnn_R_101_C4_3x": "138204752/model_final_298dad.pkl",
29
+ "COCO-Detection/faster_rcnn_R_101_DC5_3x": "138204841/model_final_3e0943.pkl",
30
+ "COCO-Detection/faster_rcnn_R_101_FPN_3x": "137851257/model_final_f6e8b1.pkl",
31
+ "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x": "139173657/model_final_68b088.pkl",
32
+ # COCO Detection with RetinaNet
33
+ "COCO-Detection/retinanet_R_50_FPN_1x": "190397773/model_final_bfca0b.pkl",
34
+ "COCO-Detection/retinanet_R_50_FPN_3x": "190397829/model_final_5bd44e.pkl",
35
+ "COCO-Detection/retinanet_R_101_FPN_3x": "190397697/model_final_971ab9.pkl",
36
+ # COCO Detection with RPN and Fast R-CNN
37
+ "COCO-Detection/rpn_R_50_C4_1x": "137258005/model_final_450694.pkl",
38
+ "COCO-Detection/rpn_R_50_FPN_1x": "137258492/model_final_02ce48.pkl",
39
+ "COCO-Detection/fast_rcnn_R_50_FPN_1x": "137635226/model_final_e5f7ce.pkl",
40
+ # COCO Instance Segmentation Baselines with Mask R-CNN
41
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x": "137259246/model_final_9243eb.pkl",
42
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x": "137260150/model_final_4f86c3.pkl",
43
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "137260431/model_final_a54504.pkl",
44
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x": "137849525/model_final_4ce675.pkl",
45
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x": "137849551/model_final_84107b.pkl",
46
+ "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x": "137849600/model_final_f10217.pkl",
47
+ "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x": "138363239/model_final_a2914c.pkl",
48
+ "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x": "138363294/model_final_0464b7.pkl",
49
+ "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x": "138205316/model_final_a3ec72.pkl",
50
+ "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x": "139653917/model_final_2d9806.pkl", # noqa
51
+ # New baselines using Large-Scale Jitter and Longer Training Schedule
52
+ "new_baselines/mask_rcnn_R_50_FPN_100ep_LSJ": "42047764/model_final_bb69de.pkl",
53
+ "new_baselines/mask_rcnn_R_50_FPN_200ep_LSJ": "42047638/model_final_89a8d3.pkl",
54
+ "new_baselines/mask_rcnn_R_50_FPN_400ep_LSJ": "42019571/model_final_14d201.pkl",
55
+ "new_baselines/mask_rcnn_R_101_FPN_100ep_LSJ": "42025812/model_final_4f7b58.pkl",
56
+ "new_baselines/mask_rcnn_R_101_FPN_200ep_LSJ": "42131867/model_final_0bb7ae.pkl",
57
+ "new_baselines/mask_rcnn_R_101_FPN_400ep_LSJ": "42073830/model_final_f96b26.pkl",
58
+ "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_100ep_LSJ": "42047771/model_final_b7fbab.pkl", # noqa
59
+ "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_200ep_LSJ": "42132721/model_final_5d87c1.pkl", # noqa
60
+ "new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_400ep_LSJ": "42025447/model_final_f1362d.pkl", # noqa
61
+ "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_100ep_LSJ": "42047784/model_final_6ba57e.pkl", # noqa
62
+ "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_200ep_LSJ": "42047642/model_final_27b9c1.pkl", # noqa
63
+ "new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ": "42045954/model_final_ef3a80.pkl", # noqa
64
+ # COCO Person Keypoint Detection Baselines with Keypoint R-CNN
65
+ "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x": "137261548/model_final_04e291.pkl",
66
+ "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x": "137849621/model_final_a6e10b.pkl",
67
+ "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x": "138363331/model_final_997cc7.pkl",
68
+ "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x": "139686956/model_final_5ad38f.pkl",
69
+ # COCO Panoptic Segmentation Baselines with Panoptic FPN
70
+ "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x": "139514544/model_final_dbfeb4.pkl",
71
+ "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x": "139514569/model_final_c10459.pkl",
72
+ "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x": "139514519/model_final_cafdb1.pkl",
73
+ # LVIS Instance Segmentation Baselines with Mask R-CNN
74
+ "LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x": "144219072/model_final_571f7c.pkl", # noqa
75
+ "LVISv0.5-InstanceSegmentation/mask_rcnn_R_101_FPN_1x": "144219035/model_final_824ab5.pkl", # noqa
76
+ "LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x": "144219108/model_final_5e3439.pkl", # noqa
77
+ # Cityscapes & Pascal VOC Baselines
78
+ "Cityscapes/mask_rcnn_R_50_FPN": "142423278/model_final_af9cf5.pkl",
79
+ "PascalVOC-Detection/faster_rcnn_R_50_C4": "142202221/model_final_b1acc2.pkl",
80
+ # Other Settings
81
+ "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5": "138602867/model_final_65c703.pkl",
82
+ "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5": "144998336/model_final_821d0b.pkl",
83
+ "Misc/cascade_mask_rcnn_R_50_FPN_1x": "138602847/model_final_e9d89b.pkl",
84
+ "Misc/cascade_mask_rcnn_R_50_FPN_3x": "144998488/model_final_480dd8.pkl",
85
+ "Misc/mask_rcnn_R_50_FPN_3x_syncbn": "169527823/model_final_3b3c51.pkl",
86
+ "Misc/mask_rcnn_R_50_FPN_3x_gn": "138602888/model_final_dc5d9e.pkl",
87
+ "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn": "138602908/model_final_01ca85.pkl",
88
+ "Misc/scratch_mask_rcnn_R_50_FPN_9x_gn": "183808979/model_final_da7b4c.pkl",
89
+ "Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn": "184226666/model_final_5ce33e.pkl",
90
+ "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x": "139797668/model_final_be35db.pkl",
91
+ "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv": "18131413/model_0039999_e76410.pkl", # noqa
92
+ # D1 Comparisons
93
+ "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x": "137781054/model_final_7ab50c.pkl", # noqa
94
+ "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x": "137781281/model_final_62ca52.pkl", # noqa
95
+ "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x": "137781195/model_final_cce136.pkl",
96
+ }
97
+
98
+ @staticmethod
99
+ def query(config_path: str) -> Optional[str]:
100
+ """
101
+ Args:
102
+ config_path: relative config filename
103
+ """
104
+ name = config_path.replace(".yaml", "").replace(".py", "")
105
+ if name in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX:
106
+ suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[name]
107
+ return _ModelZooUrls.S3_PREFIX + name + "/" + suffix
108
+ return None
109
+
110
+
111
+ def get_checkpoint_url(config_path):
112
+ """
113
+ Returns the URL to the model trained using the given config
114
+
115
+ Args:
116
+ config_path (str): config file name relative to detectron2's "configs/"
117
+ directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
118
+
119
+ Returns:
120
+ str: a URL to the model
121
+ """
122
+ url = _ModelZooUrls.query(config_path)
123
+ if url is None:
124
+ raise RuntimeError("Pretrained model for {} is not available!".format(config_path))
125
+ return url
126
+
127
+
128
+ def get_config_file(config_path):
129
+ """
130
+ Returns path to a builtin config file.
131
+
132
+ Args:
133
+ config_path (str): config file name relative to detectron2's "configs/"
134
+ directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
135
+
136
+ Returns:
137
+ str: the real path to the config file.
138
+ """
139
+ cfg_file = pkg_resources.resource_filename(
140
+ "detectron2.model_zoo", os.path.join("configs", config_path)
141
+ )
142
+ if not os.path.exists(cfg_file):
143
+ raise RuntimeError("{} not available in Model Zoo!".format(config_path))
144
+ return cfg_file
145
+
146
+
147
+ def get_config(config_path, trained: bool = False):
148
+ """
149
+ Returns a config object for a model in model zoo.
150
+
151
+ Args:
152
+ config_path (str): config file name relative to detectron2's "configs/"
153
+ directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
154
+ trained (bool): If True, will set ``MODEL.WEIGHTS`` to trained model zoo weights.
155
+ If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used
156
+ instead; this will typically (though not always) initialize a subset of weights using
157
+ an ImageNet pre-trained model, while randomly initializing the other weights.
158
+
159
+ Returns:
160
+ CfgNode or omegaconf.DictConfig: a config object
161
+ """
162
+ cfg_file = get_config_file(config_path)
163
+ if cfg_file.endswith(".yaml"):
164
+ cfg = get_cfg()
165
+ cfg.merge_from_file(cfg_file)
166
+ if trained:
167
+ cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path)
168
+ return cfg
169
+ elif cfg_file.endswith(".py"):
170
+ cfg = LazyConfig.load(cfg_file)
171
+ if trained:
172
+ url = get_checkpoint_url(config_path)
173
+ if "train" in cfg and "init_checkpoint" in cfg.train:
174
+ cfg.train.init_checkpoint = url
175
+ else:
176
+ raise NotImplementedError
177
+ return cfg
178
+
179
+
180
+ def get(config_path, trained: bool = False, device: Optional[str] = None):
181
+ """
182
+ Get a model specified by relative path under Detectron2's official ``configs/`` directory.
183
+
184
+ Args:
185
+ config_path (str): config file name relative to detectron2's "configs/"
186
+ directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml"
187
+ trained (bool): see :func:`get_config`.
188
+ device (str or None): overwrite the device in config, if given.
189
+
190
+ Returns:
191
+ nn.Module: a detectron2 model. Will be in training mode.
192
+
193
+ Example:
194
+ ::
195
+ from annotator.oneformer.detectron2 import model_zoo
196
+ model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True)
197
+ """
198
+ cfg = get_config(config_path, trained)
199
+ if device is None and not torch.cuda.is_available():
200
+ device = "cpu"
201
+ if device is not None and isinstance(cfg, CfgNode):
202
+ cfg.MODEL.DEVICE = device
203
+
204
+ if isinstance(cfg, CfgNode):
205
+ model = build_model(cfg)
206
+ DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
207
+ else:
208
+ model = instantiate(cfg.model)
209
+ if device is not None:
210
+ model = model.to(device)
211
+ if "train" in cfg and "init_checkpoint" in cfg.train:
212
+ DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
213
+ return model
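
A minimal sketch of the model zoo helpers above (assumptions: a full detectron2 installation that ships the packaged configs/ directory is available, since get_config_file resolves paths through pkg_resources; calling get(..., trained=True) would additionally download the checkpoint):

from annotator.oneformer.detectron2 import model_zoo

cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"

url = model_zoo.get_checkpoint_url(cfg_path)        # URL of the released weights
cfg = model_zoo.get_config(cfg_path, trained=True)  # CfgNode with MODEL.WEIGHTS set
assert cfg.MODEL.WEIGHTS == url
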
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/__init__.py ADDED
@@ -0,0 +1,64 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from annotator.oneformer.detectron2.layers import ShapeSpec
3
+
4
+ from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY
5
+ from .backbone import (
6
+ BACKBONE_REGISTRY,
7
+ FPN,
8
+ Backbone,
9
+ ResNet,
10
+ ResNetBlockBase,
11
+ build_backbone,
12
+ build_resnet_backbone,
13
+ make_stage,
14
+ ViT,
15
+ SimpleFeaturePyramid,
16
+ get_vit_lr_decay_rate,
17
+ MViT,
18
+ SwinTransformer,
19
+ )
20
+ from .meta_arch import (
21
+ META_ARCH_REGISTRY,
22
+ SEM_SEG_HEADS_REGISTRY,
23
+ GeneralizedRCNN,
24
+ PanopticFPN,
25
+ ProposalNetwork,
26
+ RetinaNet,
27
+ SemanticSegmentor,
28
+ build_model,
29
+ build_sem_seg_head,
30
+ FCOS,
31
+ )
32
+ from .postprocessing import detector_postprocess
33
+ from .proposal_generator import (
34
+ PROPOSAL_GENERATOR_REGISTRY,
35
+ build_proposal_generator,
36
+ RPN_HEAD_REGISTRY,
37
+ build_rpn_head,
38
+ )
39
+ from .roi_heads import (
40
+ ROI_BOX_HEAD_REGISTRY,
41
+ ROI_HEADS_REGISTRY,
42
+ ROI_KEYPOINT_HEAD_REGISTRY,
43
+ ROI_MASK_HEAD_REGISTRY,
44
+ ROIHeads,
45
+ StandardROIHeads,
46
+ BaseMaskRCNNHead,
47
+ BaseKeypointRCNNHead,
48
+ FastRCNNOutputLayers,
49
+ build_box_head,
50
+ build_keypoint_head,
51
+ build_mask_head,
52
+ build_roi_heads,
53
+ )
54
+ from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA
55
+ from .mmdet_wrapper import MMDetBackbone, MMDetDetector
56
+
57
+ _EXCLUDE = {"ShapeSpec"}
58
+ __all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")]
59
+
60
+
61
+ from annotator.oneformer.detectron2.utils.env import fixup_module_metadata
62
+
63
+ fixup_module_metadata(__name__, globals(), __all__)
64
+ del fixup_module_metadata
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/anchor_generator.py ADDED
@@ -0,0 +1,386 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import collections
3
+ import math
4
+ from typing import List
5
+ import torch
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.layers import ShapeSpec, move_device_like
10
+ from annotator.oneformer.detectron2.structures import Boxes, RotatedBoxes
11
+ from annotator.oneformer.detectron2.utils.registry import Registry
12
+
13
+ ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR")
14
+ ANCHOR_GENERATOR_REGISTRY.__doc__ = """
15
+ Registry for modules that create object detection anchors for feature maps.
16
+
17
+ The registered object will be called with `obj(cfg, input_shape)`.
18
+ """
19
+
20
+
21
+ class BufferList(nn.Module):
22
+ """
23
+ Similar to nn.ParameterList, but for buffers
24
+ """
25
+
26
+ def __init__(self, buffers):
27
+ super().__init__()
28
+ for i, buffer in enumerate(buffers):
29
+ # Use non-persistent buffer so the values are not saved in checkpoint
30
+ self.register_buffer(str(i), buffer, persistent=False)
31
+
32
+ def __len__(self):
33
+ return len(self._buffers)
34
+
35
+ def __iter__(self):
36
+ return iter(self._buffers.values())
37
+
38
+
39
+ def _create_grid_offsets(
40
+ size: List[int], stride: int, offset: float, target_device_tensor: torch.Tensor
41
+ ):
42
+ grid_height, grid_width = size
43
+ shifts_x = move_device_like(
44
+ torch.arange(offset * stride, grid_width * stride, step=stride, dtype=torch.float32),
45
+ target_device_tensor,
46
+ )
47
+ shifts_y = move_device_like(
48
+ torch.arange(offset * stride, grid_height * stride, step=stride, dtype=torch.float32),
49
+ target_device_tensor,
50
+ )
51
+
52
+ shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
53
+ shift_x = shift_x.reshape(-1)
54
+ shift_y = shift_y.reshape(-1)
55
+ return shift_x, shift_y
56
+
57
+
58
+ def _broadcast_params(params, num_features, name):
59
+ """
60
+ If one size (or aspect ratio) is specified and there are multiple feature
61
+ maps, we "broadcast" anchors of that single size (or aspect ratio)
62
+ over all feature maps.
63
+
64
+ If params is list[float], or list[list[float]] with len(params) == 1, repeat
65
+ it num_features times.
66
+
67
+ Returns:
68
+ list[list[float]]: param for each feature
69
+ """
70
+ assert isinstance(
71
+ params, collections.abc.Sequence
72
+ ), f"{name} in anchor generator has to be a list! Got {params}."
73
+ assert len(params), f"{name} in anchor generator cannot be empty!"
74
+ if not isinstance(params[0], collections.abc.Sequence): # params is list[float]
75
+ return [params] * num_features
76
+ if len(params) == 1:
77
+ return list(params) * num_features
78
+ assert len(params) == num_features, (
79
+ f"Got {name} of length {len(params)} in anchor generator, "
80
+ f"but the number of input features is {num_features}!"
81
+ )
82
+ return params
83
+
84
+
85
+ @ANCHOR_GENERATOR_REGISTRY.register()
86
+ class DefaultAnchorGenerator(nn.Module):
87
+ """
88
+ Compute anchors in the standard ways described in
89
+ "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks".
90
+ """
91
+
92
+ box_dim: torch.jit.Final[int] = 4
93
+ """
94
+ the dimension of each anchor box.
95
+ """
96
+
97
+ @configurable
98
+ def __init__(self, *, sizes, aspect_ratios, strides, offset=0.5):
99
+ """
100
+ This interface is experimental.
101
+
102
+ Args:
103
+ sizes (list[list[float]] or list[float]):
104
+ If ``sizes`` is list[list[float]], ``sizes[i]`` is the list of anchor sizes
105
+ (i.e. sqrt of anchor area) to use for the i-th feature map.
106
+ If ``sizes`` is list[float], ``sizes`` is used for all feature maps.
107
+ Anchor sizes are given in absolute lengths in units of
108
+ the input image; they do not dynamically scale if the input image size changes.
109
+ aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
110
+ (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
111
+ strides (list[int]): stride of each input feature.
112
+ offset (float): Relative offset between the center of the first anchor and the top-left
113
+ corner of the image. Value has to be in [0, 1).
114
+ It is recommended to use 0.5, which means half a stride.
115
+ """
116
+ super().__init__()
117
+
118
+ self.strides = strides
119
+ self.num_features = len(self.strides)
120
+ sizes = _broadcast_params(sizes, self.num_features, "sizes")
121
+ aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
122
+ self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios)
123
+
124
+ self.offset = offset
125
+ assert 0.0 <= self.offset < 1.0, self.offset
126
+
127
+ @classmethod
128
+ def from_config(cls, cfg, input_shape: List[ShapeSpec]):
129
+ return {
130
+ "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
131
+ "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
132
+ "strides": [x.stride for x in input_shape],
133
+ "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
134
+ }
135
+
136
+ def _calculate_anchors(self, sizes, aspect_ratios):
137
+ cell_anchors = [
138
+ self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios)
139
+ ]
140
+ return BufferList(cell_anchors)
141
+
142
+ @property
143
+ @torch.jit.unused
144
+ def num_cell_anchors(self):
145
+ """
146
+ Alias of `num_anchors`.
147
+ """
148
+ return self.num_anchors
149
+
150
+ @property
151
+ @torch.jit.unused
152
+ def num_anchors(self):
153
+ """
154
+ Returns:
155
+ list[int]: Each int is the number of anchors at every pixel
156
+ location, on that feature map.
157
+ For example, if at every pixel we use anchors of 3 aspect
158
+ ratios and 5 sizes, the number of anchors is 15.
159
+ (See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config)
160
+
161
+ In standard RPN models, `num_anchors` on every feature map is the same.
162
+ """
163
+ return [len(cell_anchors) for cell_anchors in self.cell_anchors]
164
+
165
+ def _grid_anchors(self, grid_sizes: List[List[int]]):
166
+ """
167
+ Returns:
168
+ list[Tensor]: #featuremap tensors, each is (#locations x #cell_anchors) x 4
169
+ """
170
+ anchors = []
171
+ # buffers() not supported by torchscript. use named_buffers() instead
172
+ buffers: List[torch.Tensor] = [x[1] for x in self.cell_anchors.named_buffers()]
173
+ for size, stride, base_anchors in zip(grid_sizes, self.strides, buffers):
174
+ shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors)
175
+ shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)
176
+
177
+ anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4))
178
+
179
+ return anchors
180
+
181
+ def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
182
+ """
183
+ Generate a tensor storing canonical anchor boxes, which are all anchor
184
+ boxes of different sizes and aspect_ratios centered at (0, 0).
185
+ We can later build the set of anchors for a full feature map by
186
+ shifting and tiling these tensors (see `meth:_grid_anchors`).
187
+
188
+ Args:
189
+ sizes (tuple[float]):
190
+ aspect_ratios (tuple[float]):
191
+
192
+ Returns:
193
+ Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes
194
+ in XYXY format.
195
+ """
196
+
197
+ # This is different from the anchor generator defined in the original Faster R-CNN
198
+ # code or Detectron. They yield the same AP, however the old version defines cell
199
+ # anchors in a less natural way with a shift relative to the feature grid and
200
+ # quantization that results in slightly different sizes for different aspect ratios.
201
+ # See also https://github.com/facebookresearch/Detectron/issues/227
202
+
203
+ anchors = []
204
+ for size in sizes:
205
+ area = size**2.0
206
+ for aspect_ratio in aspect_ratios:
207
+ # s * s = w * h
208
+ # a = h / w
209
+ # ... some algebra ...
210
+ # w = sqrt(s * s / a)
211
+ # h = a * w
212
+ w = math.sqrt(area / aspect_ratio)
213
+ h = aspect_ratio * w
214
+ x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0
215
+ anchors.append([x0, y0, x1, y1])
216
+ return torch.tensor(anchors)
217
+
218
+ def forward(self, features: List[torch.Tensor]):
219
+ """
220
+ Args:
221
+ features (list[Tensor]): list of backbone feature maps on which to generate anchors.
222
+
223
+ Returns:
224
+ list[Boxes]: a list of Boxes containing all the anchors for each feature map
225
+ (i.e. the cell anchors repeated over all locations in the feature map).
226
+ The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
227
+ where Hi, Wi are resolution of the feature map divided by anchor stride.
228
+ """
229
+ grid_sizes = [feature_map.shape[-2:] for feature_map in features]
230
+ anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
231
+ return [Boxes(x) for x in anchors_over_all_feature_maps]
232
+
233
+
234
+ @ANCHOR_GENERATOR_REGISTRY.register()
235
+ class RotatedAnchorGenerator(nn.Module):
236
+ """
237
+ Compute rotated anchors used by Rotated RPN (RRPN), described in
238
+ "Arbitrary-Oriented Scene Text Detection via Rotation Proposals".
239
+ """
240
+
241
+ box_dim: int = 5
242
+ """
243
+ the dimension of each anchor box.
244
+ """
245
+
246
+ @configurable
247
+ def __init__(self, *, sizes, aspect_ratios, strides, angles, offset=0.5):
248
+ """
249
+ This interface is experimental.
250
+
251
+ Args:
252
+ sizes (list[list[float]] or list[float]):
253
+ If sizes is list[list[float]], sizes[i] is the list of anchor sizes
254
+ (i.e. sqrt of anchor area) to use for the i-th feature map.
255
+ If sizes is list[float], the sizes are used for all feature maps.
256
+ Anchor sizes are given in absolute lengths in units of
257
+ the input image; they do not dynamically scale if the input image size changes.
258
+ aspect_ratios (list[list[float]] or list[float]): list of aspect ratios
259
+ (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies.
260
+ strides (list[int]): stride of each input feature.
261
+ angles (list[list[float]] or list[float]): list of angles (in degrees CCW)
262
+ to use for anchors. Same "broadcast" rule for `sizes` applies.
263
+ offset (float): Relative offset between the center of the first anchor and the top-left
264
+ corner of the image. Value has to be in [0, 1).
265
+ It is recommended to use 0.5, which means half a stride.
266
+ """
267
+ super().__init__()
268
+
269
+ self.strides = strides
270
+ self.num_features = len(self.strides)
271
+ sizes = _broadcast_params(sizes, self.num_features, "sizes")
272
+ aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios")
273
+ angles = _broadcast_params(angles, self.num_features, "angles")
274
+ self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles)
275
+
276
+ self.offset = offset
277
+ assert 0.0 <= self.offset < 1.0, self.offset
278
+
279
+ @classmethod
280
+ def from_config(cls, cfg, input_shape: List[ShapeSpec]):
281
+ return {
282
+ "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES,
283
+ "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS,
284
+ "strides": [x.stride for x in input_shape],
285
+ "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET,
286
+ "angles": cfg.MODEL.ANCHOR_GENERATOR.ANGLES,
287
+ }
288
+
289
+ def _calculate_anchors(self, sizes, aspect_ratios, angles):
290
+ cell_anchors = [
291
+ self.generate_cell_anchors(size, aspect_ratio, angle).float()
292
+ for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles)
293
+ ]
294
+ return BufferList(cell_anchors)
295
+
296
+ @property
297
+ def num_cell_anchors(self):
298
+ """
299
+ Alias of `num_anchors`.
300
+ """
301
+ return self.num_anchors
302
+
303
+ @property
304
+ def num_anchors(self):
305
+ """
306
+ Returns:
307
+ list[int]: Each int is the number of anchors at every pixel
308
+ location, on that feature map.
309
+ For example, if at every pixel we use anchors of 3 aspect
310
+ ratios, 2 sizes and 5 angles, the number of anchors is 30.
311
+ (See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS
312
+ and ANCHOR_GENERATOR.ANGLES in config)
313
+
314
+ In standard RRPN models, `num_anchors` on every feature map is the same.
315
+ """
316
+ return [len(cell_anchors) for cell_anchors in self.cell_anchors]
317
+
318
+ def _grid_anchors(self, grid_sizes):
319
+ anchors = []
320
+ for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors):
321
+ shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors)
322
+ zeros = torch.zeros_like(shift_x)
323
+ shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1)
324
+
325
+ anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5))
326
+
327
+ return anchors
328
+
329
+ def generate_cell_anchors(
330
+ self,
331
+ sizes=(32, 64, 128, 256, 512),
332
+ aspect_ratios=(0.5, 1, 2),
333
+ angles=(-90, -60, -30, 0, 30, 60, 90),
334
+ ):
335
+ """
336
+ Generate a tensor storing canonical anchor boxes, which are all anchor
337
+ boxes of different sizes, aspect_ratios, angles centered at (0, 0).
338
+ We can later build the set of anchors for a full feature map by
339
+ shifting and tiling these tensors (see `meth:_grid_anchors`).
340
+
341
+ Args:
342
+ sizes (tuple[float]):
343
+ aspect_ratios (tuple[float]):
344
+ angles (tuple[float]):
345
+
346
+ Returns:
347
+ Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5)
348
+ storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format.
349
+ """
350
+ anchors = []
351
+ for size in sizes:
352
+ area = size**2.0
353
+ for aspect_ratio in aspect_ratios:
354
+ # s * s = w * h
355
+ # a = h / w
356
+ # ... some algebra ...
357
+ # w = sqrt(s * s / a)
358
+ # h = a * w
359
+ w = math.sqrt(area / aspect_ratio)
360
+ h = aspect_ratio * w
361
+ anchors.extend([0, 0, w, h, a] for a in angles)
362
+
363
+ return torch.tensor(anchors)
364
+
365
+ def forward(self, features):
366
+ """
367
+ Args:
368
+ features (list[Tensor]): list of backbone feature maps on which to generate anchors.
369
+
370
+ Returns:
371
+ list[RotatedBoxes]: a list of Boxes containing all the anchors for each feature map
372
+ (i.e. the cell anchors repeated over all locations in the feature map).
373
+ The number of anchors of each feature map is Hi x Wi x num_cell_anchors,
374
+ where Hi, Wi are resolution of the feature map divided by anchor stride.
375
+ """
376
+ grid_sizes = [feature_map.shape[-2:] for feature_map in features]
377
+ anchors_over_all_feature_maps = self._grid_anchors(grid_sizes)
378
+ return [RotatedBoxes(x) for x in anchors_over_all_feature_maps]
379
+
380
+
381
+ def build_anchor_generator(cfg, input_shape):
382
+ """
383
+ Build an anchor generator from `cfg.MODEL.ANCHOR_GENERATOR.NAME`.
384
+ """
385
+ anchor_generator = cfg.MODEL.ANCHOR_GENERATOR.NAME
386
+ return ANCHOR_GENERATOR_REGISTRY.get(anchor_generator)(cfg, input_shape)
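
A minimal sketch of DefaultAnchorGenerator constructed directly with keyword arguments, which @configurable allows alongside the cfg-based path (assumptions: the import works under this repository layout; the feature-map size and anchor settings are made up):

import torch

from annotator.oneformer.detectron2.modeling.anchor_generator import DefaultAnchorGenerator

# One stride-16 feature map, two sizes and three aspect ratios
# -> 2 * 3 = 6 cell anchors per spatial location.
gen = DefaultAnchorGenerator(sizes=[[32, 64]], aspect_ratios=[[0.5, 1.0, 2.0]], strides=[16])

feat = torch.randn(1, 256, 25, 38)  # (N, C, Hi, Wi)
(anchors,) = gen([feat])            # one Boxes object per feature map
print(anchors.tensor.shape)         # torch.Size([5700, 4]), i.e. 25 * 38 * 6 anchors
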
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/__init__.py ADDED
@@ -0,0 +1,20 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from .build import build_backbone, BACKBONE_REGISTRY # noqa F401 isort:skip
3
+
4
+ from .backbone import Backbone
5
+ from .fpn import FPN
6
+ from .regnet import RegNet
7
+ from .resnet import (
8
+ BasicStem,
9
+ ResNet,
10
+ ResNetBlockBase,
11
+ build_resnet_backbone,
12
+ make_stage,
13
+ BottleneckBlock,
14
+ )
15
+ from .vit import ViT, SimpleFeaturePyramid, get_vit_lr_decay_rate
16
+ from .mvit import MViT
17
+ from .swin import SwinTransformer
18
+
19
+ __all__ = [k for k in globals().keys() if not k.startswith("_")]
20
+ # TODO can expose more resnet blocks after careful consideration
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/backbone.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from abc import ABCMeta, abstractmethod
3
+ from typing import Dict
4
+ import torch.nn as nn
5
+
6
+ from annotator.oneformer.detectron2.layers import ShapeSpec
7
+
8
+ __all__ = ["Backbone"]
9
+
10
+
11
+ class Backbone(nn.Module, metaclass=ABCMeta):
12
+ """
13
+ Abstract base class for network backbones.
14
+ """
15
+
16
+ def __init__(self):
17
+ """
18
+ The `__init__` method of any subclass can specify its own set of arguments.
19
+ """
20
+ super().__init__()
21
+
22
+ @abstractmethod
23
+ def forward(self):
24
+ """
25
+ Subclasses must override this method, but adhere to the same return type.
26
+
27
+ Returns:
28
+ dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
29
+ """
30
+ pass
31
+
32
+ @property
33
+ def size_divisibility(self) -> int:
34
+ """
35
+ Some backbones require the input height and width to be divisible by a
36
+ specific integer. This is typically true for encoder / decoder type networks
37
+ with lateral connection (e.g., FPN) for which feature maps need to match
38
+ dimension in the "bottom up" and "top down" paths. Set to 0 if no specific
39
+ input size divisibility is required.
40
+ """
41
+ return 0
42
+
43
+ @property
44
+ def padding_constraints(self) -> Dict[str, int]:
45
+ """
46
+ This property is a generalization of size_divisibility. Some backbones and training
47
+ recipes require specific padding constraints, such as enforcing divisibility by a specific
48
+ integer (e.g., FPN) or padding to a square (e.g., ViTDet with large-scale jitter
49
+ in :paper:vitdet). `padding_constraints` contains these optional items like:
50
+ {
51
+ "size_divisibility": int,
52
+ "square_size": int,
53
+ # Future options are possible
54
+ }
55
+ `size_divisibility` will be read from here if present, and `square_size` indicates the
56
+ square padding size if `square_size` > 0.
57
+
58
+ TODO: use type of Dict[str, int] to avoid torchscript issues. The type of padding_constraints
59
+ could be generalized as TypedDict (Python 3.8+) to support more types in the future.
60
+ """
61
+ return {}
62
+
63
+ def output_shape(self):
64
+ """
65
+ Returns:
66
+ dict[str->ShapeSpec]
67
+ """
68
+ # this is a backward-compatible default
69
+ return {
70
+ name: ShapeSpec(
71
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
72
+ )
73
+ for name in self._out_features
74
+ }
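
To make the contract concrete, a minimal, hypothetical Backbone subclass (a sketch only; the single-level design, names, and channel counts are invented for illustration):

import torch
import torch.nn as nn

from annotator.oneformer.detectron2.modeling.backbone.backbone import Backbone


class ToyBackbone(Backbone):
    """A single-level, stride-16 backbone illustrating the required attributes."""

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 64, kernel_size=16, stride=16)
        # These three attributes drive the default output_shape() implementation.
        self._out_features = ["toy"]
        self._out_feature_channels = {"toy": 64}
        self._out_feature_strides = {"toy": 16}

    def forward(self, x):
        return {"toy": self.conv(x)}  # dict[str -> Tensor], as the base class requires


spec = ToyBackbone().output_shape()["toy"]  # ShapeSpec(channels=64, stride=16)
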
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/build.py ADDED
@@ -0,0 +1,33 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from annotator.oneformer.detectron2.layers import ShapeSpec
3
+ from annotator.oneformer.detectron2.utils.registry import Registry
4
+
5
+ from .backbone import Backbone
6
+
7
+ BACKBONE_REGISTRY = Registry("BACKBONE")
8
+ BACKBONE_REGISTRY.__doc__ = """
9
+ Registry for backbones, which extract feature maps from images
10
+
11
+ The registered object must be a callable that accepts two arguments:
12
+
13
+ 1. A :class:`detectron2.config.CfgNode`
14
+ 2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification.
15
+
16
+ The registered object must return an instance of :class:`Backbone`.
17
+ """
18
+
19
+
20
+ def build_backbone(cfg, input_shape=None):
21
+ """
22
+ Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
23
+
24
+ Returns:
25
+ an instance of :class:`Backbone`
26
+ """
27
+ if input_shape is None:
28
+ input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
29
+
30
+ backbone_name = cfg.MODEL.BACKBONE.NAME
31
+ backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape)
32
+ assert isinstance(backbone, Backbone)
33
+ return backbone
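
Registering such a backbone and building it from a config might look like the following sketch (assumptions: get_cfg is importable under this repository layout, and ToyBackbone refers to the hypothetical class sketched after backbone.py above):

from annotator.oneformer.detectron2.config import get_cfg
from annotator.oneformer.detectron2.modeling.backbone.build import BACKBONE_REGISTRY, build_backbone


@BACKBONE_REGISTRY.register()
def build_toy_backbone(cfg, input_shape):
    # The builder receives the cfg and an input ShapeSpec and must return a Backbone;
    # here it returns the hypothetical ToyBackbone from the sketch above.
    return ToyBackbone()


cfg = get_cfg()
cfg.MODEL.BACKBONE.NAME = "build_toy_backbone"
backbone = build_backbone(cfg)  # input_shape defaults to ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
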
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/fpn.py ADDED
@@ -0,0 +1,268 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import math
3
+ import fvcore.nn.weight_init as weight_init
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
9
+
10
+ from .backbone import Backbone
11
+ from .build import BACKBONE_REGISTRY
12
+ from .resnet import build_resnet_backbone
13
+
14
+ __all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"]
15
+
16
+
17
+ class FPN(Backbone):
18
+ """
19
+ This module implements :paper:`FPN`.
20
+ It creates pyramid features built on top of some input feature maps.
21
+ """
22
+
23
+ _fuse_type: torch.jit.Final[str]
24
+
25
+ def __init__(
26
+ self,
27
+ bottom_up,
28
+ in_features,
29
+ out_channels,
30
+ norm="",
31
+ top_block=None,
32
+ fuse_type="sum",
33
+ square_pad=0,
34
+ ):
35
+ """
36
+ Args:
37
+ bottom_up (Backbone): module representing the bottom up subnetwork.
38
+ Must be a subclass of :class:`Backbone`. The multi-scale feature
39
+ maps generated by the bottom up network, and listed in `in_features`,
40
+ are used to generate FPN levels.
41
+ in_features (list[str]): names of the input feature maps coming
42
+ from the backbone to which FPN is attached. For example, if the
43
+ backbone produces ["res2", "res3", "res4"], any *contiguous* sublist
44
+ of these may be used; order must be from high to low resolution.
45
+ out_channels (int): number of channels in the output feature maps.
46
+ norm (str): the normalization to use.
47
+ top_block (nn.Module or None): if provided, an extra operation will
48
+ be performed on the output of the last (smallest resolution)
49
+ FPN output, and the result will extend the result list. The top_block
50
+ further downsamples the feature map. It must have an attribute
51
+ "num_levels", meaning the number of extra FPN levels added by
52
+ this block, and "in_feature", which is a string representing
53
+ its input feature (e.g., p5).
54
+ fuse_type (str): types for fusing the top down features and the lateral
55
+ ones. It can be "sum" (default), which sums up element-wise; or "avg",
56
+ which takes the element-wise mean of the two.
57
+ square_pad (int): If > 0, require input images to be padded to specific square size.
58
+ """
59
+ super(FPN, self).__init__()
60
+ assert isinstance(bottom_up, Backbone)
61
+ assert in_features, in_features
62
+
63
+ # Feature map strides and channels from the bottom up network (e.g. ResNet)
64
+ input_shapes = bottom_up.output_shape()
65
+ strides = [input_shapes[f].stride for f in in_features]
66
+ in_channels_per_feature = [input_shapes[f].channels for f in in_features]
67
+
68
+ _assert_strides_are_log2_contiguous(strides)
69
+ lateral_convs = []
70
+ output_convs = []
71
+
72
+ use_bias = norm == ""
73
+ for idx, in_channels in enumerate(in_channels_per_feature):
74
+ lateral_norm = get_norm(norm, out_channels)
75
+ output_norm = get_norm(norm, out_channels)
76
+
77
+ lateral_conv = Conv2d(
78
+ in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm
79
+ )
80
+ output_conv = Conv2d(
81
+ out_channels,
82
+ out_channels,
83
+ kernel_size=3,
84
+ stride=1,
85
+ padding=1,
86
+ bias=use_bias,
87
+ norm=output_norm,
88
+ )
89
+ weight_init.c2_xavier_fill(lateral_conv)
90
+ weight_init.c2_xavier_fill(output_conv)
91
+ stage = int(math.log2(strides[idx]))
92
+ self.add_module("fpn_lateral{}".format(stage), lateral_conv)
93
+ self.add_module("fpn_output{}".format(stage), output_conv)
94
+
95
+ lateral_convs.append(lateral_conv)
96
+ output_convs.append(output_conv)
97
+ # Place convs into top-down order (from low to high resolution)
98
+ # to make the top-down computation in forward clearer.
99
+ self.lateral_convs = lateral_convs[::-1]
100
+ self.output_convs = output_convs[::-1]
101
+ self.top_block = top_block
102
+ self.in_features = tuple(in_features)
103
+ self.bottom_up = bottom_up
104
+ # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
105
+ self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
106
+ # top block output feature maps.
107
+ if self.top_block is not None:
108
+ for s in range(stage, stage + self.top_block.num_levels):
109
+ self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
110
+
111
+ self._out_features = list(self._out_feature_strides.keys())
112
+ self._out_feature_channels = {k: out_channels for k in self._out_features}
113
+ self._size_divisibility = strides[-1]
114
+ self._square_pad = square_pad
115
+ assert fuse_type in {"avg", "sum"}
116
+ self._fuse_type = fuse_type
117
+
118
+ @property
119
+ def size_divisibility(self):
120
+ return self._size_divisibility
121
+
122
+ @property
123
+ def padding_constraints(self):
124
+ return {"square_size": self._square_pad}
125
+
126
+ def forward(self, x):
127
+ """
128
+ Args:
129
+ x (Tensor): input to the bottom-up backbone (e.g., an NCHW image tensor); its
130
+ multi-scale feature maps are then used to build the FPN levels, in high to low resolution order.
131
+
132
+ Returns:
133
+ dict[str->Tensor]:
134
+ mapping from feature map name to FPN feature map tensor
135
+ in high to low resolution order. Returned feature names follow the FPN
136
+ paper convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
137
+ ["p2", "p3", ..., "p6"].
138
+ """
139
+ bottom_up_features = self.bottom_up(x)
140
+ results = []
141
+ prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]])
142
+ results.append(self.output_convs[0](prev_features))
143
+
144
+ # Reverse feature maps into top-down order (from low to high resolution)
145
+ for idx, (lateral_conv, output_conv) in enumerate(
146
+ zip(self.lateral_convs, self.output_convs)
147
+ ):
148
+ # Slicing of ModuleList is not supported https://github.com/pytorch/pytorch/issues/47336
149
+ # Therefore we loop over all modules but skip the first one
150
+ if idx > 0:
151
+ features = self.in_features[-idx - 1]
152
+ features = bottom_up_features[features]
153
+ top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest")
154
+ lateral_features = lateral_conv(features)
155
+ prev_features = lateral_features + top_down_features
156
+ if self._fuse_type == "avg":
157
+ prev_features /= 2
158
+ results.insert(0, output_conv(prev_features))
159
+
160
+ if self.top_block is not None:
161
+ if self.top_block.in_feature in bottom_up_features:
162
+ top_block_in_feature = bottom_up_features[self.top_block.in_feature]
163
+ else:
164
+ top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
165
+ results.extend(self.top_block(top_block_in_feature))
166
+ assert len(self._out_features) == len(results)
167
+ return {f: res for f, res in zip(self._out_features, results)}
168
+
169
+ def output_shape(self):
170
+ return {
171
+ name: ShapeSpec(
172
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
173
+ )
174
+ for name in self._out_features
175
+ }
176
+
177
+
178
+ def _assert_strides_are_log2_contiguous(strides):
179
+ """
180
+ Assert that each stride is 2x its preceding stride, i.e. "contiguous in log2".
181
+ """
182
+ for i, stride in enumerate(strides[1:], 1):
183
+ assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(
184
+ stride, strides[i - 1]
185
+ )
186
+
187
+
188
+ class LastLevelMaxPool(nn.Module):
189
+ """
190
+ This module is used in the original FPN to generate a downsampled
191
+ P6 feature from P5.
192
+ """
193
+
194
+ def __init__(self):
195
+ super().__init__()
196
+ self.num_levels = 1
197
+ self.in_feature = "p5"
198
+
199
+ def forward(self, x):
200
+ return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
201
+
202
+
203
+ class LastLevelP6P7(nn.Module):
204
+ """
205
+ This module is used in RetinaNet to generate extra layers, P6 and P7 from
206
+ the C5 feature.
207
+ """
208
+
209
+ def __init__(self, in_channels, out_channels, in_feature="res5"):
210
+ super().__init__()
211
+ self.num_levels = 2
212
+ self.in_feature = in_feature
213
+ self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
214
+ self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
215
+ for module in [self.p6, self.p7]:
216
+ weight_init.c2_xavier_fill(module)
217
+
218
+ def forward(self, c5):
219
+ p6 = self.p6(c5)
220
+ p7 = self.p7(F.relu(p6))
221
+ return [p6, p7]
222
+
223
+
224
+ @BACKBONE_REGISTRY.register()
225
+ def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
226
+ """
227
+ Args:
228
+ cfg: a detectron2 CfgNode
229
+
230
+ Returns:
231
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
232
+ """
233
+ bottom_up = build_resnet_backbone(cfg, input_shape)
234
+ in_features = cfg.MODEL.FPN.IN_FEATURES
235
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
236
+ backbone = FPN(
237
+ bottom_up=bottom_up,
238
+ in_features=in_features,
239
+ out_channels=out_channels,
240
+ norm=cfg.MODEL.FPN.NORM,
241
+ top_block=LastLevelMaxPool(),
242
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
243
+ )
244
+ return backbone
245
+
246
+
247
+ @BACKBONE_REGISTRY.register()
248
+ def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec):
249
+ """
250
+ Args:
251
+ cfg: a detectron2 CfgNode
252
+
253
+ Returns:
254
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
255
+ """
256
+ bottom_up = build_resnet_backbone(cfg, input_shape)
257
+ in_features = cfg.MODEL.FPN.IN_FEATURES
258
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
259
+ in_channels_p6p7 = bottom_up.output_shape()["res5"].channels
260
+ backbone = FPN(
261
+ bottom_up=bottom_up,
262
+ in_features=in_features,
263
+ out_channels=out_channels,
264
+ norm=cfg.MODEL.FPN.NORM,
265
+ top_block=LastLevelP6P7(in_channels_p6p7, out_channels),
266
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
267
+ )
268
+ return backbone
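
For reference, the FPN added above can also be wired up without a CfgNode. The sketch below is not part of this commit; it assumes the vendored import paths (`annotator.oneformer.detectron2...`) resolve as in the files above, and the channel/feature choices are illustrative.

```python
# Minimal sketch (assumption: vendored import paths work as shown in this commit).
import torch
from annotator.oneformer.detectron2.modeling.backbone.resnet import ResNet, BasicStem
from annotator.oneformer.detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool

# Bottom-up network: a plain ResNet-50 returning res2..res5.
bottom_up = ResNet(
    BasicStem(in_channels=3, out_channels=64, norm="BN"),
    ResNet.make_default_stages(50, norm="BN"),
    out_features=["res2", "res3", "res4", "res5"],
)

# FPN with 256-channel lateral/output convs; LastLevelMaxPool adds p6 from p5.
fpn = FPN(
    bottom_up=bottom_up,
    in_features=["res2", "res3", "res4", "res5"],
    out_channels=256,
    top_block=LastLevelMaxPool(),
    fuse_type="sum",
)

# H and W must be divisible by fpn.size_divisibility (32 here, the stride of res5).
feats = fpn(torch.randn(1, 3, 256, 256))
print({k: tuple(v.shape) for k, v in feats.items()})  # p2..p6, all with 256 channels
```
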
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/mvit.py ADDED
@@ -0,0 +1,448 @@
1
+ import logging
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .backbone import Backbone
7
+ from .utils import (
8
+ PatchEmbed,
9
+ add_decomposed_rel_pos,
10
+ get_abs_pos,
11
+ window_partition,
12
+ window_unpartition,
13
+ )
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ __all__ = ["MViT"]
19
+
20
+
21
+ def attention_pool(x, pool, norm=None):
22
+ # (B, H, W, C) -> (B, C, H, W)
23
+ x = x.permute(0, 3, 1, 2)
24
+ x = pool(x)
25
+ # (B, C, H1, W1) -> (B, H1, W1, C)
26
+ x = x.permute(0, 2, 3, 1)
27
+ if norm:
28
+ x = norm(x)
29
+
30
+ return x
31
+
32
+
33
+ class MultiScaleAttention(nn.Module):
34
+ """Multiscale Multi-head Attention block."""
35
+
36
+ def __init__(
37
+ self,
38
+ dim,
39
+ dim_out,
40
+ num_heads,
41
+ qkv_bias=True,
42
+ norm_layer=nn.LayerNorm,
43
+ pool_kernel=(3, 3),
44
+ stride_q=1,
45
+ stride_kv=1,
46
+ residual_pooling=True,
47
+ window_size=0,
48
+ use_rel_pos=False,
49
+ rel_pos_zero_init=True,
50
+ input_size=None,
51
+ ):
52
+ """
53
+ Args:
54
+ dim (int): Number of input channels.
55
+ dim_out (int): Number of output channels.
56
+ num_heads (int): Number of attention heads.
57
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
58
+ norm_layer (nn.Module): Normalization layer.
59
+ pool_kernel (tuple): kernel size for qkv pooling layers.
60
+ stride_q (int): stride size for q pooling layer.
61
+ stride_kv (int): stride size for kv pooling layer.
62
+ residual_pooling (bool): If true, enable residual pooling.
63
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
64
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
65
+ input_size (int or None): Input resolution.
66
+ """
67
+ super().__init__()
68
+ self.num_heads = num_heads
69
+ head_dim = dim_out // num_heads
70
+ self.scale = head_dim**-0.5
71
+
72
+ self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias)
73
+ self.proj = nn.Linear(dim_out, dim_out)
74
+
75
+ # qkv pooling
76
+ pool_padding = [k // 2 for k in pool_kernel]
77
+ dim_conv = dim_out // num_heads
78
+ self.pool_q = nn.Conv2d(
79
+ dim_conv,
80
+ dim_conv,
81
+ pool_kernel,
82
+ stride=stride_q,
83
+ padding=pool_padding,
84
+ groups=dim_conv,
85
+ bias=False,
86
+ )
87
+ self.norm_q = norm_layer(dim_conv)
88
+ self.pool_k = nn.Conv2d(
89
+ dim_conv,
90
+ dim_conv,
91
+ pool_kernel,
92
+ stride=stride_kv,
93
+ padding=pool_padding,
94
+ groups=dim_conv,
95
+ bias=False,
96
+ )
97
+ self.norm_k = norm_layer(dim_conv)
98
+ self.pool_v = nn.Conv2d(
99
+ dim_conv,
100
+ dim_conv,
101
+ pool_kernel,
102
+ stride=stride_kv,
103
+ padding=pool_padding,
104
+ groups=dim_conv,
105
+ bias=False,
106
+ )
107
+ self.norm_v = norm_layer(dim_conv)
108
+
109
+ self.window_size = window_size
110
+ if window_size:
111
+ self.q_win_size = window_size // stride_q
112
+ self.kv_win_size = window_size // stride_kv
113
+ self.residual_pooling = residual_pooling
114
+
115
+ self.use_rel_pos = use_rel_pos
116
+ if self.use_rel_pos:
117
+ # initialize relative positional embeddings
118
+ assert input_size[0] == input_size[1]
119
+ size = input_size[0]
120
+ rel_dim = 2 * max(size // stride_q, size // stride_kv) - 1
121
+ self.rel_pos_h = nn.Parameter(torch.zeros(rel_dim, head_dim))
122
+ self.rel_pos_w = nn.Parameter(torch.zeros(rel_dim, head_dim))
123
+
124
+ if not rel_pos_zero_init:
125
+ nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
126
+ nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
127
+
128
+ def forward(self, x):
129
+ B, H, W, _ = x.shape
130
+ # qkv with shape (3, B, nHead, H, W, C)
131
+ qkv = self.qkv(x).reshape(B, H, W, 3, self.num_heads, -1).permute(3, 0, 4, 1, 2, 5)
132
+ # q, k, v with shape (B * nHead, H, W, C)
133
+ q, k, v = qkv.reshape(3, B * self.num_heads, H, W, -1).unbind(0)
134
+
135
+ q = attention_pool(q, self.pool_q, self.norm_q)
136
+ k = attention_pool(k, self.pool_k, self.norm_k)
137
+ v = attention_pool(v, self.pool_v, self.norm_v)
138
+
139
+ ori_q = q
140
+ if self.window_size:
141
+ q, q_hw_pad = window_partition(q, self.q_win_size)
142
+ k, kv_hw_pad = window_partition(k, self.kv_win_size)
143
+ v, _ = window_partition(v, self.kv_win_size)
144
+ q_hw = (self.q_win_size, self.q_win_size)
145
+ kv_hw = (self.kv_win_size, self.kv_win_size)
146
+ else:
147
+ q_hw = q.shape[1:3]
148
+ kv_hw = k.shape[1:3]
149
+
150
+ q = q.view(q.shape[0], np.prod(q_hw), -1)
151
+ k = k.view(k.shape[0], np.prod(kv_hw), -1)
152
+ v = v.view(v.shape[0], np.prod(kv_hw), -1)
153
+
154
+ attn = (q * self.scale) @ k.transpose(-2, -1)
155
+
156
+ if self.use_rel_pos:
157
+ attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, q_hw, kv_hw)
158
+
159
+ attn = attn.softmax(dim=-1)
160
+ x = attn @ v
161
+
162
+ x = x.view(x.shape[0], q_hw[0], q_hw[1], -1)
163
+
164
+ if self.window_size:
165
+ x = window_unpartition(x, self.q_win_size, q_hw_pad, ori_q.shape[1:3])
166
+
167
+ if self.residual_pooling:
168
+ x += ori_q
169
+
170
+ H, W = x.shape[1], x.shape[2]
171
+ x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
172
+ x = self.proj(x)
173
+
174
+ return x
175
+
176
+
177
+ class MultiScaleBlock(nn.Module):
178
+ """Multiscale Transformer blocks"""
179
+
180
+ def __init__(
181
+ self,
182
+ dim,
183
+ dim_out,
184
+ num_heads,
185
+ mlp_ratio=4.0,
186
+ qkv_bias=True,
187
+ drop_path=0.0,
188
+ norm_layer=nn.LayerNorm,
189
+ act_layer=nn.GELU,
190
+ qkv_pool_kernel=(3, 3),
191
+ stride_q=1,
192
+ stride_kv=1,
193
+ residual_pooling=True,
194
+ window_size=0,
195
+ use_rel_pos=False,
196
+ rel_pos_zero_init=True,
197
+ input_size=None,
198
+ ):
199
+ """
200
+ Args:
201
+ dim (int): Number of input channels.
202
+ dim_out (int): Number of output channels.
203
+ num_heads (int): Number of attention heads in the MViT block.
204
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
205
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
206
+ drop_path (float): Stochastic depth rate.
207
+ norm_layer (nn.Module): Normalization layer.
208
+ act_layer (nn.Module): Activation layer.
209
+ qkv_pool_kernel (tuple): kernel size for qkv pooling layers.
210
+ stride_q (int): stride size for q pooling layer.
211
+ stride_kv (int): stride size for kv pooling layer.
212
+ residual_pooling (bool): If true, enable residual pooling.
213
+ window_size (int): Window size for window attention blocks. If it equals 0, window
214
+ attention is not used.
215
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
216
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
217
+ input_size (int or None): Input resolution.
218
+ """
219
+ super().__init__()
220
+ self.norm1 = norm_layer(dim)
221
+ self.attn = MultiScaleAttention(
222
+ dim,
223
+ dim_out,
224
+ num_heads=num_heads,
225
+ qkv_bias=qkv_bias,
226
+ norm_layer=norm_layer,
227
+ pool_kernel=qkv_pool_kernel,
228
+ stride_q=stride_q,
229
+ stride_kv=stride_kv,
230
+ residual_pooling=residual_pooling,
231
+ window_size=window_size,
232
+ use_rel_pos=use_rel_pos,
233
+ rel_pos_zero_init=rel_pos_zero_init,
234
+ input_size=input_size,
235
+ )
236
+
237
+ from timm.models.layers import DropPath, Mlp
238
+
239
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
240
+ self.norm2 = norm_layer(dim_out)
241
+ self.mlp = Mlp(
242
+ in_features=dim_out,
243
+ hidden_features=int(dim_out * mlp_ratio),
244
+ out_features=dim_out,
245
+ act_layer=act_layer,
246
+ )
247
+
248
+ if dim != dim_out:
249
+ self.proj = nn.Linear(dim, dim_out)
250
+
251
+ if stride_q > 1:
252
+ kernel_skip = stride_q + 1
253
+ padding_skip = int(kernel_skip // 2)
254
+ self.pool_skip = nn.MaxPool2d(kernel_skip, stride_q, padding_skip, ceil_mode=False)
255
+
256
+ def forward(self, x):
257
+ x_norm = self.norm1(x)
258
+ x_block = self.attn(x_norm)
259
+
260
+ if hasattr(self, "proj"):
261
+ x = self.proj(x_norm)
262
+ if hasattr(self, "pool_skip"):
263
+ x = attention_pool(x, self.pool_skip)
264
+
265
+ x = x + self.drop_path(x_block)
266
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
267
+
268
+ return x
269
+
270
+
271
+ class MViT(Backbone):
272
+ """
273
+ This module implements Multiscale Vision Transformer (MViT) backbone in :paper:'mvitv2'.
274
+ """
275
+
276
+ def __init__(
277
+ self,
278
+ img_size=224,
279
+ patch_kernel=(7, 7),
280
+ patch_stride=(4, 4),
281
+ patch_padding=(3, 3),
282
+ in_chans=3,
283
+ embed_dim=96,
284
+ depth=16,
285
+ num_heads=1,
286
+ last_block_indexes=(0, 2, 11, 15),
287
+ qkv_pool_kernel=(3, 3),
288
+ adaptive_kv_stride=4,
289
+ adaptive_window_size=56,
290
+ residual_pooling=True,
291
+ mlp_ratio=4.0,
292
+ qkv_bias=True,
293
+ drop_path_rate=0.0,
294
+ norm_layer=nn.LayerNorm,
295
+ act_layer=nn.GELU,
296
+ use_abs_pos=False,
297
+ use_rel_pos=True,
298
+ rel_pos_zero_init=True,
299
+ use_act_checkpoint=False,
300
+ pretrain_img_size=224,
301
+ pretrain_use_cls_token=True,
302
+ out_features=("scale2", "scale3", "scale4", "scale5"),
303
+ ):
304
+ """
305
+ Args:
306
+ img_size (int): Input image size.
307
+ patch_kernel (tuple): kernel size for patch embedding.
308
+ patch_stride (tuple): stride size for patch embedding.
309
+ patch_padding (tuple): padding size for patch embedding.
310
+ in_chans (int): Number of input image channels.
311
+ embed_dim (int): Patch embedding dimension.
312
+ depth (int): Depth of MViT.
313
+ num_heads (int): Number of base attention heads in each MViT block.
314
+ last_block_indexes (tuple): Block indexes for last blocks in each stage.
315
+ qkv_pool_kernel (tuple): kernel size for qkv pooling layers.
316
+ adaptive_kv_stride (int): adaptive stride size for kv pooling.
317
+ adaptive_window_size (int): adaptive window size for window attention blocks.
318
+ residual_pooling (bool): If true, enable residual pooling.
319
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
320
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
321
+ drop_path_rate (float): Stochastic depth rate.
322
+ norm_layer (nn.Module): Normalization layer.
323
+ act_layer (nn.Module): Activation layer.
324
+ use_abs_pos (bool): If True, use absolute positional embeddings.
325
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
326
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
327
+ window_size (int): Window size for window attention blocks.
328
+ use_act_checkpoint (bool): If True, use activation checkpointing.
329
+ pretrain_img_size (int): input image size for pretraining models.
330
+ pretrain_use_cls_token (bool): If True, pretraining models use a class token.
331
+ out_features (tuple): name of the feature maps from each stage.
332
+ """
333
+ super().__init__()
334
+ self.pretrain_use_cls_token = pretrain_use_cls_token
335
+
336
+ self.patch_embed = PatchEmbed(
337
+ kernel_size=patch_kernel,
338
+ stride=patch_stride,
339
+ padding=patch_padding,
340
+ in_chans=in_chans,
341
+ embed_dim=embed_dim,
342
+ )
343
+
344
+ if use_abs_pos:
345
+ # Initialize absolute positional embedding with pretrain image size.
346
+ num_patches = (pretrain_img_size // patch_stride[0]) * (
347
+ pretrain_img_size // patch_stride[1]
348
+ )
349
+ num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
350
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
351
+ else:
352
+ self.pos_embed = None
353
+
354
+ # stochastic depth decay rule
355
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
356
+ dim_out = embed_dim
357
+ stride_kv = adaptive_kv_stride
358
+ window_size = adaptive_window_size
359
+ input_size = (img_size // patch_stride[0], img_size // patch_stride[1])
360
+ stage = 2
361
+ stride = patch_stride[0]
362
+ self._out_feature_strides = {}
363
+ self._out_feature_channels = {}
364
+ self.blocks = nn.ModuleList()
365
+ for i in range(depth):
366
+ # Multiply stride_kv by 2 if it's the last block of stage2 and stage3.
367
+ if i == last_block_indexes[1] or i == last_block_indexes[2]:
368
+ stride_kv_ = stride_kv * 2
369
+ else:
370
+ stride_kv_ = stride_kv
371
+ # hybrid window attention: global attention in last three stages.
372
+ window_size_ = 0 if i in last_block_indexes[1:] else window_size
373
+ block = MultiScaleBlock(
374
+ dim=embed_dim,
375
+ dim_out=dim_out,
376
+ num_heads=num_heads,
377
+ mlp_ratio=mlp_ratio,
378
+ qkv_bias=qkv_bias,
379
+ drop_path=dpr[i],
380
+ norm_layer=norm_layer,
381
+ qkv_pool_kernel=qkv_pool_kernel,
382
+ stride_q=2 if i - 1 in last_block_indexes else 1,
383
+ stride_kv=stride_kv_,
384
+ residual_pooling=residual_pooling,
385
+ window_size=window_size_,
386
+ use_rel_pos=use_rel_pos,
387
+ rel_pos_zero_init=rel_pos_zero_init,
388
+ input_size=input_size,
389
+ )
390
+ if use_act_checkpoint:
391
+ # TODO: use torch.utils.checkpoint
392
+ from fairscale.nn.checkpoint import checkpoint_wrapper
393
+
394
+ block = checkpoint_wrapper(block)
395
+ self.blocks.append(block)
396
+
397
+ embed_dim = dim_out
398
+ if i in last_block_indexes:
399
+ name = f"scale{stage}"
400
+ if name in out_features:
401
+ self._out_feature_channels[name] = dim_out
402
+ self._out_feature_strides[name] = stride
403
+ self.add_module(f"{name}_norm", norm_layer(dim_out))
404
+
405
+ dim_out *= 2
406
+ num_heads *= 2
407
+ stride_kv = max(stride_kv // 2, 1)
408
+ stride *= 2
409
+ stage += 1
410
+ if i - 1 in last_block_indexes:
411
+ window_size = window_size // 2
412
+ input_size = [s // 2 for s in input_size]
413
+
414
+ self._out_features = out_features
415
+ self._last_block_indexes = last_block_indexes
416
+
417
+ if self.pos_embed is not None:
418
+ nn.init.trunc_normal_(self.pos_embed, std=0.02)
419
+
420
+ self.apply(self._init_weights)
421
+
422
+ def _init_weights(self, m):
423
+ if isinstance(m, nn.Linear):
424
+ nn.init.trunc_normal_(m.weight, std=0.02)
425
+ if isinstance(m, nn.Linear) and m.bias is not None:
426
+ nn.init.constant_(m.bias, 0)
427
+ elif isinstance(m, nn.LayerNorm):
428
+ nn.init.constant_(m.bias, 0)
429
+ nn.init.constant_(m.weight, 1.0)
430
+
431
+ def forward(self, x):
432
+ x = self.patch_embed(x)
433
+
434
+ if self.pos_embed is not None:
435
+ x = x + get_abs_pos(self.pos_embed, self.pretrain_use_cls_token, x.shape[1:3])
436
+
437
+ outputs = {}
438
+ stage = 2
439
+ for i, blk in enumerate(self.blocks):
440
+ x = blk(x)
441
+ if i in self._last_block_indexes:
442
+ name = f"scale{stage}"
443
+ if name in self._out_features:
444
+ x_out = getattr(self, f"{name}_norm")(x)
445
+ outputs[name] = x_out.permute(0, 3, 1, 2)
446
+ stage += 1
447
+
448
+ return outputs
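
The MViT backbone added above can be exercised standalone. The sketch below is not from this commit; it assumes the vendored import path and that `timm` (used for `DropPath`/`Mlp` inside `MultiScaleBlock`) is installed, and the chosen `out_features` are illustrative.

```python
# Minimal sketch (assumptions: vendored import path; timm available for DropPath/Mlp).
import torch
from annotator.oneformer.detectron2.modeling.backbone.mvit import MViT

# The defaults above describe a 16-block model whose stages end at last_block_indexes;
# out_features selects which "scaleN" maps are returned (scale2=stride 4 ... scale5=stride 32).
model = MViT(img_size=224, out_features=("scale3", "scale4", "scale5"))

feats = model(torch.randn(1, 3, 224, 224))
for name, feat in feats.items():
    print(name, tuple(feat.shape))  # (B, C, H, W) after the per-scale LayerNorm + permute
```
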
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/regnet.py ADDED
@@ -0,0 +1,452 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Implementation of RegNet models from :paper:`dds` and :paper:`scaling`.
4
+
5
+ This code is adapted from https://github.com/facebookresearch/pycls with minimal modifications.
6
+ Some code duplication exists between RegNet and ResNets (e.g., ResStem) in order to simplify
7
+ model loading.
8
+ """
9
+
10
+ import numpy as np
11
+ from torch import nn
12
+
13
+ from annotator.oneformer.detectron2.layers import CNNBlockBase, ShapeSpec, get_norm
14
+
15
+ from .backbone import Backbone
16
+
17
+ __all__ = [
18
+ "AnyNet",
19
+ "RegNet",
20
+ "ResStem",
21
+ "SimpleStem",
22
+ "VanillaBlock",
23
+ "ResBasicBlock",
24
+ "ResBottleneckBlock",
25
+ ]
26
+
27
+
28
+ def conv2d(w_in, w_out, k, *, stride=1, groups=1, bias=False):
29
+ """Helper for building a conv2d layer."""
30
+ assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
31
+ s, p, g, b = stride, (k - 1) // 2, groups, bias
32
+ return nn.Conv2d(w_in, w_out, k, stride=s, padding=p, groups=g, bias=b)
33
+
34
+
35
+ def gap2d():
36
+ """Helper for building a global average pooling layer."""
37
+ return nn.AdaptiveAvgPool2d((1, 1))
38
+
39
+
40
+ def pool2d(k, *, stride=1):
41
+ """Helper for building a pool2d layer."""
42
+ assert k % 2 == 1, "Only odd size kernels supported to avoid padding issues."
43
+ return nn.MaxPool2d(k, stride=stride, padding=(k - 1) // 2)
44
+
45
+
46
+ def init_weights(m):
47
+ """Performs ResNet-style weight initialization."""
48
+ if isinstance(m, nn.Conv2d):
49
+ # Note that there is no bias due to BN
50
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
51
+ m.weight.data.normal_(mean=0.0, std=np.sqrt(2.0 / fan_out))
52
+ elif isinstance(m, nn.BatchNorm2d):
53
+ m.weight.data.fill_(1.0)
54
+ m.bias.data.zero_()
55
+ elif isinstance(m, nn.Linear):
56
+ m.weight.data.normal_(mean=0.0, std=0.01)
57
+ m.bias.data.zero_()
58
+
59
+
60
+ class ResStem(CNNBlockBase):
61
+ """ResNet stem for ImageNet: 7x7, BN, AF, MaxPool."""
62
+
63
+ def __init__(self, w_in, w_out, norm, activation_class):
64
+ super().__init__(w_in, w_out, 4)
65
+ self.conv = conv2d(w_in, w_out, 7, stride=2)
66
+ self.bn = get_norm(norm, w_out)
67
+ self.af = activation_class()
68
+ self.pool = pool2d(3, stride=2)
69
+
70
+ def forward(self, x):
71
+ for layer in self.children():
72
+ x = layer(x)
73
+ return x
74
+
75
+
76
+ class SimpleStem(CNNBlockBase):
77
+ """Simple stem for ImageNet: 3x3, BN, AF."""
78
+
79
+ def __init__(self, w_in, w_out, norm, activation_class):
80
+ super().__init__(w_in, w_out, 2)
81
+ self.conv = conv2d(w_in, w_out, 3, stride=2)
82
+ self.bn = get_norm(norm, w_out)
83
+ self.af = activation_class()
84
+
85
+ def forward(self, x):
86
+ for layer in self.children():
87
+ x = layer(x)
88
+ return x
89
+
90
+
91
+ class SE(nn.Module):
92
+ """Squeeze-and-Excitation (SE) block: AvgPool, FC, Act, FC, Sigmoid."""
93
+
94
+ def __init__(self, w_in, w_se, activation_class):
95
+ super().__init__()
96
+ self.avg_pool = gap2d()
97
+ self.f_ex = nn.Sequential(
98
+ conv2d(w_in, w_se, 1, bias=True),
99
+ activation_class(),
100
+ conv2d(w_se, w_in, 1, bias=True),
101
+ nn.Sigmoid(),
102
+ )
103
+
104
+ def forward(self, x):
105
+ return x * self.f_ex(self.avg_pool(x))
106
+
107
+
108
+ class VanillaBlock(CNNBlockBase):
109
+ """Vanilla block: [3x3 conv, BN, Relu] x2."""
110
+
111
+ def __init__(self, w_in, w_out, stride, norm, activation_class, _params):
112
+ super().__init__(w_in, w_out, stride)
113
+ self.a = conv2d(w_in, w_out, 3, stride=stride)
114
+ self.a_bn = get_norm(norm, w_out)
115
+ self.a_af = activation_class()
116
+ self.b = conv2d(w_out, w_out, 3)
117
+ self.b_bn = get_norm(norm, w_out)
118
+ self.b_af = activation_class()
119
+
120
+ def forward(self, x):
121
+ for layer in self.children():
122
+ x = layer(x)
123
+ return x
124
+
125
+
126
+ class BasicTransform(nn.Module):
127
+ """Basic transformation: [3x3 conv, BN, Relu] x2."""
128
+
129
+ def __init__(self, w_in, w_out, stride, norm, activation_class, _params):
130
+ super().__init__()
131
+ self.a = conv2d(w_in, w_out, 3, stride=stride)
132
+ self.a_bn = get_norm(norm, w_out)
133
+ self.a_af = activation_class()
134
+ self.b = conv2d(w_out, w_out, 3)
135
+ self.b_bn = get_norm(norm, w_out)
136
+ self.b_bn.final_bn = True
137
+
138
+ def forward(self, x):
139
+ for layer in self.children():
140
+ x = layer(x)
141
+ return x
142
+
143
+
144
+ class ResBasicBlock(CNNBlockBase):
145
+ """Residual basic block: x + f(x), f = basic transform."""
146
+
147
+ def __init__(self, w_in, w_out, stride, norm, activation_class, params):
148
+ super().__init__(w_in, w_out, stride)
149
+ self.proj, self.bn = None, None
150
+ if (w_in != w_out) or (stride != 1):
151
+ self.proj = conv2d(w_in, w_out, 1, stride=stride)
152
+ self.bn = get_norm(norm, w_out)
153
+ self.f = BasicTransform(w_in, w_out, stride, norm, activation_class, params)
154
+ self.af = activation_class()
155
+
156
+ def forward(self, x):
157
+ x_p = self.bn(self.proj(x)) if self.proj else x
158
+ return self.af(x_p + self.f(x))
159
+
160
+
161
+ class BottleneckTransform(nn.Module):
162
+ """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1."""
163
+
164
+ def __init__(self, w_in, w_out, stride, norm, activation_class, params):
165
+ super().__init__()
166
+ w_b = int(round(w_out * params["bot_mul"]))
167
+ w_se = int(round(w_in * params["se_r"]))
168
+ groups = w_b // params["group_w"]
169
+ self.a = conv2d(w_in, w_b, 1)
170
+ self.a_bn = get_norm(norm, w_b)
171
+ self.a_af = activation_class()
172
+ self.b = conv2d(w_b, w_b, 3, stride=stride, groups=groups)
173
+ self.b_bn = get_norm(norm, w_b)
174
+ self.b_af = activation_class()
175
+ self.se = SE(w_b, w_se, activation_class) if w_se else None
176
+ self.c = conv2d(w_b, w_out, 1)
177
+ self.c_bn = get_norm(norm, w_out)
178
+ self.c_bn.final_bn = True
179
+
180
+ def forward(self, x):
181
+ for layer in self.children():
182
+ x = layer(x)
183
+ return x
184
+
185
+
186
+ class ResBottleneckBlock(CNNBlockBase):
187
+ """Residual bottleneck block: x + f(x), f = bottleneck transform."""
188
+
189
+ def __init__(self, w_in, w_out, stride, norm, activation_class, params):
190
+ super().__init__(w_in, w_out, stride)
191
+ self.proj, self.bn = None, None
192
+ if (w_in != w_out) or (stride != 1):
193
+ self.proj = conv2d(w_in, w_out, 1, stride=stride)
194
+ self.bn = get_norm(norm, w_out)
195
+ self.f = BottleneckTransform(w_in, w_out, stride, norm, activation_class, params)
196
+ self.af = activation_class()
197
+
198
+ def forward(self, x):
199
+ x_p = self.bn(self.proj(x)) if self.proj else x
200
+ return self.af(x_p + self.f(x))
201
+
202
+
203
+ class AnyStage(nn.Module):
204
+ """AnyNet stage (sequence of blocks w/ the same output shape)."""
205
+
206
+ def __init__(self, w_in, w_out, stride, d, block_class, norm, activation_class, params):
207
+ super().__init__()
208
+ for i in range(d):
209
+ block = block_class(w_in, w_out, stride, norm, activation_class, params)
210
+ self.add_module("b{}".format(i + 1), block)
211
+ stride, w_in = 1, w_out
212
+
213
+ def forward(self, x):
214
+ for block in self.children():
215
+ x = block(x)
216
+ return x
217
+
218
+
219
+ class AnyNet(Backbone):
220
+ """AnyNet model. See :paper:`dds`."""
221
+
222
+ def __init__(
223
+ self,
224
+ *,
225
+ stem_class,
226
+ stem_width,
227
+ block_class,
228
+ depths,
229
+ widths,
230
+ group_widths,
231
+ strides,
232
+ bottleneck_ratios,
233
+ se_ratio,
234
+ activation_class,
235
+ freeze_at=0,
236
+ norm="BN",
237
+ out_features=None,
238
+ ):
239
+ """
240
+ Args:
241
+ stem_class (callable): A callable taking 4 arguments (channels in, channels out,
242
+ normalization, callable returning an activation function) that returns another
243
+ callable implementing the stem module.
244
+ stem_width (int): The number of output channels that the stem produces.
245
+ block_class (callable): A callable taking 6 arguments (channels in, channels out,
246
+ stride, normalization, callable returning an activation function, a dict of
247
+ block-specific parameters) that returns another callable implementing the repeated
248
+ block module.
249
+ depths (list[int]): Number of blocks in each stage.
250
+ widths (list[int]): For each stage, the number of output channels of each block.
251
+ group_widths (list[int]): For each stage, the number of channels per group in group
252
+ convolution, if the block uses group convolution.
253
+ strides (list[int]): The stride that each network stage applies to its input.
254
+ bottleneck_ratios (list[float]): For each stage, the ratio of the number of bottleneck
255
+ channels to the number of block input channels (or, equivalently, output channels),
256
+ if the block uses a bottleneck.
257
+ se_ratio (float): The ratio of the number of channels used inside the squeeze-excitation
258
+ (SE) module to it number of input channels, if SE the block uses SE.
259
+ activation_class (callable): A callable taking no arguments that returns another
260
+ callable implementing an activation function.
261
+ freeze_at (int): The number of stages at the beginning to freeze.
262
+ see :meth:`freeze` for detailed explanation.
263
+ norm (str or callable): normalization for all conv layers.
264
+ See :func:`layers.get_norm` for supported format.
265
+ out_features (list[str]): name of the layers whose outputs should
266
+ be returned in forward. RegNet's use "stem" and "s1", "s2", etc for the stages after
267
+ the stem. If None, will return the output of the last layer.
268
+ """
269
+ super().__init__()
270
+ self.stem = stem_class(3, stem_width, norm, activation_class)
271
+
272
+ current_stride = self.stem.stride
273
+ self._out_feature_strides = {"stem": current_stride}
274
+ self._out_feature_channels = {"stem": self.stem.out_channels}
275
+ self.stages_and_names = []
276
+ prev_w = stem_width
277
+
278
+ for i, (d, w, s, b, g) in enumerate(
279
+ zip(depths, widths, strides, bottleneck_ratios, group_widths)
280
+ ):
281
+ params = {"bot_mul": b, "group_w": g, "se_r": se_ratio}
282
+ stage = AnyStage(prev_w, w, s, d, block_class, norm, activation_class, params)
283
+ name = "s{}".format(i + 1)
284
+ self.add_module(name, stage)
285
+ self.stages_and_names.append((stage, name))
286
+ self._out_feature_strides[name] = current_stride = int(
287
+ current_stride * np.prod([k.stride for k in stage.children()])
288
+ )
289
+ self._out_feature_channels[name] = list(stage.children())[-1].out_channels
290
+ prev_w = w
291
+
292
+ self.apply(init_weights)
293
+
294
+ if out_features is None:
295
+ out_features = [name]
296
+ self._out_features = out_features
297
+ assert len(self._out_features)
298
+ children = [x[0] for x in self.named_children()]
299
+ for out_feature in self._out_features:
300
+ assert out_feature in children, "Available children: {} does not include {}".format(
301
+ ", ".join(children), out_feature
302
+ )
303
+ self.freeze(freeze_at)
304
+
305
+ def forward(self, x):
306
+ """
307
+ Args:
308
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
309
+
310
+ Returns:
311
+ dict[str->Tensor]: names and the corresponding features
312
+ """
313
+ assert x.dim() == 4, f"Model takes an input of shape (N, C, H, W). Got {x.shape} instead!"
314
+ outputs = {}
315
+ x = self.stem(x)
316
+ if "stem" in self._out_features:
317
+ outputs["stem"] = x
318
+ for stage, name in self.stages_and_names:
319
+ x = stage(x)
320
+ if name in self._out_features:
321
+ outputs[name] = x
322
+ return outputs
323
+
324
+ def output_shape(self):
325
+ return {
326
+ name: ShapeSpec(
327
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
328
+ )
329
+ for name in self._out_features
330
+ }
331
+
332
+ def freeze(self, freeze_at=0):
333
+ """
334
+ Freeze the first several stages of the model. Commonly used in fine-tuning.
335
+
336
+ Layers that produce the same feature map spatial size are defined as one
337
+ "stage" by :paper:`FPN`.
338
+
339
+ Args:
340
+ freeze_at (int): number of stages to freeze.
341
+ `1` means freezing the stem. `2` means freezing the stem and
342
+ one residual stage, etc.
343
+
344
+ Returns:
345
+ nn.Module: this model itself
346
+ """
347
+ if freeze_at >= 1:
348
+ self.stem.freeze()
349
+ for idx, (stage, _) in enumerate(self.stages_and_names, start=2):
350
+ if freeze_at >= idx:
351
+ for block in stage.children():
352
+ block.freeze()
353
+ return self
354
+
355
+
356
+ def adjust_block_compatibility(ws, bs, gs):
357
+ """Adjusts the compatibility of widths, bottlenecks, and groups."""
358
+ assert len(ws) == len(bs) == len(gs)
359
+ assert all(w > 0 and b > 0 and g > 0 for w, b, g in zip(ws, bs, gs))
360
+ vs = [int(max(1, w * b)) for w, b in zip(ws, bs)]
361
+ gs = [int(min(g, v)) for g, v in zip(gs, vs)]
362
+ ms = [np.lcm(g, b) if b > 1 else g for g, b in zip(gs, bs)]
363
+ vs = [max(m, int(round(v / m) * m)) for v, m in zip(vs, ms)]
364
+ ws = [int(v / b) for v, b in zip(vs, bs)]
365
+ assert all(w * b % g == 0 for w, b, g in zip(ws, bs, gs))
366
+ return ws, bs, gs
367
+
368
+
369
+ def generate_regnet_parameters(w_a, w_0, w_m, d, q=8):
370
+ """Generates per stage widths and depths from RegNet parameters."""
371
+ assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0
372
+ # Generate continuous per-block ws
373
+ ws_cont = np.arange(d) * w_a + w_0
374
+ # Generate quantized per-block ws
375
+ ks = np.round(np.log(ws_cont / w_0) / np.log(w_m))
376
+ ws_all = w_0 * np.power(w_m, ks)
377
+ ws_all = np.round(np.divide(ws_all, q)).astype(int) * q
378
+ # Generate per stage ws and ds (assumes ws_all are sorted)
379
+ ws, ds = np.unique(ws_all, return_counts=True)
380
+ # Compute number of actual stages and total possible stages
381
+ num_stages, total_stages = len(ws), ks.max() + 1
382
+ # Convert numpy arrays to lists and return
383
+ ws, ds, ws_all, ws_cont = (x.tolist() for x in (ws, ds, ws_all, ws_cont))
384
+ return ws, ds, num_stages, total_stages, ws_all, ws_cont
385
+
386
+
387
+ class RegNet(AnyNet):
388
+ """RegNet model. See :paper:`dds`."""
389
+
390
+ def __init__(
391
+ self,
392
+ *,
393
+ stem_class,
394
+ stem_width,
395
+ block_class,
396
+ depth,
397
+ w_a,
398
+ w_0,
399
+ w_m,
400
+ group_width,
401
+ stride=2,
402
+ bottleneck_ratio=1.0,
403
+ se_ratio=0.0,
404
+ activation_class=None,
405
+ freeze_at=0,
406
+ norm="BN",
407
+ out_features=None,
408
+ ):
409
+ """
410
+ Build a RegNet from the parameterization described in :paper:`dds` Section 3.3.
411
+
412
+ Args:
413
+ See :class:`AnyNet` for arguments that are not listed here.
414
+ depth (int): Total number of blocks in the RegNet.
415
+ w_a (float): Factor by which block width would increase prior to quantizing block widths
416
+ by stage. See :paper:`dds` Section 3.3.
417
+ w_0 (int): Initial block width. See :paper:`dds` Section 3.3.
418
+ w_m (float): Parameter controlling block width quantization.
419
+ See :paper:`dds` Section 3.3.
420
+ group_width (int): Number of channels per group in group convolution, if the block uses
421
+ group convolution.
422
+ bottleneck_ratio (float): The ratio of the number of bottleneck channels to the number
423
+ of block input channels (or, equivalently, output channels), if the block uses a
424
+ bottleneck.
425
+ stride (int): The stride that each network stage applies to its input.
426
+ """
427
+ ws, ds = generate_regnet_parameters(w_a, w_0, w_m, depth)[0:2]
428
+ ss = [stride for _ in ws]
429
+ bs = [bottleneck_ratio for _ in ws]
430
+ gs = [group_width for _ in ws]
431
+ ws, bs, gs = adjust_block_compatibility(ws, bs, gs)
432
+
433
+ def default_activation_class():
434
+ return nn.ReLU(inplace=True)
435
+
436
+ super().__init__(
437
+ stem_class=stem_class,
438
+ stem_width=stem_width,
439
+ block_class=block_class,
440
+ depths=ds,
441
+ widths=ws,
442
+ strides=ss,
443
+ group_widths=gs,
444
+ bottleneck_ratios=bs,
445
+ se_ratio=se_ratio,
446
+ activation_class=default_activation_class
447
+ if activation_class is None
448
+ else activation_class,
449
+ freeze_at=freeze_at,
450
+ norm=norm,
451
+ out_features=out_features,
452
+ )
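
For reference, the RegNet class above can be instantiated directly from the (depth, w_a, w_0, w_m, group_width) parameterization. The sketch below is not part of this commit; the parameter values are illustrative (roughly a RegNetX-4GF-style setting, an assumption rather than values taken from this diff) and the import path assumes the vendored layout.

```python
# Minimal sketch (assumptions: vendored import path; parameter values are illustrative).
import torch
from annotator.oneformer.detectron2.modeling.backbone.regnet import (
    RegNet, SimpleStem, ResBottleneckBlock,
)

backbone = RegNet(
    stem_class=SimpleStem,
    stem_width=32,
    block_class=ResBottleneckBlock,
    depth=23, w_a=38.65, w_0=96, w_m=2.43, group_width=40,  # illustrative RegNet params
    norm="BN",
    out_features=["s1", "s2", "s3", "s4"],
)

print(backbone.output_shape())  # per-stage channels and strides (4, 8, 16, 32)
feats = backbone(torch.randn(1, 3, 224, 224))
print({k: tuple(v.shape) for k, v in feats.items()})
```
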
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/resnet.py ADDED
@@ -0,0 +1,694 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import numpy as np
3
+ import fvcore.nn.weight_init as weight_init
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.layers import (
9
+ CNNBlockBase,
10
+ Conv2d,
11
+ DeformConv,
12
+ ModulatedDeformConv,
13
+ ShapeSpec,
14
+ get_norm,
15
+ )
16
+
17
+ from .backbone import Backbone
18
+ from .build import BACKBONE_REGISTRY
19
+
20
+ __all__ = [
21
+ "ResNetBlockBase",
22
+ "BasicBlock",
23
+ "BottleneckBlock",
24
+ "DeformBottleneckBlock",
25
+ "BasicStem",
26
+ "ResNet",
27
+ "make_stage",
28
+ "build_resnet_backbone",
29
+ ]
30
+
31
+
32
+ class BasicBlock(CNNBlockBase):
33
+ """
34
+ The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`,
35
+ with two 3x3 conv layers and a projection shortcut if needed.
36
+ """
37
+
38
+ def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"):
39
+ """
40
+ Args:
41
+ in_channels (int): Number of input channels.
42
+ out_channels (int): Number of output channels.
43
+ stride (int): Stride for the first conv.
44
+ norm (str or callable): normalization for all conv layers.
45
+ See :func:`layers.get_norm` for supported format.
46
+ """
47
+ super().__init__(in_channels, out_channels, stride)
48
+
49
+ if in_channels != out_channels:
50
+ self.shortcut = Conv2d(
51
+ in_channels,
52
+ out_channels,
53
+ kernel_size=1,
54
+ stride=stride,
55
+ bias=False,
56
+ norm=get_norm(norm, out_channels),
57
+ )
58
+ else:
59
+ self.shortcut = None
60
+
61
+ self.conv1 = Conv2d(
62
+ in_channels,
63
+ out_channels,
64
+ kernel_size=3,
65
+ stride=stride,
66
+ padding=1,
67
+ bias=False,
68
+ norm=get_norm(norm, out_channels),
69
+ )
70
+
71
+ self.conv2 = Conv2d(
72
+ out_channels,
73
+ out_channels,
74
+ kernel_size=3,
75
+ stride=1,
76
+ padding=1,
77
+ bias=False,
78
+ norm=get_norm(norm, out_channels),
79
+ )
80
+
81
+ for layer in [self.conv1, self.conv2, self.shortcut]:
82
+ if layer is not None: # shortcut can be None
83
+ weight_init.c2_msra_fill(layer)
84
+
85
+ def forward(self, x):
86
+ out = self.conv1(x)
87
+ out = F.relu_(out)
88
+ out = self.conv2(out)
89
+
90
+ if self.shortcut is not None:
91
+ shortcut = self.shortcut(x)
92
+ else:
93
+ shortcut = x
94
+
95
+ out += shortcut
96
+ out = F.relu_(out)
97
+ return out
98
+
99
+
100
+ class BottleneckBlock(CNNBlockBase):
101
+ """
102
+ The standard bottleneck residual block used by ResNet-50, 101 and 152
103
+ defined in :paper:`ResNet`. It contains 3 conv layers with kernels
104
+ 1x1, 3x3, 1x1, and a projection shortcut if needed.
105
+ """
106
+
107
+ def __init__(
108
+ self,
109
+ in_channels,
110
+ out_channels,
111
+ *,
112
+ bottleneck_channels,
113
+ stride=1,
114
+ num_groups=1,
115
+ norm="BN",
116
+ stride_in_1x1=False,
117
+ dilation=1,
118
+ ):
119
+ """
120
+ Args:
121
+ bottleneck_channels (int): number of output channels for the 3x3
122
+ "bottleneck" conv layers.
123
+ num_groups (int): number of groups for the 3x3 conv layer.
124
+ norm (str or callable): normalization for all conv layers.
125
+ See :func:`layers.get_norm` for supported format.
126
+ stride_in_1x1 (bool): when stride>1, whether to put stride in the
127
+ first 1x1 convolution or the bottleneck 3x3 convolution.
128
+ dilation (int): the dilation rate of the 3x3 conv layer.
129
+ """
130
+ super().__init__(in_channels, out_channels, stride)
131
+
132
+ if in_channels != out_channels:
133
+ self.shortcut = Conv2d(
134
+ in_channels,
135
+ out_channels,
136
+ kernel_size=1,
137
+ stride=stride,
138
+ bias=False,
139
+ norm=get_norm(norm, out_channels),
140
+ )
141
+ else:
142
+ self.shortcut = None
143
+
144
+ # The original MSRA ResNet models have stride in the first 1x1 conv
145
+ # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
146
+ # stride in the 3x3 conv
147
+ stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
148
+
149
+ self.conv1 = Conv2d(
150
+ in_channels,
151
+ bottleneck_channels,
152
+ kernel_size=1,
153
+ stride=stride_1x1,
154
+ bias=False,
155
+ norm=get_norm(norm, bottleneck_channels),
156
+ )
157
+
158
+ self.conv2 = Conv2d(
159
+ bottleneck_channels,
160
+ bottleneck_channels,
161
+ kernel_size=3,
162
+ stride=stride_3x3,
163
+ padding=1 * dilation,
164
+ bias=False,
165
+ groups=num_groups,
166
+ dilation=dilation,
167
+ norm=get_norm(norm, bottleneck_channels),
168
+ )
169
+
170
+ self.conv3 = Conv2d(
171
+ bottleneck_channels,
172
+ out_channels,
173
+ kernel_size=1,
174
+ bias=False,
175
+ norm=get_norm(norm, out_channels),
176
+ )
177
+
178
+ for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
179
+ if layer is not None: # shortcut can be None
180
+ weight_init.c2_msra_fill(layer)
181
+
182
+ # Zero-initialize the last normalization in each residual branch,
183
+ # so that at the beginning, the residual branch starts with zeros,
184
+ # and each residual block behaves like an identity.
185
+ # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
186
+ # "For BN layers, the learnable scaling coefficient γ is initialized
187
+ # to be 1, except for each residual block's last BN
188
+ # where γ is initialized to be 0."
189
+
190
+ # nn.init.constant_(self.conv3.norm.weight, 0)
191
+ # TODO this somehow hurts performance when training GN models from scratch.
192
+ # Add it as an option when we need to use this code to train a backbone.
193
+
194
+ def forward(self, x):
195
+ out = self.conv1(x)
196
+ out = F.relu_(out)
197
+
198
+ out = self.conv2(out)
199
+ out = F.relu_(out)
200
+
201
+ out = self.conv3(out)
202
+
203
+ if self.shortcut is not None:
204
+ shortcut = self.shortcut(x)
205
+ else:
206
+ shortcut = x
207
+
208
+ out += shortcut
209
+ out = F.relu_(out)
210
+ return out
211
+
212
+
213
+ class DeformBottleneckBlock(CNNBlockBase):
214
+ """
215
+ Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv <deformconv>`
216
+ in the 3x3 convolution.
217
+ """
218
+
219
+ def __init__(
220
+ self,
221
+ in_channels,
222
+ out_channels,
223
+ *,
224
+ bottleneck_channels,
225
+ stride=1,
226
+ num_groups=1,
227
+ norm="BN",
228
+ stride_in_1x1=False,
229
+ dilation=1,
230
+ deform_modulated=False,
231
+ deform_num_groups=1,
232
+ ):
233
+ super().__init__(in_channels, out_channels, stride)
234
+ self.deform_modulated = deform_modulated
235
+
236
+ if in_channels != out_channels:
237
+ self.shortcut = Conv2d(
238
+ in_channels,
239
+ out_channels,
240
+ kernel_size=1,
241
+ stride=stride,
242
+ bias=False,
243
+ norm=get_norm(norm, out_channels),
244
+ )
245
+ else:
246
+ self.shortcut = None
247
+
248
+ stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
249
+
250
+ self.conv1 = Conv2d(
251
+ in_channels,
252
+ bottleneck_channels,
253
+ kernel_size=1,
254
+ stride=stride_1x1,
255
+ bias=False,
256
+ norm=get_norm(norm, bottleneck_channels),
257
+ )
258
+
259
+ if deform_modulated:
260
+ deform_conv_op = ModulatedDeformConv
261
+ # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size
262
+ offset_channels = 27
263
+ else:
264
+ deform_conv_op = DeformConv
265
+ offset_channels = 18
266
+
267
+ self.conv2_offset = Conv2d(
268
+ bottleneck_channels,
269
+ offset_channels * deform_num_groups,
270
+ kernel_size=3,
271
+ stride=stride_3x3,
272
+ padding=1 * dilation,
273
+ dilation=dilation,
274
+ )
275
+ self.conv2 = deform_conv_op(
276
+ bottleneck_channels,
277
+ bottleneck_channels,
278
+ kernel_size=3,
279
+ stride=stride_3x3,
280
+ padding=1 * dilation,
281
+ bias=False,
282
+ groups=num_groups,
283
+ dilation=dilation,
284
+ deformable_groups=deform_num_groups,
285
+ norm=get_norm(norm, bottleneck_channels),
286
+ )
287
+
288
+ self.conv3 = Conv2d(
289
+ bottleneck_channels,
290
+ out_channels,
291
+ kernel_size=1,
292
+ bias=False,
293
+ norm=get_norm(norm, out_channels),
294
+ )
295
+
296
+ for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
297
+ if layer is not None: # shortcut can be None
298
+ weight_init.c2_msra_fill(layer)
299
+
300
+ nn.init.constant_(self.conv2_offset.weight, 0)
301
+ nn.init.constant_(self.conv2_offset.bias, 0)
302
+
303
+ def forward(self, x):
304
+ out = self.conv1(x)
305
+ out = F.relu_(out)
306
+
307
+ if self.deform_modulated:
308
+ offset_mask = self.conv2_offset(out)
309
+ offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1)
310
+ offset = torch.cat((offset_x, offset_y), dim=1)
311
+ mask = mask.sigmoid()
312
+ out = self.conv2(out, offset, mask)
313
+ else:
314
+ offset = self.conv2_offset(out)
315
+ out = self.conv2(out, offset)
316
+ out = F.relu_(out)
317
+
318
+ out = self.conv3(out)
319
+
320
+ if self.shortcut is not None:
321
+ shortcut = self.shortcut(x)
322
+ else:
323
+ shortcut = x
324
+
325
+ out += shortcut
326
+ out = F.relu_(out)
327
+ return out
328
+
329
+
330
+ class BasicStem(CNNBlockBase):
331
+ """
332
+ The standard ResNet stem (layers before the first residual block),
333
+ with a conv, relu and max_pool.
334
+ """
335
+
336
+ def __init__(self, in_channels=3, out_channels=64, norm="BN"):
337
+ """
338
+ Args:
339
+ norm (str or callable): norm after the first conv layer.
340
+ See :func:`layers.get_norm` for supported format.
341
+ """
342
+ super().__init__(in_channels, out_channels, 4)
343
+ self.in_channels = in_channels
344
+ self.conv1 = Conv2d(
345
+ in_channels,
346
+ out_channels,
347
+ kernel_size=7,
348
+ stride=2,
349
+ padding=3,
350
+ bias=False,
351
+ norm=get_norm(norm, out_channels),
352
+ )
353
+ weight_init.c2_msra_fill(self.conv1)
354
+
355
+ def forward(self, x):
356
+ x = self.conv1(x)
357
+ x = F.relu_(x)
358
+ x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
359
+ return x
360
+
361
+
362
+ class ResNet(Backbone):
363
+ """
364
+ Implement :paper:`ResNet`.
365
+ """
366
+
367
+ def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0):
368
+ """
369
+ Args:
370
+ stem (nn.Module): a stem module
371
+ stages (list[list[CNNBlockBase]]): several (typically 4) stages,
372
+ each contains multiple :class:`CNNBlockBase`.
373
+ num_classes (None or int): if None, will not perform classification.
374
+ Otherwise, will create a linear layer.
375
+ out_features (list[str]): name of the layers whose outputs should
376
+ be returned in forward. Can be anything in "stem", "linear", or "res2" ...
377
+ If None, will return the output of the last layer.
378
+ freeze_at (int): The number of stages at the beginning to freeze.
379
+ see :meth:`freeze` for detailed explanation.
380
+ """
381
+ super().__init__()
382
+ self.stem = stem
383
+ self.num_classes = num_classes
384
+
385
+ current_stride = self.stem.stride
386
+ self._out_feature_strides = {"stem": current_stride}
387
+ self._out_feature_channels = {"stem": self.stem.out_channels}
388
+
389
+ self.stage_names, self.stages = [], []
390
+
391
+ if out_features is not None:
392
+ # Avoid keeping unused layers in this module. They consume extra memory
393
+ # and may cause allreduce to fail
394
+ num_stages = max(
395
+ [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features]
396
+ )
397
+ stages = stages[:num_stages]
398
+ for i, blocks in enumerate(stages):
399
+ assert len(blocks) > 0, len(blocks)
400
+ for block in blocks:
401
+ assert isinstance(block, CNNBlockBase), block
402
+
403
+ name = "res" + str(i + 2)
404
+ stage = nn.Sequential(*blocks)
405
+
406
+ self.add_module(name, stage)
407
+ self.stage_names.append(name)
408
+ self.stages.append(stage)
409
+
410
+ self._out_feature_strides[name] = current_stride = int(
411
+ current_stride * np.prod([k.stride for k in blocks])
412
+ )
413
+ self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
414
+ self.stage_names = tuple(self.stage_names) # Make it static for scripting
415
+
416
+ if num_classes is not None:
417
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
418
+ self.linear = nn.Linear(curr_channels, num_classes)
419
+
420
+ # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
421
+ # "The 1000-way fully-connected layer is initialized by
422
+ # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
423
+ nn.init.normal_(self.linear.weight, std=0.01)
424
+ name = "linear"
425
+
426
+ if out_features is None:
427
+ out_features = [name]
428
+ self._out_features = out_features
429
+ assert len(self._out_features)
430
+ children = [x[0] for x in self.named_children()]
431
+ for out_feature in self._out_features:
432
+ assert out_feature in children, "Available children: {}".format(", ".join(children))
433
+ self.freeze(freeze_at)
434
+
435
+ def forward(self, x):
436
+ """
437
+ Args:
438
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
439
+
440
+ Returns:
441
+ dict[str->Tensor]: names and the corresponding features
442
+ """
443
+ assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
444
+ outputs = {}
445
+ x = self.stem(x)
446
+ if "stem" in self._out_features:
447
+ outputs["stem"] = x
448
+ for name, stage in zip(self.stage_names, self.stages):
449
+ x = stage(x)
450
+ if name in self._out_features:
451
+ outputs[name] = x
452
+ if self.num_classes is not None:
453
+ x = self.avgpool(x)
454
+ x = torch.flatten(x, 1)
455
+ x = self.linear(x)
456
+ if "linear" in self._out_features:
457
+ outputs["linear"] = x
458
+ return outputs
459
+
460
+ def output_shape(self):
461
+ return {
462
+ name: ShapeSpec(
463
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
464
+ )
465
+ for name in self._out_features
466
+ }
467
+
468
+ def freeze(self, freeze_at=0):
469
+ """
470
+ Freeze the first several stages of the ResNet. Commonly used in
471
+ fine-tuning.
472
+
473
+ Layers that produce the same feature map spatial size are defined as one
474
+ "stage" by :paper:`FPN`.
475
+
476
+ Args:
477
+ freeze_at (int): number of stages to freeze.
478
+ `1` means freezing the stem. `2` means freezing the stem and
479
+ one residual stage, etc.
480
+
481
+ Returns:
482
+ nn.Module: this ResNet itself
483
+ """
484
+ if freeze_at >= 1:
485
+ self.stem.freeze()
486
+ for idx, stage in enumerate(self.stages, start=2):
487
+ if freeze_at >= idx:
488
+ for block in stage.children():
489
+ block.freeze()
490
+ return self
491
+
492
+ @staticmethod
493
+ def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
494
+ """
495
+ Create a list of blocks of the same type that forms one ResNet stage.
496
+
497
+ Args:
498
+ block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
499
+ stage. A module of this type must not change spatial resolution of inputs unless its
500
+ stride != 1.
501
+ num_blocks (int): number of blocks in this stage
502
+ in_channels (int): input channels of the entire stage.
503
+ out_channels (int): output channels of **every block** in the stage.
504
+ kwargs: other arguments passed to the constructor of
505
+ `block_class`. If the argument name is "xx_per_block", the
506
+ argument is a list of values to be passed to each block in the
507
+ stage. Otherwise, the same argument is passed to every block
508
+ in the stage.
509
+
510
+ Returns:
511
+ list[CNNBlockBase]: a list of block module.
512
+
513
+ Examples:
514
+ ::
515
+ stage = ResNet.make_stage(
516
+ BottleneckBlock, 3, in_channels=16, out_channels=64,
517
+ bottleneck_channels=16, num_groups=1,
518
+ stride_per_block=[2, 1, 1],
519
+ dilations_per_block=[1, 1, 2]
520
+ )
521
+
522
+ Usually, layers that produce the same feature map spatial size are defined as one
523
+ "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
524
+ all be 1.
525
+ """
526
+ blocks = []
527
+ for i in range(num_blocks):
528
+ curr_kwargs = {}
529
+ for k, v in kwargs.items():
530
+ if k.endswith("_per_block"):
531
+ assert len(v) == num_blocks, (
532
+ f"Argument '{k}' of make_stage should have the "
533
+ f"same length as num_blocks={num_blocks}."
534
+ )
535
+ newk = k[: -len("_per_block")]
536
+ assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
537
+ curr_kwargs[newk] = v[i]
538
+ else:
539
+ curr_kwargs[k] = v
540
+
541
+ blocks.append(
542
+ block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
543
+ )
544
+ in_channels = out_channels
545
+ return blocks
546
+
547
+ @staticmethod
548
+ def make_default_stages(depth, block_class=None, **kwargs):
549
+ """
550
+ Create a list of ResNet stages from a pre-defined depth (one of 18, 34, 50, 101, 152).
551
+ If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
552
+ instead for fine-grained customization.
553
+
554
+ Args:
555
+ depth (int): depth of ResNet
556
+ block_class (type): the CNN block class. Has to accept
557
+ `bottleneck_channels` argument for depth > 50.
558
+ By default it is BasicBlock or BottleneckBlock, based on the
559
+ depth.
560
+ kwargs:
561
+ other arguments to pass to `make_stage`. Should not contain
562
+ stride and channels, as they are predefined for each depth.
563
+
564
+ Returns:
565
+ list[list[CNNBlockBase]]: modules in all stages; see arguments of
566
+ :class:`ResNet.__init__`.
567
+ """
568
+ num_blocks_per_stage = {
569
+ 18: [2, 2, 2, 2],
570
+ 34: [3, 4, 6, 3],
571
+ 50: [3, 4, 6, 3],
572
+ 101: [3, 4, 23, 3],
573
+ 152: [3, 8, 36, 3],
574
+ }[depth]
575
+ if block_class is None:
576
+ block_class = BasicBlock if depth < 50 else BottleneckBlock
577
+ if depth < 50:
578
+ in_channels = [64, 64, 128, 256]
579
+ out_channels = [64, 128, 256, 512]
580
+ else:
581
+ in_channels = [64, 256, 512, 1024]
582
+ out_channels = [256, 512, 1024, 2048]
583
+ ret = []
584
+ for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
585
+ if depth >= 50:
586
+ kwargs["bottleneck_channels"] = o // 4
587
+ ret.append(
588
+ ResNet.make_stage(
589
+ block_class=block_class,
590
+ num_blocks=n,
591
+ stride_per_block=[s] + [1] * (n - 1),
592
+ in_channels=i,
593
+ out_channels=o,
594
+ **kwargs,
595
+ )
596
+ )
597
+ return ret
598
+
599
+
600
+ ResNetBlockBase = CNNBlockBase
601
+ """
602
+ Alias for backward compatibility.
603
+ """
604
+
605
+
606
+ def make_stage(*args, **kwargs):
607
+ """
608
+ Deprecated alias for backward compatibility.
609
+ """
610
+ return ResNet.make_stage(*args, **kwargs)
611
+
612
+
613
+ @BACKBONE_REGISTRY.register()
614
+ def build_resnet_backbone(cfg, input_shape):
615
+ """
616
+ Create a ResNet instance from config.
617
+
618
+ Returns:
619
+ ResNet: a :class:`ResNet` instance.
620
+ """
621
+ # need registration of new blocks/stems?
622
+ norm = cfg.MODEL.RESNETS.NORM
623
+ stem = BasicStem(
624
+ in_channels=input_shape.channels,
625
+ out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS,
626
+ norm=norm,
627
+ )
628
+
629
+ # fmt: off
630
+ freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT
631
+ out_features = cfg.MODEL.RESNETS.OUT_FEATURES
632
+ depth = cfg.MODEL.RESNETS.DEPTH
633
+ num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
634
+ width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
635
+ bottleneck_channels = num_groups * width_per_group
636
+ in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS
637
+ out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
638
+ stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1
639
+ res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION
640
+ deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE
641
+ deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED
642
+ deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS
643
+ # fmt: on
644
+ assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation)
645
+
646
+ num_blocks_per_stage = {
647
+ 18: [2, 2, 2, 2],
648
+ 34: [3, 4, 6, 3],
649
+ 50: [3, 4, 6, 3],
650
+ 101: [3, 4, 23, 3],
651
+ 152: [3, 8, 36, 3],
652
+ }[depth]
653
+
654
+ if depth in [18, 34]:
655
+ assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34"
656
+ assert not any(
657
+ deform_on_per_stage
658
+ ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34"
659
+ assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34"
660
+ assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34"
661
+
662
+ stages = []
663
+
664
+ for idx, stage_idx in enumerate(range(2, 6)):
665
+ # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper
666
+ dilation = res5_dilation if stage_idx == 5 else 1
667
+ first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2
668
+ stage_kargs = {
669
+ "num_blocks": num_blocks_per_stage[idx],
670
+ "stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1),
671
+ "in_channels": in_channels,
672
+ "out_channels": out_channels,
673
+ "norm": norm,
674
+ }
675
+ # Use BasicBlock for R18 and R34.
676
+ if depth in [18, 34]:
677
+ stage_kargs["block_class"] = BasicBlock
678
+ else:
679
+ stage_kargs["bottleneck_channels"] = bottleneck_channels
680
+ stage_kargs["stride_in_1x1"] = stride_in_1x1
681
+ stage_kargs["dilation"] = dilation
682
+ stage_kargs["num_groups"] = num_groups
683
+ if deform_on_per_stage[idx]:
684
+ stage_kargs["block_class"] = DeformBottleneckBlock
685
+ stage_kargs["deform_modulated"] = deform_modulated
686
+ stage_kargs["deform_num_groups"] = deform_num_groups
687
+ else:
688
+ stage_kargs["block_class"] = BottleneckBlock
689
+ blocks = ResNet.make_stage(**stage_kargs)
690
+ in_channels = out_channels
691
+ out_channels *= 2
692
+ bottleneck_channels *= 2
693
+ stages.append(blocks)
694
+ return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at)
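A minimal usage sketch of the stage helpers and builder above, assuming this vendored module is importable as annotator.oneformer.detectron2.modeling.backbone.resnet (the path is inferred from the file location) and that BasicStem and ResNet come from this same file; the model below is randomly initialized:

import torch
from annotator.oneformer.detectron2.modeling.backbone.resnet import BasicStem, ResNet

# Build a plain R-50: make_default_stages picks BottleneckBlock and the
# per-stage strides/channels; extra kwargs (here: norm) reach every block.
stem = BasicStem(in_channels=3, out_channels=64, norm="BN")
stages = ResNet.make_default_stages(depth=50, norm="BN")
model = ResNet(stem, stages, out_features=["res2", "res3", "res4", "res5"])

feats = model(torch.randn(1, 3, 224, 224))
print({k: tuple(v.shape) for k, v in feats.items()})
# expected: res2 (1, 256, 56, 56) ... res5 (1, 2048, 7, 7)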
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/swin.py ADDED
@@ -0,0 +1,695 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ """
3
+ Implementation of Swin models from :paper:`swin`.
4
+
5
+ This code is adapted from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py with minimal modifications. # noqa
6
+ --------------------------------------------------------
7
+ Swin Transformer
8
+ Copyright (c) 2021 Microsoft
9
+ Licensed under The MIT License [see LICENSE for details]
10
+ Written by Ze Liu, Yutong Lin, Yixuan Wei
11
+ --------------------------------------------------------
12
+ LICENSE: https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/461e003166a8083d0b620beacd4662a2df306bd6/LICENSE
13
+ """
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn as nn
18
+ import torch.nn.functional as F
19
+ import torch.utils.checkpoint as checkpoint
20
+
21
+ from annotator.oneformer.detectron2.modeling.backbone.backbone import Backbone
22
+
23
+ _to_2tuple = nn.modules.utils._ntuple(2)
24
+
25
+
26
+ class Mlp(nn.Module):
27
+ """Multilayer perceptron."""
28
+
29
+ def __init__(
30
+ self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0
31
+ ):
32
+ super().__init__()
33
+ out_features = out_features or in_features
34
+ hidden_features = hidden_features or in_features
35
+ self.fc1 = nn.Linear(in_features, hidden_features)
36
+ self.act = act_layer()
37
+ self.fc2 = nn.Linear(hidden_features, out_features)
38
+ self.drop = nn.Dropout(drop)
39
+
40
+ def forward(self, x):
41
+ x = self.fc1(x)
42
+ x = self.act(x)
43
+ x = self.drop(x)
44
+ x = self.fc2(x)
45
+ x = self.drop(x)
46
+ return x
47
+
48
+
49
+ def window_partition(x, window_size):
50
+ """
51
+ Args:
52
+ x: (B, H, W, C)
53
+ window_size (int): window size
54
+ Returns:
55
+ windows: (num_windows*B, window_size, window_size, C)
56
+ """
57
+ B, H, W, C = x.shape
58
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
59
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
60
+ return windows
61
+
62
+
63
+ def window_reverse(windows, window_size, H, W):
64
+ """
65
+ Args:
66
+ windows: (num_windows*B, window_size, window_size, C)
67
+ window_size (int): Window size
68
+ H (int): Height of image
69
+ W (int): Width of image
70
+ Returns:
71
+ x: (B, H, W, C)
72
+ """
73
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
74
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
75
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
76
+ return x
77
+
78
+
79
+ class WindowAttention(nn.Module):
80
+ """Window based multi-head self attention (W-MSA) module with relative position bias.
81
+ It supports both shifted and non-shifted windows.
82
+ Args:
83
+ dim (int): Number of input channels.
84
+ window_size (tuple[int]): The height and width of the window.
85
+ num_heads (int): Number of attention heads.
86
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value.
87
+ Default: True
88
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
89
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
90
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
91
+ """
92
+
93
+ def __init__(
94
+ self,
95
+ dim,
96
+ window_size,
97
+ num_heads,
98
+ qkv_bias=True,
99
+ qk_scale=None,
100
+ attn_drop=0.0,
101
+ proj_drop=0.0,
102
+ ):
103
+
104
+ super().__init__()
105
+ self.dim = dim
106
+ self.window_size = window_size # Wh, Ww
107
+ self.num_heads = num_heads
108
+ head_dim = dim // num_heads
109
+ self.scale = qk_scale or head_dim**-0.5
110
+
111
+ # define a parameter table of relative position bias
112
+ self.relative_position_bias_table = nn.Parameter(
113
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
114
+ ) # 2*Wh-1 * 2*Ww-1, nH
115
+
116
+ # get pair-wise relative position index for each token inside the window
117
+ coords_h = torch.arange(self.window_size[0])
118
+ coords_w = torch.arange(self.window_size[1])
119
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
120
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
121
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
122
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
123
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
124
+ relative_coords[:, :, 1] += self.window_size[1] - 1
125
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
126
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
127
+ self.register_buffer("relative_position_index", relative_position_index)
128
+
129
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
130
+ self.attn_drop = nn.Dropout(attn_drop)
131
+ self.proj = nn.Linear(dim, dim)
132
+ self.proj_drop = nn.Dropout(proj_drop)
133
+
134
+ nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)
135
+ self.softmax = nn.Softmax(dim=-1)
136
+
137
+ def forward(self, x, mask=None):
138
+ """Forward function.
139
+ Args:
140
+ x: input features with shape of (num_windows*B, N, C)
141
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
142
+ """
143
+ B_, N, C = x.shape
144
+ qkv = (
145
+ self.qkv(x)
146
+ .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
147
+ .permute(2, 0, 3, 1, 4)
148
+ )
149
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
150
+
151
+ q = q * self.scale
152
+ attn = q @ k.transpose(-2, -1)
153
+
154
+ relative_position_bias = self.relative_position_bias_table[
155
+ self.relative_position_index.view(-1)
156
+ ].view(
157
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
158
+ ) # Wh*Ww,Wh*Ww,nH
159
+ relative_position_bias = relative_position_bias.permute(
160
+ 2, 0, 1
161
+ ).contiguous() # nH, Wh*Ww, Wh*Ww
162
+ attn = attn + relative_position_bias.unsqueeze(0)
163
+
164
+ if mask is not None:
165
+ nW = mask.shape[0]
166
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
167
+ attn = attn.view(-1, self.num_heads, N, N)
168
+ attn = self.softmax(attn)
169
+ else:
170
+ attn = self.softmax(attn)
171
+
172
+ attn = self.attn_drop(attn)
173
+
174
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
175
+ x = self.proj(x)
176
+ x = self.proj_drop(x)
177
+ return x
178
+
179
+
180
+ class SwinTransformerBlock(nn.Module):
181
+ """Swin Transformer Block.
182
+ Args:
183
+ dim (int): Number of input channels.
184
+ num_heads (int): Number of attention heads.
185
+ window_size (int): Window size.
186
+ shift_size (int): Shift size for SW-MSA.
187
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
188
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
189
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
190
+ drop (float, optional): Dropout rate. Default: 0.0
191
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
192
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
193
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
194
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
195
+ """
196
+
197
+ def __init__(
198
+ self,
199
+ dim,
200
+ num_heads,
201
+ window_size=7,
202
+ shift_size=0,
203
+ mlp_ratio=4.0,
204
+ qkv_bias=True,
205
+ qk_scale=None,
206
+ drop=0.0,
207
+ attn_drop=0.0,
208
+ drop_path=0.0,
209
+ act_layer=nn.GELU,
210
+ norm_layer=nn.LayerNorm,
211
+ ):
212
+ super().__init__()
213
+ self.dim = dim
214
+ self.num_heads = num_heads
215
+ self.window_size = window_size
216
+ self.shift_size = shift_size
217
+ self.mlp_ratio = mlp_ratio
218
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
219
+
220
+ self.norm1 = norm_layer(dim)
221
+ self.attn = WindowAttention(
222
+ dim,
223
+ window_size=_to_2tuple(self.window_size),
224
+ num_heads=num_heads,
225
+ qkv_bias=qkv_bias,
226
+ qk_scale=qk_scale,
227
+ attn_drop=attn_drop,
228
+ proj_drop=drop,
229
+ )
230
+
231
+ if drop_path > 0.0:
232
+ from timm.models.layers import DropPath
233
+
234
+ self.drop_path = DropPath(drop_path)
235
+ else:
236
+ self.drop_path = nn.Identity()
237
+ self.norm2 = norm_layer(dim)
238
+ mlp_hidden_dim = int(dim * mlp_ratio)
239
+ self.mlp = Mlp(
240
+ in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop
241
+ )
242
+
243
+ self.H = None
244
+ self.W = None
245
+
246
+ def forward(self, x, mask_matrix):
247
+ """Forward function.
248
+ Args:
249
+ x: Input feature, tensor size (B, H*W, C).
250
+ H, W: Spatial resolution of the input feature (read from self.H / self.W, set by the enclosing BasicLayer).
251
+ mask_matrix: Attention mask for cyclic shift.
252
+ """
253
+ B, L, C = x.shape
254
+ H, W = self.H, self.W
255
+ assert L == H * W, "input feature has wrong size"
256
+
257
+ shortcut = x
258
+ x = self.norm1(x)
259
+ x = x.view(B, H, W, C)
260
+
261
+ # pad feature maps to multiples of window size
262
+ pad_l = pad_t = 0
263
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
264
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
265
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
266
+ _, Hp, Wp, _ = x.shape
267
+
268
+ # cyclic shift
269
+ if self.shift_size > 0:
270
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
271
+ attn_mask = mask_matrix
272
+ else:
273
+ shifted_x = x
274
+ attn_mask = None
275
+
276
+ # partition windows
277
+ x_windows = window_partition(
278
+ shifted_x, self.window_size
279
+ ) # nW*B, window_size, window_size, C
280
+ x_windows = x_windows.view(
281
+ -1, self.window_size * self.window_size, C
282
+ ) # nW*B, window_size*window_size, C
283
+
284
+ # W-MSA/SW-MSA
285
+ attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
286
+
287
+ # merge windows
288
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
289
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
290
+
291
+ # reverse cyclic shift
292
+ if self.shift_size > 0:
293
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
294
+ else:
295
+ x = shifted_x
296
+
297
+ if pad_r > 0 or pad_b > 0:
298
+ x = x[:, :H, :W, :].contiguous()
299
+
300
+ x = x.view(B, H * W, C)
301
+
302
+ # FFN
303
+ x = shortcut + self.drop_path(x)
304
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
305
+
306
+ return x
307
+
308
+
309
+ class PatchMerging(nn.Module):
310
+ """Patch Merging Layer
311
+ Args:
312
+ dim (int): Number of input channels.
313
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
314
+ """
315
+
316
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
317
+ super().__init__()
318
+ self.dim = dim
319
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
320
+ self.norm = norm_layer(4 * dim)
321
+
322
+ def forward(self, x, H, W):
323
+ """Forward function.
324
+ Args:
325
+ x: Input feature, tensor size (B, H*W, C).
326
+ H, W: Spatial resolution of the input feature.
327
+ """
328
+ B, L, C = x.shape
329
+ assert L == H * W, "input feature has wrong size"
330
+
331
+ x = x.view(B, H, W, C)
332
+
333
+ # padding
334
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
335
+ if pad_input:
336
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
337
+
338
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
339
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
340
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
341
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
342
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
343
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
344
+
345
+ x = self.norm(x)
346
+ x = self.reduction(x)
347
+
348
+ return x
349
+
350
+
351
+ class BasicLayer(nn.Module):
352
+ """A basic Swin Transformer layer for one stage.
353
+ Args:
354
+ dim (int): Number of feature channels
355
+ depth (int): Depth of this stage.
356
+ num_heads (int): Number of attention heads.
357
+ window_size (int): Local window size. Default: 7.
358
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
359
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
360
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
361
+ drop (float, optional): Dropout rate. Default: 0.0
362
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
363
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
364
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
365
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer.
366
+ Default: None
367
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
368
+ """
369
+
370
+ def __init__(
371
+ self,
372
+ dim,
373
+ depth,
374
+ num_heads,
375
+ window_size=7,
376
+ mlp_ratio=4.0,
377
+ qkv_bias=True,
378
+ qk_scale=None,
379
+ drop=0.0,
380
+ attn_drop=0.0,
381
+ drop_path=0.0,
382
+ norm_layer=nn.LayerNorm,
383
+ downsample=None,
384
+ use_checkpoint=False,
385
+ ):
386
+ super().__init__()
387
+ self.window_size = window_size
388
+ self.shift_size = window_size // 2
389
+ self.depth = depth
390
+ self.use_checkpoint = use_checkpoint
391
+
392
+ # build blocks
393
+ self.blocks = nn.ModuleList(
394
+ [
395
+ SwinTransformerBlock(
396
+ dim=dim,
397
+ num_heads=num_heads,
398
+ window_size=window_size,
399
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
400
+ mlp_ratio=mlp_ratio,
401
+ qkv_bias=qkv_bias,
402
+ qk_scale=qk_scale,
403
+ drop=drop,
404
+ attn_drop=attn_drop,
405
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
406
+ norm_layer=norm_layer,
407
+ )
408
+ for i in range(depth)
409
+ ]
410
+ )
411
+
412
+ # patch merging layer
413
+ if downsample is not None:
414
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
415
+ else:
416
+ self.downsample = None
417
+
418
+ def forward(self, x, H, W):
419
+ """Forward function.
420
+ Args:
421
+ x: Input feature, tensor size (B, H*W, C).
422
+ H, W: Spatial resolution of the input feature.
423
+ """
424
+
425
+ # calculate attention mask for SW-MSA
426
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
427
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
428
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
429
+ h_slices = (
430
+ slice(0, -self.window_size),
431
+ slice(-self.window_size, -self.shift_size),
432
+ slice(-self.shift_size, None),
433
+ )
434
+ w_slices = (
435
+ slice(0, -self.window_size),
436
+ slice(-self.window_size, -self.shift_size),
437
+ slice(-self.shift_size, None),
438
+ )
439
+ cnt = 0
440
+ for h in h_slices:
441
+ for w in w_slices:
442
+ img_mask[:, h, w, :] = cnt
443
+ cnt += 1
444
+
445
+ mask_windows = window_partition(
446
+ img_mask, self.window_size
447
+ ) # nW, window_size, window_size, 1
448
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
449
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
450
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
451
+ attn_mask == 0, float(0.0)
452
+ )
453
+
454
+ for blk in self.blocks:
455
+ blk.H, blk.W = H, W
456
+ if self.use_checkpoint:
457
+ x = checkpoint.checkpoint(blk, x, attn_mask)
458
+ else:
459
+ x = blk(x, attn_mask)
460
+ if self.downsample is not None:
461
+ x_down = self.downsample(x, H, W)
462
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
463
+ return x, H, W, x_down, Wh, Ww
464
+ else:
465
+ return x, H, W, x, H, W
466
+
467
+
468
+ class PatchEmbed(nn.Module):
469
+ """Image to Patch Embedding
470
+ Args:
471
+ patch_size (int): Patch token size. Default: 4.
472
+ in_chans (int): Number of input image channels. Default: 3.
473
+ embed_dim (int): Number of linear projection output channels. Default: 96.
474
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
475
+ """
476
+
477
+ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
478
+ super().__init__()
479
+ patch_size = _to_2tuple(patch_size)
480
+ self.patch_size = patch_size
481
+
482
+ self.in_chans = in_chans
483
+ self.embed_dim = embed_dim
484
+
485
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
486
+ if norm_layer is not None:
487
+ self.norm = norm_layer(embed_dim)
488
+ else:
489
+ self.norm = None
490
+
491
+ def forward(self, x):
492
+ """Forward function."""
493
+ # padding
494
+ _, _, H, W = x.size()
495
+ if W % self.patch_size[1] != 0:
496
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
497
+ if H % self.patch_size[0] != 0:
498
+ x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
499
+
500
+ x = self.proj(x) # B C Wh Ww
501
+ if self.norm is not None:
502
+ Wh, Ww = x.size(2), x.size(3)
503
+ x = x.flatten(2).transpose(1, 2)
504
+ x = self.norm(x)
505
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
506
+
507
+ return x
508
+
509
+
510
+ class SwinTransformer(Backbone):
511
+ """Swin Transformer backbone.
512
+ A PyTorch implementation of: `Swin Transformer: Hierarchical Vision Transformer using Shifted
513
+ Windows` - https://arxiv.org/pdf/2103.14030
514
+ Args:
515
+ pretrain_img_size (int): Input image size for training the pretrained model,
516
+ used in the absolute position embedding. Default 224.
517
+ patch_size (int | tuple(int)): Patch size. Default: 4.
518
+ in_chans (int): Number of input image channels. Default: 3.
519
+ embed_dim (int): Number of linear projection output channels. Default: 96.
520
+ depths (tuple[int]): Depths of each Swin Transformer stage.
521
+ num_heads (tuple[int]): Number of attention head of each stage.
522
+ window_size (int): Window size. Default: 7.
523
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
524
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
525
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
526
+ drop_rate (float): Dropout rate.
527
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
528
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
529
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
530
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
531
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
532
+ out_indices (Sequence[int]): Output from which stages.
533
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
534
+ -1 means not freezing any parameters.
535
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
536
+ """
537
+
538
+ def __init__(
539
+ self,
540
+ pretrain_img_size=224,
541
+ patch_size=4,
542
+ in_chans=3,
543
+ embed_dim=96,
544
+ depths=(2, 2, 6, 2),
545
+ num_heads=(3, 6, 12, 24),
546
+ window_size=7,
547
+ mlp_ratio=4.0,
548
+ qkv_bias=True,
549
+ qk_scale=None,
550
+ drop_rate=0.0,
551
+ attn_drop_rate=0.0,
552
+ drop_path_rate=0.2,
553
+ norm_layer=nn.LayerNorm,
554
+ ape=False,
555
+ patch_norm=True,
556
+ out_indices=(0, 1, 2, 3),
557
+ frozen_stages=-1,
558
+ use_checkpoint=False,
559
+ ):
560
+ super().__init__()
561
+
562
+ self.pretrain_img_size = pretrain_img_size
563
+ self.num_layers = len(depths)
564
+ self.embed_dim = embed_dim
565
+ self.ape = ape
566
+ self.patch_norm = patch_norm
567
+ self.out_indices = out_indices
568
+ self.frozen_stages = frozen_stages
569
+
570
+ # split image into non-overlapping patches
571
+ self.patch_embed = PatchEmbed(
572
+ patch_size=patch_size,
573
+ in_chans=in_chans,
574
+ embed_dim=embed_dim,
575
+ norm_layer=norm_layer if self.patch_norm else None,
576
+ )
577
+
578
+ # absolute position embedding
579
+ if self.ape:
580
+ pretrain_img_size = _to_2tuple(pretrain_img_size)
581
+ patch_size = _to_2tuple(patch_size)
582
+ patches_resolution = [
583
+ pretrain_img_size[0] // patch_size[0],
584
+ pretrain_img_size[1] // patch_size[1],
585
+ ]
586
+
587
+ self.absolute_pos_embed = nn.Parameter(
588
+ torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1])
589
+ )
590
+ nn.init.trunc_normal_(self.absolute_pos_embed, std=0.02)
591
+
592
+ self.pos_drop = nn.Dropout(p=drop_rate)
593
+
594
+ # stochastic depth
595
+ dpr = [
596
+ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
597
+ ] # stochastic depth decay rule
598
+
599
+ # build layers
600
+ self.layers = nn.ModuleList()
601
+ for i_layer in range(self.num_layers):
602
+ layer = BasicLayer(
603
+ dim=int(embed_dim * 2**i_layer),
604
+ depth=depths[i_layer],
605
+ num_heads=num_heads[i_layer],
606
+ window_size=window_size,
607
+ mlp_ratio=mlp_ratio,
608
+ qkv_bias=qkv_bias,
609
+ qk_scale=qk_scale,
610
+ drop=drop_rate,
611
+ attn_drop=attn_drop_rate,
612
+ drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
613
+ norm_layer=norm_layer,
614
+ downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
615
+ use_checkpoint=use_checkpoint,
616
+ )
617
+ self.layers.append(layer)
618
+
619
+ num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
620
+ self.num_features = num_features
621
+
622
+ # add a norm layer for each output
623
+ for i_layer in out_indices:
624
+ layer = norm_layer(num_features[i_layer])
625
+ layer_name = f"norm{i_layer}"
626
+ self.add_module(layer_name, layer)
627
+
628
+ self._freeze_stages()
629
+ self._out_features = ["p{}".format(i) for i in self.out_indices]
630
+ self._out_feature_channels = {
631
+ "p{}".format(i): self.embed_dim * 2**i for i in self.out_indices
632
+ }
633
+ self._out_feature_strides = {"p{}".format(i): 2 ** (i + 2) for i in self.out_indices}
634
+ self._size_divisibility = 32  # read by the size_divisibility property below
635
+
636
+ self.apply(self._init_weights)
637
+
638
+ def _freeze_stages(self):
639
+ if self.frozen_stages >= 0:
640
+ self.patch_embed.eval()
641
+ for param in self.patch_embed.parameters():
642
+ param.requires_grad = False
643
+
644
+ if self.frozen_stages >= 1 and self.ape:
645
+ self.absolute_pos_embed.requires_grad = False
646
+
647
+ if self.frozen_stages >= 2:
648
+ self.pos_drop.eval()
649
+ for i in range(0, self.frozen_stages - 1):
650
+ m = self.layers[i]
651
+ m.eval()
652
+ for param in m.parameters():
653
+ param.requires_grad = False
654
+
655
+ def _init_weights(self, m):
656
+ if isinstance(m, nn.Linear):
657
+ nn.init.trunc_normal_(m.weight, std=0.02)
658
+ if isinstance(m, nn.Linear) and m.bias is not None:
659
+ nn.init.constant_(m.bias, 0)
660
+ elif isinstance(m, nn.LayerNorm):
661
+ nn.init.constant_(m.bias, 0)
662
+ nn.init.constant_(m.weight, 1.0)
663
+
664
+ @property
665
+ def size_divisibility(self):
666
+ return self._size_divisibility
667
+
668
+ def forward(self, x):
669
+ """Forward function."""
670
+ x = self.patch_embed(x)
671
+
672
+ Wh, Ww = x.size(2), x.size(3)
673
+ if self.ape:
674
+ # interpolate the position embedding to the corresponding size
675
+ absolute_pos_embed = F.interpolate(
676
+ self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic"
677
+ )
678
+ x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
679
+ else:
680
+ x = x.flatten(2).transpose(1, 2)
681
+ x = self.pos_drop(x)
682
+
683
+ outs = {}
684
+ for i in range(self.num_layers):
685
+ layer = self.layers[i]
686
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
687
+
688
+ if i in self.out_indices:
689
+ norm_layer = getattr(self, f"norm{i}")
690
+ x_out = norm_layer(x_out)
691
+
692
+ out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
693
+ outs["p{}".format(i)] = out
694
+
695
+ return outs
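A short sketch of running the SwinTransformer backbone above on a dummy batch. The constructor defaults correspond to Swin-T; drop_path_rate is set to 0 so the optional timm import used for stochastic depth is not triggered, and the import path is inferred from the file location:

import torch
from annotator.oneformer.detectron2.modeling.backbone.swin import SwinTransformer

model = SwinTransformer(drop_path_rate=0.0).eval()   # Swin-T defaults, random weights
with torch.no_grad():
    outs = model(torch.randn(1, 3, 224, 224))
for name, feat in outs.items():
    print(name, tuple(feat.shape))
# p0 (1, 96, 56, 56), p1 (1, 192, 28, 28), p2 (1, 384, 14, 14), p3 (1, 768, 7, 7)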
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/utils.py ADDED
@@ -0,0 +1,186 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ __all__ = [
8
+ "window_partition",
9
+ "window_unpartition",
10
+ "add_decomposed_rel_pos",
11
+ "get_abs_pos",
12
+ "PatchEmbed",
13
+ ]
14
+
15
+
16
+ def window_partition(x, window_size):
17
+ """
18
+ Partition into non-overlapping windows with padding if needed.
19
+ Args:
20
+ x (tensor): input tokens with [B, H, W, C].
21
+ window_size (int): window size.
22
+
23
+ Returns:
24
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
25
+ (Hp, Wp): padded height and width before partition
26
+ """
27
+ B, H, W, C = x.shape
28
+
29
+ pad_h = (window_size - H % window_size) % window_size
30
+ pad_w = (window_size - W % window_size) % window_size
31
+ if pad_h > 0 or pad_w > 0:
32
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
33
+ Hp, Wp = H + pad_h, W + pad_w
34
+
35
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
36
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
37
+ return windows, (Hp, Wp)
38
+
39
+
40
+ def window_unpartition(windows, window_size, pad_hw, hw):
41
+ """
42
+ Window unpartition into original sequences and removing padding.
43
+ Args:
44
+ x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
45
+ window_size (int): window size.
46
+ pad_hw (Tuple): padded height and width (Hp, Wp).
47
+ hw (Tuple): original height and width (H, W) before padding.
48
+
49
+ Returns:
50
+ x: unpartitioned sequences with [B, H, W, C].
51
+ """
52
+ Hp, Wp = pad_hw
53
+ H, W = hw
54
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
55
+ x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
56
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
57
+
58
+ if Hp > H or Wp > W:
59
+ x = x[:, :H, :W, :].contiguous()
60
+ return x
61
+
62
+
63
+ def get_rel_pos(q_size, k_size, rel_pos):
64
+ """
65
+ Get relative positional embeddings according to the relative positions of
66
+ query and key sizes.
67
+ Args:
68
+ q_size (int): size of query q.
69
+ k_size (int): size of key k.
70
+ rel_pos (Tensor): relative position embeddings (L, C).
71
+
72
+ Returns:
73
+ Extracted positional embeddings according to relative positions.
74
+ """
75
+ max_rel_dist = int(2 * max(q_size, k_size) - 1)
76
+ # Interpolate rel pos if needed.
77
+ if rel_pos.shape[0] != max_rel_dist:
78
+ # Interpolate rel pos.
79
+ rel_pos_resized = F.interpolate(
80
+ rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
81
+ size=max_rel_dist,
82
+ mode="linear",
83
+ )
84
+ rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
85
+ else:
86
+ rel_pos_resized = rel_pos
87
+
88
+ # Scale the coords with short length if shapes for q and k are different.
89
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
90
+ k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
91
+ relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
92
+
93
+ return rel_pos_resized[relative_coords.long()]
94
+
95
+
96
+ def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
97
+ """
98
+ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
99
+ https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
100
+ Args:
101
+ attn (Tensor): attention map.
102
+ q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
103
+ rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
104
+ rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
105
+ q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
106
+ k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
107
+
108
+ Returns:
109
+ attn (Tensor): attention map with added relative positional embeddings.
110
+ """
111
+ q_h, q_w = q_size
112
+ k_h, k_w = k_size
113
+ Rh = get_rel_pos(q_h, k_h, rel_pos_h)
114
+ Rw = get_rel_pos(q_w, k_w, rel_pos_w)
115
+
116
+ B, _, dim = q.shape
117
+ r_q = q.reshape(B, q_h, q_w, dim)
118
+ rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
119
+ rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
120
+
121
+ attn = (
122
+ attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
123
+ ).view(B, q_h * q_w, k_h * k_w)
124
+
125
+ return attn
126
+
127
+
128
+ def get_abs_pos(abs_pos, has_cls_token, hw):
129
+ """
130
+ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
131
+ dimension for the original embeddings.
132
+ Args:
133
+ abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
134
+ has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
135
+ hw (Tuple): size of input image tokens.
136
+
137
+ Returns:
138
+ Absolute positional embeddings after processing with shape (1, H, W, C)
139
+ """
140
+ h, w = hw
141
+ if has_cls_token:
142
+ abs_pos = abs_pos[:, 1:]
143
+ xy_num = abs_pos.shape[1]
144
+ size = int(math.sqrt(xy_num))
145
+ assert size * size == xy_num
146
+
147
+ if size != h or size != w:
148
+ new_abs_pos = F.interpolate(
149
+ abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
150
+ size=(h, w),
151
+ mode="bicubic",
152
+ align_corners=False,
153
+ )
154
+
155
+ return new_abs_pos.permute(0, 2, 3, 1)
156
+ else:
157
+ return abs_pos.reshape(1, h, w, -1)
158
+
159
+
160
+ class PatchEmbed(nn.Module):
161
+ """
162
+ Image to Patch Embedding.
163
+ """
164
+
165
+ def __init__(
166
+ self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
167
+ ):
168
+ """
169
+ Args:
170
+ kernel_size (Tuple): kernel size of the projection layer.
171
+ stride (Tuple): stride of the projection layer.
172
+ padding (Tuple): padding size of the projection layer.
173
+ in_chans (int): Number of input image channels.
174
+ embed_dim (int): embed_dim (int): Patch embedding dimension.
175
+ """
176
+ super().__init__()
177
+
178
+ self.proj = nn.Conv2d(
179
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
180
+ )
181
+
182
+ def forward(self, x):
183
+ x = self.proj(x)
184
+ # B C H W -> B H W C
185
+ x = x.permute(0, 2, 3, 1)
186
+ return x
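A quick round-trip check of the padding-aware window helpers defined above; only torch is needed besides this module, and the import path is inferred from the file location:

import torch
from annotator.oneformer.detectron2.modeling.backbone.utils import window_partition, window_unpartition

x = torch.randn(2, 10, 13, 32)                    # (B, H, W, C); H and W are not multiples of 7
windows, pad_hw = window_partition(x, window_size=7)
print(windows.shape, pad_hw)                      # torch.Size([8, 7, 7, 32]) (14, 14)
y = window_unpartition(windows, 7, pad_hw, (10, 13))
assert torch.equal(x, y)                          # zero padding is cropped away, so the round trip is exact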
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/backbone/vit.py ADDED
@@ -0,0 +1,524 @@
1
+ import logging
2
+ import math
3
+ import fvcore.nn.weight_init as weight_init
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from annotator.oneformer.detectron2.layers import CNNBlockBase, Conv2d, get_norm
8
+ from annotator.oneformer.detectron2.modeling.backbone.fpn import _assert_strides_are_log2_contiguous
9
+
10
+ from .backbone import Backbone
11
+ from .utils import (
12
+ PatchEmbed,
13
+ add_decomposed_rel_pos,
14
+ get_abs_pos,
15
+ window_partition,
16
+ window_unpartition,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ __all__ = ["ViT", "SimpleFeaturePyramid", "get_vit_lr_decay_rate"]
23
+
24
+
25
+ class Attention(nn.Module):
26
+ """Multi-head Attention block with relative position embeddings."""
27
+
28
+ def __init__(
29
+ self,
30
+ dim,
31
+ num_heads=8,
32
+ qkv_bias=True,
33
+ use_rel_pos=False,
34
+ rel_pos_zero_init=True,
35
+ input_size=None,
36
+ ):
37
+ """
38
+ Args:
39
+ dim (int): Number of input channels.
40
+ num_heads (int): Number of attention heads.
41
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
42
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
43
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
44
+ input_size (int or None): Input resolution for calculating the relative positional
45
+ parameter size.
46
+ """
47
+ super().__init__()
48
+ self.num_heads = num_heads
49
+ head_dim = dim // num_heads
50
+ self.scale = head_dim**-0.5
51
+
52
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
53
+ self.proj = nn.Linear(dim, dim)
54
+
55
+ self.use_rel_pos = use_rel_pos
56
+ if self.use_rel_pos:
57
+ # initialize relative positional embeddings
58
+ self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
59
+ self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
60
+
61
+ if not rel_pos_zero_init:
62
+ nn.init.trunc_normal_(self.rel_pos_h, std=0.02)
63
+ nn.init.trunc_normal_(self.rel_pos_w, std=0.02)
64
+
65
+ def forward(self, x):
66
+ B, H, W, _ = x.shape
67
+ # qkv with shape (3, B, nHead, H * W, C)
68
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
69
+ # q, k, v with shape (B * nHead, H * W, C)
70
+ q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
71
+
72
+ attn = (q * self.scale) @ k.transpose(-2, -1)
73
+
74
+ if self.use_rel_pos:
75
+ attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
76
+
77
+ attn = attn.softmax(dim=-1)
78
+ x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
79
+ x = self.proj(x)
80
+
81
+ return x
82
+
83
+
84
+ class ResBottleneckBlock(CNNBlockBase):
85
+ """
86
+ The standard bottleneck residual block without the last activation layer.
87
+ It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
88
+ """
89
+
90
+ def __init__(
91
+ self,
92
+ in_channels,
93
+ out_channels,
94
+ bottleneck_channels,
95
+ norm="LN",
96
+ act_layer=nn.GELU,
97
+ ):
98
+ """
99
+ Args:
100
+ in_channels (int): Number of input channels.
101
+ out_channels (int): Number of output channels.
102
+ bottleneck_channels (int): number of output channels for the 3x3
103
+ "bottleneck" conv layers.
104
+ norm (str or callable): normalization for all conv layers.
105
+ See :func:`layers.get_norm` for supported format.
106
+ act_layer (callable): activation for all conv layers.
107
+ """
108
+ super().__init__(in_channels, out_channels, 1)
109
+
110
+ self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
111
+ self.norm1 = get_norm(norm, bottleneck_channels)
112
+ self.act1 = act_layer()
113
+
114
+ self.conv2 = Conv2d(
115
+ bottleneck_channels,
116
+ bottleneck_channels,
117
+ 3,
118
+ padding=1,
119
+ bias=False,
120
+ )
121
+ self.norm2 = get_norm(norm, bottleneck_channels)
122
+ self.act2 = act_layer()
123
+
124
+ self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
125
+ self.norm3 = get_norm(norm, out_channels)
126
+
127
+ for layer in [self.conv1, self.conv2, self.conv3]:
128
+ weight_init.c2_msra_fill(layer)
129
+ for layer in [self.norm1, self.norm2]:
130
+ layer.weight.data.fill_(1.0)
131
+ layer.bias.data.zero_()
132
+ # zero init last norm layer.
133
+ self.norm3.weight.data.zero_()
134
+ self.norm3.bias.data.zero_()
135
+
136
+ def forward(self, x):
137
+ out = x
138
+ for layer in self.children():
139
+ out = layer(out)
140
+
141
+ out = x + out
142
+ return out
143
+
144
+
145
+ class Block(nn.Module):
146
+ """Transformer blocks with support of window attention and residual propagation blocks"""
147
+
148
+ def __init__(
149
+ self,
150
+ dim,
151
+ num_heads,
152
+ mlp_ratio=4.0,
153
+ qkv_bias=True,
154
+ drop_path=0.0,
155
+ norm_layer=nn.LayerNorm,
156
+ act_layer=nn.GELU,
157
+ use_rel_pos=False,
158
+ rel_pos_zero_init=True,
159
+ window_size=0,
160
+ use_residual_block=False,
161
+ input_size=None,
162
+ ):
163
+ """
164
+ Args:
165
+ dim (int): Number of input channels.
166
+ num_heads (int): Number of attention heads in each ViT block.
167
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
168
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
169
+ drop_path (float): Stochastic depth rate.
170
+ norm_layer (nn.Module): Normalization layer.
171
+ act_layer (nn.Module): Activation layer.
172
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
173
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
174
+ window_size (int): Window size for window attention blocks. If it equals 0, window
175
+ attention is not used.
176
+ use_residual_block (bool): If True, use a residual block after the MLP block.
177
+ input_size (int or None): Input resolution for calculating the relative positional
178
+ parameter size.
179
+ """
180
+ super().__init__()
181
+ self.norm1 = norm_layer(dim)
182
+ self.attn = Attention(
183
+ dim,
184
+ num_heads=num_heads,
185
+ qkv_bias=qkv_bias,
186
+ use_rel_pos=use_rel_pos,
187
+ rel_pos_zero_init=rel_pos_zero_init,
188
+ input_size=input_size if window_size == 0 else (window_size, window_size),
189
+ )
190
+
191
+ from timm.models.layers import DropPath, Mlp
192
+
193
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
194
+ self.norm2 = norm_layer(dim)
195
+ self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer)
196
+
197
+ self.window_size = window_size
198
+
199
+ self.use_residual_block = use_residual_block
200
+ if use_residual_block:
201
+ # Use a residual block with bottleneck channel as dim // 2
202
+ self.residual = ResBottleneckBlock(
203
+ in_channels=dim,
204
+ out_channels=dim,
205
+ bottleneck_channels=dim // 2,
206
+ norm="LN",
207
+ act_layer=act_layer,
208
+ )
209
+
210
+ def forward(self, x):
211
+ shortcut = x
212
+ x = self.norm1(x)
213
+ # Window partition
214
+ if self.window_size > 0:
215
+ H, W = x.shape[1], x.shape[2]
216
+ x, pad_hw = window_partition(x, self.window_size)
217
+
218
+ x = self.attn(x)
219
+ # Reverse window partition
220
+ if self.window_size > 0:
221
+ x = window_unpartition(x, self.window_size, pad_hw, (H, W))
222
+
223
+ x = shortcut + self.drop_path(x)
224
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
225
+
226
+ if self.use_residual_block:
227
+ x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
228
+
229
+ return x
230
+
231
+
232
+ class ViT(Backbone):
233
+ """
234
+ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
235
+ "Exploring Plain Vision Transformer Backbones for Object Detection",
236
+ https://arxiv.org/abs/2203.16527
237
+ """
238
+
239
+ def __init__(
240
+ self,
241
+ img_size=1024,
242
+ patch_size=16,
243
+ in_chans=3,
244
+ embed_dim=768,
245
+ depth=12,
246
+ num_heads=12,
247
+ mlp_ratio=4.0,
248
+ qkv_bias=True,
249
+ drop_path_rate=0.0,
250
+ norm_layer=nn.LayerNorm,
251
+ act_layer=nn.GELU,
252
+ use_abs_pos=True,
253
+ use_rel_pos=False,
254
+ rel_pos_zero_init=True,
255
+ window_size=0,
256
+ window_block_indexes=(),
257
+ residual_block_indexes=(),
258
+ use_act_checkpoint=False,
259
+ pretrain_img_size=224,
260
+ pretrain_use_cls_token=True,
261
+ out_feature="last_feat",
262
+ ):
263
+ """
264
+ Args:
265
+ img_size (int): Input image size.
266
+ patch_size (int): Patch size.
267
+ in_chans (int): Number of input image channels.
268
+ embed_dim (int): Patch embedding dimension.
269
+ depth (int): Depth of ViT.
270
+ num_heads (int): Number of attention heads in each ViT block.
271
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
272
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
273
+ drop_path_rate (float): Stochastic depth rate.
274
+ norm_layer (nn.Module): Normalization layer.
275
+ act_layer (nn.Module): Activation layer.
276
+ use_abs_pos (bool): If True, use absolute positional embeddings.
277
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
278
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
279
+ window_size (int): Window size for window attention blocks.
280
+ window_block_indexes (list): Indexes for blocks using window attention.
281
+ residual_block_indexes (list): Indexes for blocks using conv propagation.
282
+ use_act_checkpoint (bool): If True, use activation checkpointing.
283
+ pretrain_img_size (int): input image size for pretraining models.
284
+ pretrain_use_cls_token (bool): If True, pretraining models use a class token.
285
+ out_feature (str): name of the feature from the last block.
286
+ """
287
+ super().__init__()
288
+ self.pretrain_use_cls_token = pretrain_use_cls_token
289
+
290
+ self.patch_embed = PatchEmbed(
291
+ kernel_size=(patch_size, patch_size),
292
+ stride=(patch_size, patch_size),
293
+ in_chans=in_chans,
294
+ embed_dim=embed_dim,
295
+ )
296
+
297
+ if use_abs_pos:
298
+ # Initialize absolute positional embedding with pretrain image size.
299
+ num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
300
+ num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
301
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
302
+ else:
303
+ self.pos_embed = None
304
+
305
+ # stochastic depth decay rule
306
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
307
+
308
+ self.blocks = nn.ModuleList()
309
+ for i in range(depth):
310
+ block = Block(
311
+ dim=embed_dim,
312
+ num_heads=num_heads,
313
+ mlp_ratio=mlp_ratio,
314
+ qkv_bias=qkv_bias,
315
+ drop_path=dpr[i],
316
+ norm_layer=norm_layer,
317
+ act_layer=act_layer,
318
+ use_rel_pos=use_rel_pos,
319
+ rel_pos_zero_init=rel_pos_zero_init,
320
+ window_size=window_size if i in window_block_indexes else 0,
321
+ use_residual_block=i in residual_block_indexes,
322
+ input_size=(img_size // patch_size, img_size // patch_size),
323
+ )
324
+ if use_act_checkpoint:
325
+ # TODO: use torch.utils.checkpoint
326
+ from fairscale.nn.checkpoint import checkpoint_wrapper
327
+
328
+ block = checkpoint_wrapper(block)
329
+ self.blocks.append(block)
330
+
331
+ self._out_feature_channels = {out_feature: embed_dim}
332
+ self._out_feature_strides = {out_feature: patch_size}
333
+ self._out_features = [out_feature]
334
+
335
+ if self.pos_embed is not None:
336
+ nn.init.trunc_normal_(self.pos_embed, std=0.02)
337
+
338
+ self.apply(self._init_weights)
339
+
340
+ def _init_weights(self, m):
341
+ if isinstance(m, nn.Linear):
342
+ nn.init.trunc_normal_(m.weight, std=0.02)
343
+ if isinstance(m, nn.Linear) and m.bias is not None:
344
+ nn.init.constant_(m.bias, 0)
345
+ elif isinstance(m, nn.LayerNorm):
346
+ nn.init.constant_(m.bias, 0)
347
+ nn.init.constant_(m.weight, 1.0)
348
+
349
+ def forward(self, x):
350
+ x = self.patch_embed(x)
351
+ if self.pos_embed is not None:
352
+ x = x + get_abs_pos(
353
+ self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
354
+ )
355
+
356
+ for blk in self.blocks:
357
+ x = blk(x)
358
+
359
+ outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)}
360
+ return outputs
361
+
362
+
363
+ class SimpleFeaturePyramid(Backbone):
364
+ """
365
+ This module implements SimpleFeaturePyramid in :paper:`vitdet`.
366
+ It creates pyramid features built on top of the input feature map.
367
+ """
368
+
369
+ def __init__(
370
+ self,
371
+ net,
372
+ in_feature,
373
+ out_channels,
374
+ scale_factors,
375
+ top_block=None,
376
+ norm="LN",
377
+ square_pad=0,
378
+ ):
379
+ """
380
+ Args:
381
+ net (Backbone): module representing the subnetwork backbone.
382
+ Must be a subclass of :class:`Backbone`.
383
+ in_feature (str): names of the input feature maps coming
384
+ from the net.
385
+ out_channels (int): number of channels in the output feature maps.
386
+ scale_factors (list[float]): list of scaling factors to upsample or downsample
387
+ the input features for creating pyramid features.
388
+ top_block (nn.Module or None): if provided, an extra operation will
389
+ be performed on the output of the last (smallest resolution)
390
+ pyramid output, and the result will extend the result list. The top_block
391
+ further downsamples the feature map. It must have an attribute
392
+ "num_levels", meaning the number of extra pyramid levels added by
393
+ this block, and "in_feature", which is a string representing
394
+ its input feature (e.g., p5).
395
+ norm (str): the normalization to use.
396
+ square_pad (int): If > 0, require input images to be padded to specific square size.
397
+ """
398
+ super(SimpleFeaturePyramid, self).__init__()
399
+ assert isinstance(net, Backbone)
400
+
401
+ self.scale_factors = scale_factors
402
+
403
+ input_shapes = net.output_shape()
404
+ strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors]
405
+ _assert_strides_are_log2_contiguous(strides)
406
+
407
+ dim = input_shapes[in_feature].channels
408
+ self.stages = []
409
+ use_bias = norm == ""
410
+ for idx, scale in enumerate(scale_factors):
411
+ out_dim = dim
412
+ if scale == 4.0:
413
+ layers = [
414
+ nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
415
+ get_norm(norm, dim // 2),
416
+ nn.GELU(),
417
+ nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
418
+ ]
419
+ out_dim = dim // 4
420
+ elif scale == 2.0:
421
+ layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)]
422
+ out_dim = dim // 2
423
+ elif scale == 1.0:
424
+ layers = []
425
+ elif scale == 0.5:
426
+ layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
427
+ else:
428
+ raise NotImplementedError(f"scale_factor={scale} is not supported yet.")
429
+
430
+ layers.extend(
431
+ [
432
+ Conv2d(
433
+ out_dim,
434
+ out_channels,
435
+ kernel_size=1,
436
+ bias=use_bias,
437
+ norm=get_norm(norm, out_channels),
438
+ ),
439
+ Conv2d(
440
+ out_channels,
441
+ out_channels,
442
+ kernel_size=3,
443
+ padding=1,
444
+ bias=use_bias,
445
+ norm=get_norm(norm, out_channels),
446
+ ),
447
+ ]
448
+ )
449
+ layers = nn.Sequential(*layers)
450
+
451
+ stage = int(math.log2(strides[idx]))
452
+ self.add_module(f"simfp_{stage}", layers)
453
+ self.stages.append(layers)
454
+
455
+ self.net = net
456
+ self.in_feature = in_feature
457
+ self.top_block = top_block
458
+ # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
459
+ self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
460
+ # top block output feature maps.
461
+ if self.top_block is not None:
462
+ for s in range(stage, stage + self.top_block.num_levels):
463
+ self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)
464
+
465
+ self._out_features = list(self._out_feature_strides.keys())
466
+ self._out_feature_channels = {k: out_channels for k in self._out_features}
467
+ self._size_divisibility = strides[-1]
468
+ self._square_pad = square_pad
469
+
470
+ @property
471
+ def padding_constraints(self):
472
+ return {
473
+ "size_divisiblity": self._size_divisibility,
474
+ "square_size": self._square_pad,
475
+ }
476
+
477
+ def forward(self, x):
478
+ """
479
+ Args:
480
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
481
+
482
+ Returns:
483
+ dict[str->Tensor]:
484
+ mapping from feature map name to pyramid feature map tensor
485
+ in high to low resolution order. Returned feature names follow the FPN
486
+ convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
487
+ ["p2", "p3", ..., "p6"].
488
+ """
489
+ bottom_up_features = self.net(x)
490
+ features = bottom_up_features[self.in_feature]
491
+ results = []
492
+
493
+ for stage in self.stages:
494
+ results.append(stage(features))
495
+
496
+ if self.top_block is not None:
497
+ if self.top_block.in_feature in bottom_up_features:
498
+ top_block_in_feature = bottom_up_features[self.top_block.in_feature]
499
+ else:
500
+ top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
501
+ results.extend(self.top_block(top_block_in_feature))
502
+ assert len(self._out_features) == len(results)
503
+ return {f: res for f, res in zip(self._out_features, results)}
504
+
505
+
506
+ def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
507
+ """
508
+ Calculate lr decay rate for different ViT blocks.
509
+ Args:
510
+ name (string): parameter name.
511
+ lr_decay_rate (float): base lr decay rate.
512
+ num_layers (int): number of ViT blocks.
513
+
514
+ Returns:
515
+ lr decay rate for the given parameter.
516
+ """
517
+ layer_id = num_layers + 1
518
+ if name.startswith("backbone"):
519
+ if ".pos_embed" in name or ".patch_embed" in name:
520
+ layer_id = 0
521
+ elif ".blocks." in name and ".residual." not in name:
522
+ layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
523
+
524
+ return lr_decay_rate ** (num_layers + 1 - layer_id)
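A small sketch exercising the ViT backbone and the layer-wise LR decay helper above. Block imports DropPath and Mlp from timm, so timm must be installed; the tiny hyperparameters and the "backbone." parameter names below are illustrative only, not ViTDet defaults, and the import path is inferred from the file location:

import torch
from annotator.oneformer.detectron2.modeling.backbone.vit import ViT, get_vit_lr_decay_rate

vit = ViT(img_size=256, patch_size=16, embed_dim=192, depth=4, num_heads=3,
          window_size=8, window_block_indexes=(0, 1, 2))
feat = vit(torch.randn(1, 3, 256, 256))["last_feat"]
print(tuple(feat.shape))                          # (1, 192, 16, 16): one stride-16 feature map

# Layer-wise LR decay: parameters in earlier blocks get smaller multipliers.
for name in ("backbone.pos_embed",
             "backbone.blocks.0.attn.qkv.weight",
             "backbone.blocks.3.mlp.fc1.weight"):
    print(name, get_vit_lr_decay_rate(name, lr_decay_rate=0.7, num_layers=4))
# 0.7 ** 5, 0.7 ** 4, 0.7 ** 1 respectively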
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/box_regression.py ADDED
@@ -0,0 +1,369 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import math
3
+ from typing import List, Tuple, Union
4
+ import torch
5
+ from fvcore.nn import giou_loss, smooth_l1_loss
6
+ from torch.nn import functional as F
7
+
8
+ from annotator.oneformer.detectron2.layers import cat, ciou_loss, diou_loss
9
+ from annotator.oneformer.detectron2.structures import Boxes
10
+
11
+ # Value for clamping large dw and dh predictions. The heuristic is that we clamp
12
+ # such that dw and dh are no larger than what would transform a 16px box into a
13
+ # 1000px box (based on a small anchor, 16px, and a typical image size, 1000px).
14
+ _DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16)
15
+
16
+
17
+ __all__ = ["Box2BoxTransform", "Box2BoxTransformRotated", "Box2BoxTransformLinear"]
18
+
19
+
20
+ @torch.jit.script
21
+ class Box2BoxTransform(object):
22
+ """
23
+ The box-to-box transform defined in R-CNN. The transformation is parameterized
24
+ by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height
25
+ by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height).
26
+ """
27
+
28
+ def __init__(
29
+ self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP
30
+ ):
31
+ """
32
+ Args:
33
+ weights (4-element tuple): Scaling factors that are applied to the
34
+ (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set
35
+ such that the deltas have unit variance; now they are treated as
36
+ hyperparameters of the system.
37
+ scale_clamp (float): When predicting deltas, the predicted box scaling
38
+ factors (dw and dh) are clamped such that they are <= scale_clamp.
39
+ """
40
+ self.weights = weights
41
+ self.scale_clamp = scale_clamp
42
+
43
+ def get_deltas(self, src_boxes, target_boxes):
44
+ """
45
+ Get box regression transformation deltas (dx, dy, dw, dh) that can be used
46
+ to transform the `src_boxes` into the `target_boxes`. That is, the relation
47
+ ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
48
+ any delta is too large and is clamped).
49
+
50
+ Args:
51
+ src_boxes (Tensor): source boxes, e.g., object proposals
52
+ target_boxes (Tensor): target of the transformation, e.g., ground-truth
53
+ boxes.
54
+ """
55
+ assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
56
+ assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
57
+
58
+ src_widths = src_boxes[:, 2] - src_boxes[:, 0]
59
+ src_heights = src_boxes[:, 3] - src_boxes[:, 1]
60
+ src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
61
+ src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
62
+
63
+ target_widths = target_boxes[:, 2] - target_boxes[:, 0]
64
+ target_heights = target_boxes[:, 3] - target_boxes[:, 1]
65
+ target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths
66
+ target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights
67
+
68
+ wx, wy, ww, wh = self.weights
69
+ dx = wx * (target_ctr_x - src_ctr_x) / src_widths
70
+ dy = wy * (target_ctr_y - src_ctr_y) / src_heights
71
+ dw = ww * torch.log(target_widths / src_widths)
72
+ dh = wh * torch.log(target_heights / src_heights)
73
+
74
+ deltas = torch.stack((dx, dy, dw, dh), dim=1)
75
+ assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!"
76
+ return deltas
77
+
78
+ def apply_deltas(self, deltas, boxes):
79
+ """
80
+ Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`.
81
+
82
+ Args:
83
+ deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
84
+ deltas[i] represents k potentially different class-specific
85
+ box transformations for the single box boxes[i].
86
+ boxes (Tensor): boxes to transform, of shape (N, 4)
87
+ """
88
+ deltas = deltas.float() # ensure fp32 for decoding precision
89
+ boxes = boxes.to(deltas.dtype)
90
+
91
+ widths = boxes[:, 2] - boxes[:, 0]
92
+ heights = boxes[:, 3] - boxes[:, 1]
93
+ ctr_x = boxes[:, 0] + 0.5 * widths
94
+ ctr_y = boxes[:, 1] + 0.5 * heights
95
+
96
+ wx, wy, ww, wh = self.weights
97
+ dx = deltas[:, 0::4] / wx
98
+ dy = deltas[:, 1::4] / wy
99
+ dw = deltas[:, 2::4] / ww
100
+ dh = deltas[:, 3::4] / wh
101
+
102
+ # Prevent sending too large values into torch.exp()
103
+ dw = torch.clamp(dw, max=self.scale_clamp)
104
+ dh = torch.clamp(dh, max=self.scale_clamp)
105
+
106
+ pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
107
+ pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
108
+ pred_w = torch.exp(dw) * widths[:, None]
109
+ pred_h = torch.exp(dh) * heights[:, None]
110
+
111
+ x1 = pred_ctr_x - 0.5 * pred_w
112
+ y1 = pred_ctr_y - 0.5 * pred_h
113
+ x2 = pred_ctr_x + 0.5 * pred_w
114
+ y2 = pred_ctr_y + 0.5 * pred_h
115
+ pred_boxes = torch.stack((x1, y1, x2, y2), dim=-1)
116
+ return pred_boxes.reshape(deltas.shape)
117
+
118
+
119
+ @torch.jit.script
120
+ class Box2BoxTransformRotated(object):
121
+ """
122
+ The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized
123
+ by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height
124
+ by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height),
125
+ and rotates a box's angle by da (radians).
126
+ Note: angles of deltas are in radians while angles of boxes are in degrees.
127
+ """
128
+
129
+ def __init__(
130
+ self,
131
+ weights: Tuple[float, float, float, float, float],
132
+ scale_clamp: float = _DEFAULT_SCALE_CLAMP,
133
+ ):
134
+ """
135
+ Args:
136
+ weights (5-element tuple): Scaling factors that are applied to the
137
+ (dx, dy, dw, dh, da) deltas. These are treated as
138
+ hyperparameters of the system.
139
+ scale_clamp (float): When predicting deltas, the predicted box scaling
140
+ factors (dw and dh) are clamped such that they are <= scale_clamp.
141
+ """
142
+ self.weights = weights
143
+ self.scale_clamp = scale_clamp
144
+
145
+ def get_deltas(self, src_boxes, target_boxes):
146
+ """
147
+ Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used
148
+ to transform the `src_boxes` into the `target_boxes`. That is, the relation
149
+ ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless
150
+ any delta is too large and is clamped).
151
+
152
+ Args:
153
+ src_boxes (Tensor): Nx5 source boxes, e.g., object proposals
154
+ target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth
155
+ boxes.
156
+ """
157
+ assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
158
+ assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
159
+
160
+ src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1)
161
+
162
+ target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind(
163
+ target_boxes, dim=1
164
+ )
165
+
166
+ wx, wy, ww, wh, wa = self.weights
167
+ dx = wx * (target_ctr_x - src_ctr_x) / src_widths
168
+ dy = wy * (target_ctr_y - src_ctr_y) / src_heights
169
+ dw = ww * torch.log(target_widths / src_widths)
170
+ dh = wh * torch.log(target_heights / src_heights)
171
+ # Angles of deltas are in radians while angles of boxes are in degrees.
172
+ # the conversion to radians serves as a way to normalize the values
173
+ da = target_angles - src_angles
174
+ da = (da + 180.0) % 360.0 - 180.0 # make it in [-180, 180)
175
+ da *= wa * math.pi / 180.0
176
+
177
+ deltas = torch.stack((dx, dy, dw, dh, da), dim=1)
178
+ assert (
179
+ (src_widths > 0).all().item()
180
+ ), "Input boxes to Box2BoxTransformRotated are not valid!"
181
+ return deltas
182
+
183
+ def apply_deltas(self, deltas, boxes):
184
+ """
185
+ Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`.
186
+
187
+ Args:
188
+ deltas (Tensor): transformation deltas of shape (N, k*5).
189
+ deltas[i] represents box transformation for the single box boxes[i].
190
+ boxes (Tensor): boxes to transform, of shape (N, 5)
191
+ """
192
+ assert deltas.shape[1] % 5 == 0 and boxes.shape[1] == 5
193
+
194
+ boxes = boxes.to(deltas.dtype).unsqueeze(2)
195
+
196
+ ctr_x = boxes[:, 0]
197
+ ctr_y = boxes[:, 1]
198
+ widths = boxes[:, 2]
199
+ heights = boxes[:, 3]
200
+ angles = boxes[:, 4]
201
+
202
+ wx, wy, ww, wh, wa = self.weights
203
+
204
+ dx = deltas[:, 0::5] / wx
205
+ dy = deltas[:, 1::5] / wy
206
+ dw = deltas[:, 2::5] / ww
207
+ dh = deltas[:, 3::5] / wh
208
+ da = deltas[:, 4::5] / wa
209
+
210
+ # Prevent sending too large values into torch.exp()
211
+ dw = torch.clamp(dw, max=self.scale_clamp)
212
+ dh = torch.clamp(dh, max=self.scale_clamp)
213
+
214
+ pred_boxes = torch.zeros_like(deltas)
215
+ pred_boxes[:, 0::5] = dx * widths + ctr_x # x_ctr
216
+ pred_boxes[:, 1::5] = dy * heights + ctr_y # y_ctr
217
+ pred_boxes[:, 2::5] = torch.exp(dw) * widths # width
218
+ pred_boxes[:, 3::5] = torch.exp(dh) * heights # height
219
+
220
+ # Following original RRPN implementation,
221
+ # angles of deltas are in radians while angles of boxes are in degrees.
222
+ pred_angle = da * 180.0 / math.pi + angles
223
+ pred_angle = (pred_angle + 180.0) % 360.0 - 180.0 # make it in [-180, 180)
224
+
225
+ pred_boxes[:, 4::5] = pred_angle
226
+
227
+ return pred_boxes
228
+
229
+
230
+ class Box2BoxTransformLinear(object):
231
+ """
232
+ The linear box-to-box transform defined in FCOS. The transformation is parameterized
233
+ by the distances from the center of the (square) src box to the 4 edges of the target box.
234
+ """
235
+
236
+ def __init__(self, normalize_by_size=True):
237
+ """
238
+ Args:
239
+ normalize_by_size: normalize deltas by the size of src (anchor) boxes.
240
+ """
241
+ self.normalize_by_size = normalize_by_size
242
+
243
+ def get_deltas(self, src_boxes, target_boxes):
244
+ """
245
+ Get box regression transformation deltas (dx1, dy1, dx2, dy2) that can be used
246
+ to transform the `src_boxes` into the `target_boxes`. That is, the relation
247
+ ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true.
248
+ The center of src must be inside target boxes.
249
+
250
+ Args:
251
+ src_boxes (Tensor): square source boxes, e.g., anchors
252
+ target_boxes (Tensor): target of the transformation, e.g., ground-truth
253
+ boxes.
254
+ """
255
+ assert isinstance(src_boxes, torch.Tensor), type(src_boxes)
256
+ assert isinstance(target_boxes, torch.Tensor), type(target_boxes)
257
+
258
+ src_ctr_x = 0.5 * (src_boxes[:, 0] + src_boxes[:, 2])
259
+ src_ctr_y = 0.5 * (src_boxes[:, 1] + src_boxes[:, 3])
260
+
261
+ target_l = src_ctr_x - target_boxes[:, 0]
262
+ target_t = src_ctr_y - target_boxes[:, 1]
263
+ target_r = target_boxes[:, 2] - src_ctr_x
264
+ target_b = target_boxes[:, 3] - src_ctr_y
265
+
266
+ deltas = torch.stack((target_l, target_t, target_r, target_b), dim=1)
267
+ if self.normalize_by_size:
268
+ stride_w = src_boxes[:, 2] - src_boxes[:, 0]
269
+ stride_h = src_boxes[:, 3] - src_boxes[:, 1]
270
+ strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
271
+ deltas = deltas / strides
272
+
273
+ return deltas
274
+
275
+ def apply_deltas(self, deltas, boxes):
276
+ """
277
+ Apply transformation `deltas` (dx1, dy1, dx2, dy2) to `boxes`.
278
+
279
+ Args:
280
+ deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1.
281
+ deltas[i] represents k potentially different class-specific
282
+ box transformations for the single box boxes[i].
283
+ boxes (Tensor): boxes to transform, of shape (N, 4)
284
+ """
285
+ # Ensure the output is a valid box. See Sec 2.1 of https://arxiv.org/abs/2006.09214
286
+ deltas = F.relu(deltas)
287
+ boxes = boxes.to(deltas.dtype)
288
+
289
+ ctr_x = 0.5 * (boxes[:, 0] + boxes[:, 2])
290
+ ctr_y = 0.5 * (boxes[:, 1] + boxes[:, 3])
291
+ if self.normalize_by_size:
292
+ stride_w = boxes[:, 2] - boxes[:, 0]
293
+ stride_h = boxes[:, 3] - boxes[:, 1]
294
+ strides = torch.stack([stride_w, stride_h, stride_w, stride_h], axis=1)
295
+ deltas = deltas * strides
296
+
297
+ l = deltas[:, 0::4]
298
+ t = deltas[:, 1::4]
299
+ r = deltas[:, 2::4]
300
+ b = deltas[:, 3::4]
301
+
302
+ pred_boxes = torch.zeros_like(deltas)
303
+ pred_boxes[:, 0::4] = ctr_x[:, None] - l # x1
304
+ pred_boxes[:, 1::4] = ctr_y[:, None] - t # y1
305
+ pred_boxes[:, 2::4] = ctr_x[:, None] + r # x2
306
+ pred_boxes[:, 3::4] = ctr_y[:, None] + b # y2
307
+ return pred_boxes
308
+
309
+
310
+ def _dense_box_regression_loss(
311
+ anchors: List[Union[Boxes, torch.Tensor]],
312
+ box2box_transform: Box2BoxTransform,
313
+ pred_anchor_deltas: List[torch.Tensor],
314
+ gt_boxes: List[torch.Tensor],
315
+ fg_mask: torch.Tensor,
316
+ box_reg_loss_type="smooth_l1",
317
+ smooth_l1_beta=0.0,
318
+ ):
319
+ """
320
+ Compute loss for dense multi-level box regression.
321
+ Loss is accumulated over ``fg_mask``.
322
+
323
+ Args:
324
+ anchors: #lvl anchor boxes, each is (HixWixA, 4)
325
+ pred_anchor_deltas: #lvl predictions, each is (N, HixWixA, 4)
326
+ gt_boxes: N ground truth boxes, each has shape (R, 4) (R = sum(Hi * Wi * A))
327
+ fg_mask: the foreground boolean mask of shape (N, R) to compute loss on
328
+ box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou",
329
+ "diou", "ciou".
330
+ smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
331
+ use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
332
+ """
333
+ if isinstance(anchors[0], Boxes):
334
+ anchors = type(anchors[0]).cat(anchors).tensor # (R, 4)
335
+ else:
336
+ anchors = cat(anchors)
337
+ if box_reg_loss_type == "smooth_l1":
338
+ gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
339
+ gt_anchor_deltas = torch.stack(gt_anchor_deltas) # (N, R, 4)
340
+ loss_box_reg = smooth_l1_loss(
341
+ cat(pred_anchor_deltas, dim=1)[fg_mask],
342
+ gt_anchor_deltas[fg_mask],
343
+ beta=smooth_l1_beta,
344
+ reduction="sum",
345
+ )
346
+ elif box_reg_loss_type == "giou":
347
+ pred_boxes = [
348
+ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
349
+ ]
350
+ loss_box_reg = giou_loss(
351
+ torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
352
+ )
353
+ elif box_reg_loss_type == "diou":
354
+ pred_boxes = [
355
+ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
356
+ ]
357
+ loss_box_reg = diou_loss(
358
+ torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
359
+ )
360
+ elif box_reg_loss_type == "ciou":
361
+ pred_boxes = [
362
+ box2box_transform.apply_deltas(k, anchors) for k in cat(pred_anchor_deltas, dim=1)
363
+ ]
364
+ loss_box_reg = ciou_loss(
365
+ torch.stack(pred_boxes)[fg_mask], torch.stack(gt_boxes)[fg_mask], reduction="sum"
366
+ )
367
+ else:
368
+ raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")
369
+ return loss_box_reg
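A quick sanity-check sketch of the delta encoding defined above: `apply_deltas` should invert `get_deltas`. The boxes and weights below are arbitrary illustrative values, and `Box2BoxTransform` is assumed importable from this file.

import torch

transform = Box2BoxTransform(weights=(10.0, 10.0, 5.0, 5.0))
src = torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 25.0, 15.0]])  # XYXY proposals
tgt = torch.tensor([[1.0, 2.0, 11.0, 14.0], [4.0, 6.0, 30.0, 18.0]])  # XYXY targets
deltas = transform.get_deltas(src, tgt)          # (2, 4) regression targets
recovered = transform.apply_deltas(deltas, src)  # decode the deltas back into boxes
assert torch.allclose(recovered, tgt, atol=1e-4)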
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/matcher.py ADDED
@@ -0,0 +1,127 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import List
3
+ import torch
4
+
5
+ from annotator.oneformer.detectron2.layers import nonzero_tuple
6
+
7
+
8
+ # TODO: the name is too general
9
+ class Matcher(object):
10
+ """
11
+ This class assigns to each predicted "element" (e.g., a box) a ground-truth
12
+ element. Each predicted element will have exactly zero or one matches; each
13
+ ground-truth element may be matched to zero or more predicted elements.
14
+
15
+ The matching is determined by the MxN match_quality_matrix, which characterizes
16
+ how well each (ground-truth, prediction) pair matches. For example,
17
+ if the elements are boxes, this matrix may contain box intersection-over-union
18
+ overlap values.
19
+
20
+ The matcher returns (a) a vector of length N containing the index of the
21
+ ground-truth element m in [0, M) that matches to prediction n in [0, N).
22
+ (b) a vector of length N containing the labels for each prediction.
23
+ """
24
+
25
+ def __init__(
26
+ self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False
27
+ ):
28
+ """
29
+ Args:
30
+ thresholds (list): a list of thresholds used to stratify predictions
31
+ into levels.
32
+ labels (list): a list of values to label predictions belonging at
33
+ each level. A label can be one of {-1, 0, 1} signifying
34
+ {ignore, negative class, positive class}, respectively.
35
+ allow_low_quality_matches (bool): if True, produce additional matches
36
+ for predictions with maximum match quality lower than high_threshold.
37
+ See set_low_quality_matches_ for more details.
38
+
39
+ For example,
40
+ thresholds = [0.3, 0.5]
41
+ labels = [0, -1, 1]
42
+ All predictions with iou < 0.3 will be marked with 0 and
43
+ thus will be considered as false positives while training.
44
+ All predictions with 0.3 <= iou < 0.5 will be marked with -1 and
45
+ thus will be ignored.
46
+ All predictions with 0.5 <= iou will be marked with 1 and
47
+ thus will be considered as true positives.
48
+ """
49
+ # Add -inf and +inf to first and last position in thresholds
50
+ thresholds = thresholds[:]
51
+ assert thresholds[0] > 0
52
+ thresholds.insert(0, -float("inf"))
53
+ thresholds.append(float("inf"))
54
+ # Currently torchscript does not support all + generator
55
+ assert all([low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])])
56
+ assert all([l in [-1, 0, 1] for l in labels])
57
+ assert len(labels) == len(thresholds) - 1
58
+ self.thresholds = thresholds
59
+ self.labels = labels
60
+ self.allow_low_quality_matches = allow_low_quality_matches
61
+
62
+ def __call__(self, match_quality_matrix):
63
+ """
64
+ Args:
65
+ match_quality_matrix (Tensor[float]): an MxN tensor, containing the
66
+ pairwise quality between M ground-truth elements and N predicted
67
+ elements. All elements must be >= 0 (due to the use of `torch.nonzero`
68
+ for selecting indices in :meth:`set_low_quality_matches_`).
69
+
70
+ Returns:
71
+ matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
72
+ ground-truth index in [0, M)
73
+ match_labels (Tensor[int8]): a vector of length N, where match_labels[i] indicates
74
+ whether a prediction is a true or false positive or ignored
75
+ """
76
+ assert match_quality_matrix.dim() == 2
77
+ if match_quality_matrix.numel() == 0:
78
+ default_matches = match_quality_matrix.new_full(
79
+ (match_quality_matrix.size(1),), 0, dtype=torch.int64
80
+ )
81
+ # When no gt boxes exist, we define IOU = 0 and therefore set labels
82
+ # to `self.labels[0]`, which usually defaults to background class 0
83
+ # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
84
+ default_match_labels = match_quality_matrix.new_full(
85
+ (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
86
+ )
87
+ return default_matches, default_match_labels
88
+
89
+ assert torch.all(match_quality_matrix >= 0)
90
+
91
+ # match_quality_matrix is M (gt) x N (predicted)
92
+ # Max over gt elements (dim 0) to find best gt candidate for each prediction
93
+ matched_vals, matches = match_quality_matrix.max(dim=0)
94
+
95
+ match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
96
+
97
+ for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
98
+ low_high = (matched_vals >= low) & (matched_vals < high)
99
+ match_labels[low_high] = l
100
+
101
+ if self.allow_low_quality_matches:
102
+ self.set_low_quality_matches_(match_labels, match_quality_matrix)
103
+
104
+ return matches, match_labels
105
+
106
+ def set_low_quality_matches_(self, match_labels, match_quality_matrix):
107
+ """
108
+ Produce additional matches for predictions that have only low-quality matches.
109
+ Specifically, for each ground-truth G find the set of predictions that have
110
+ maximum overlap with it (including ties); for each prediction in that set, if
111
+ it is unmatched, then match it to the ground-truth G.
112
+
113
+ This function implements the RPN assignment case (i) in Sec. 3.1.2 of
114
+ :paper:`Faster R-CNN`.
115
+ """
116
+ # For each gt, find the prediction with which it has highest quality
117
+ highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
118
+ # Find the highest quality match available, even if it is low, including ties.
119
+ # Note that the matches qualities must be positive due to the use of
120
+ # `torch.nonzero`.
121
+ _, pred_inds_with_highest_quality = nonzero_tuple(
122
+ match_quality_matrix == highest_quality_foreach_gt[:, None]
123
+ )
124
+ # If an anchor was labeled positive only due to a low-quality match
125
+ # with gt_A, but it has larger overlap with gt_B, its matched index will still be gt_B.
126
+ # This follows the implementation in Detectron, and is found to have no significant impact.
127
+ match_labels[pred_inds_with_highest_quality] = 1
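A small sketch of the matching semantics documented above, on a hand-written 2x4 IoU matrix (values are arbitrary): predictions with IoU < 0.3 become background (0), 0.3 <= IoU < 0.5 are ignored (-1), and IoU >= 0.5 are foreground (1). `Matcher` is assumed importable from this file.

import torch

iou = torch.tensor([[0.10, 0.40, 0.70, 0.05],    # gt 0 vs 4 predictions
                    [0.20, 0.35, 0.10, 0.60]])   # gt 1 vs 4 predictions
matcher = Matcher(thresholds=[0.3, 0.5], labels=[0, -1, 1], allow_low_quality_matches=False)
matches, match_labels = matcher(iou)
# matches      -> tensor([1, 0, 0, 1])                       best gt index per prediction
# match_labels -> tensor([ 0, -1,  1,  1], dtype=torch.int8) background / ignore / foreground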
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+
4
+ from .build import META_ARCH_REGISTRY, build_model # isort:skip
5
+
6
+ from .panoptic_fpn import PanopticFPN
7
+
8
+ # import all the meta_arch, so they will be registered
9
+ from .rcnn import GeneralizedRCNN, ProposalNetwork
10
+ from .dense_detector import DenseDetector
11
+ from .retinanet import RetinaNet
12
+ from .fcos import FCOS
13
+ from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head
14
+
15
+
16
+ __all__ = list(globals().keys())
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/build.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+
4
+ from annotator.oneformer.detectron2.utils.logger import _log_api_usage
5
+ from annotator.oneformer.detectron2.utils.registry import Registry
6
+
7
+ META_ARCH_REGISTRY = Registry("META_ARCH") # noqa F401 isort:skip
8
+ META_ARCH_REGISTRY.__doc__ = """
9
+ Registry for meta-architectures, i.e. the whole model.
10
+
11
+ The registered object will be called with `obj(cfg)`
12
+ and expected to return a `nn.Module` object.
13
+ """
14
+
15
+
16
+ def build_model(cfg):
17
+ """
18
+ Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``.
19
+ Note that it does not load any weights from ``cfg``.
20
+ """
21
+ meta_arch = cfg.MODEL.META_ARCHITECTURE
22
+ model = META_ARCH_REGISTRY.get(meta_arch)(cfg)
23
+ _log_api_usage("modeling.meta_arch." + meta_arch)
24
+ return model
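A minimal sketch of how a model plugs into this registry. The class name and its internals are hypothetical; only the decorator and the `obj(cfg)` calling convention come from the file above.

import torch
from torch import nn

@META_ARCH_REGISTRY.register()
class ToyDetector(nn.Module):
    """Hypothetical meta-architecture, registered under its class name."""

    def __init__(self, cfg):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)

    def forward(self, batched_inputs):
        # batched_inputs follows the detectron2 convention: a list of dicts with an "image" tensor.
        return [{"features": self.conv(d["image"].float()[None])} for d in batched_inputs]

# With cfg.MODEL.META_ARCHITECTURE = "ToyDetector", build_model(cfg) looks the
# class up in META_ARCH_REGISTRY and instantiates it as ToyDetector(cfg).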
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/dense_detector.py ADDED
@@ -0,0 +1,294 @@
1
+ import numpy as np
2
+ from typing import Dict, List, Optional, Tuple
3
+ import torch
4
+ from torch import Tensor, nn
5
+
6
+ from annotator.oneformer.detectron2.data.detection_utils import convert_image_to_rgb
7
+ from annotator.oneformer.detectron2.layers import move_device_like
8
+ from annotator.oneformer.detectron2.modeling import Backbone
9
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances
10
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
11
+
12
+ from ..postprocessing import detector_postprocess
13
+
14
+
15
+ def permute_to_N_HWA_K(tensor, K: int):
16
+ """
17
+ Transpose/reshape a tensor from (N, (Ai x K), H, W) to (N, (HxWxAi), K)
18
+ """
19
+ assert tensor.dim() == 4, tensor.shape
20
+ N, _, H, W = tensor.shape
21
+ tensor = tensor.view(N, -1, K, H, W)
22
+ tensor = tensor.permute(0, 3, 4, 1, 2)
23
+ tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K)
24
+ return tensor
25
+
26
+
27
+ class DenseDetector(nn.Module):
28
+ """
29
+ Base class for dense detectors. We define a dense detector as a fully-convolutional model that
30
+ makes per-pixel (i.e. dense) predictions.
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ backbone: Backbone,
36
+ head: nn.Module,
37
+ head_in_features: Optional[List[str]] = None,
38
+ *,
39
+ pixel_mean,
40
+ pixel_std,
41
+ ):
42
+ """
43
+ Args:
44
+ backbone: backbone module
45
+ head: head module
46
+ head_in_features: backbone features to use in head. Default to all backbone features.
47
+ pixel_mean (Tuple[float]):
48
+ Values to be used for image normalization (BGR order).
49
+ To train on images with a different number of channels, set different mean & std.
50
+ Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675]
51
+ pixel_std (Tuple[float]):
52
+ When using pre-trained models in Detectron1 or any MSRA models,
53
+ std has been absorbed into its conv1 weights, so the std needs to be set to 1.
54
+ Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std)
55
+ """
56
+ super().__init__()
57
+
58
+ self.backbone = backbone
59
+ self.head = head
60
+ if head_in_features is None:
61
+ shapes = self.backbone.output_shape()
62
+ self.head_in_features = sorted(shapes.keys(), key=lambda x: shapes[x].stride)
63
+ else:
64
+ self.head_in_features = head_in_features
65
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
66
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
67
+
68
+ @property
69
+ def device(self):
70
+ return self.pixel_mean.device
71
+
72
+ def _move_to_current_device(self, x):
73
+ return move_device_like(x, self.pixel_mean)
74
+
75
+ def forward(self, batched_inputs: List[Dict[str, Tensor]]):
76
+ """
77
+ Args:
78
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
79
+ Each item in the list contains the inputs for one image.
80
+ For now, each item in the list is a dict that contains:
81
+
82
+ * image: Tensor, image in (C, H, W) format.
83
+ * instances: Instances
84
+
85
+ Other information that's included in the original dicts, such as:
86
+
87
+ * "height", "width" (int): the output resolution of the model, used in inference.
88
+ See :meth:`postprocess` for details.
89
+
90
+ Returns:
91
+ In training, dict[str, Tensor]: mapping from a named loss to a tensor storing the
92
+ loss. Used during training only. In inference, the standard output format, described
93
+ in :doc:`/tutorials/models`.
94
+ """
95
+ images = self.preprocess_image(batched_inputs)
96
+ features = self.backbone(images.tensor)
97
+ features = [features[f] for f in self.head_in_features]
98
+ predictions = self.head(features)
99
+
100
+ if self.training:
101
+ assert not torch.jit.is_scripting(), "Not supported"
102
+ assert "instances" in batched_inputs[0], "Instance annotations are missing in training!"
103
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
104
+ return self.forward_training(images, features, predictions, gt_instances)
105
+ else:
106
+ results = self.forward_inference(images, features, predictions)
107
+ if torch.jit.is_scripting():
108
+ return results
109
+
110
+ processed_results = []
111
+ for results_per_image, input_per_image, image_size in zip(
112
+ results, batched_inputs, images.image_sizes
113
+ ):
114
+ height = input_per_image.get("height", image_size[0])
115
+ width = input_per_image.get("width", image_size[1])
116
+ r = detector_postprocess(results_per_image, height, width)
117
+ processed_results.append({"instances": r})
118
+ return processed_results
119
+
120
+ def forward_training(self, images, features, predictions, gt_instances):
121
+ raise NotImplementedError()
122
+
123
+ def preprocess_image(self, batched_inputs: List[Dict[str, Tensor]]):
124
+ """
125
+ Normalize, pad and batch the input images.
126
+ """
127
+ images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
128
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
129
+ images = ImageList.from_tensors(
130
+ images,
131
+ self.backbone.size_divisibility,
132
+ padding_constraints=self.backbone.padding_constraints,
133
+ )
134
+ return images
135
+
136
+ def _transpose_dense_predictions(
137
+ self, predictions: List[List[Tensor]], dims_per_anchor: List[int]
138
+ ) -> List[List[Tensor]]:
139
+ """
140
+ Transpose the dense per-level predictions.
141
+
142
+ Args:
143
+ predictions: a list of outputs, each is a list of per-level
144
+ predictions with shape (N, Ai x K, Hi, Wi), where N is the
145
+ number of images, Ai is the number of anchors per location on
146
+ level i, K is the dimension of predictions per anchor.
147
+ dims_per_anchor: the value of K for each prediction, e.g. 4 for
148
+ box prediction, #classes for classification prediction.
149
+
150
+ Returns:
151
+ List[List[Tensor]]: each prediction is transposed to (N, Hi x Wi x Ai, K).
152
+ """
153
+ assert len(predictions) == len(dims_per_anchor)
154
+ res: List[List[Tensor]] = []
155
+ for pred, dim_per_anchor in zip(predictions, dims_per_anchor):
156
+ pred = [permute_to_N_HWA_K(x, dim_per_anchor) for x in pred]
157
+ res.append(pred)
158
+ return res
159
+
160
+ def _ema_update(self, name: str, value: float, initial_value: float, momentum: float = 0.9):
161
+ """
162
+ Apply EMA update to `self.name` using `value`.
163
+
164
+ This is mainly used for loss normalizer. In Detectron1, loss is normalized by number
165
+ of foreground samples in the batch. When batch size is 1 per GPU, #foreground has a
166
+ large variance and using it leads to lower performance. Therefore we maintain an EMA of
167
+ #foreground to stabilize the normalizer.
168
+
169
+ Args:
170
+ name: name of the normalizer
171
+ value: the new value to update
172
+ initial_value: the initial value to start with
173
+ momentum: momentum of EMA
174
+
175
+ Returns:
176
+ float: the updated EMA value
177
+ """
178
+ if hasattr(self, name):
179
+ old = getattr(self, name)
180
+ else:
181
+ old = initial_value
182
+ new = old * momentum + value * (1 - momentum)
183
+ setattr(self, name, new)
184
+ return new
185
+
186
+ def _decode_per_level_predictions(
187
+ self,
188
+ anchors: Boxes,
189
+ pred_scores: Tensor,
190
+ pred_deltas: Tensor,
191
+ score_thresh: float,
192
+ topk_candidates: int,
193
+ image_size: Tuple[int, int],
194
+ ) -> Instances:
195
+ """
196
+ Decode boxes and classification predictions of one feature level, by
197
+ the following steps:
198
+ 1. filter the predictions based on score threshold and top K scores.
199
+ 2. transform the box regression outputs
200
+ 3. return the predicted scores, classes and boxes
201
+
202
+ Args:
203
+ anchors: Boxes, anchor for this feature level
204
+ pred_scores: HxWxA,K
205
+ pred_deltas: HxWxA,4
206
+
207
+ Returns:
208
+ Instances: with fields "scores", "pred_boxes", "pred_classes".
209
+ """
210
+ # Apply two filtering steps to make NMS faster.
211
+ # 1. Keep boxes with confidence score higher than threshold
212
+ keep_idxs = pred_scores > score_thresh
213
+ pred_scores = pred_scores[keep_idxs]
214
+ topk_idxs = torch.nonzero(keep_idxs) # Kx2
215
+
216
+ # 2. Keep top k top scoring boxes only
217
+ topk_idxs_size = topk_idxs.shape[0]
218
+ if isinstance(topk_idxs_size, Tensor):
219
+ # It's a tensor in tracing
220
+ num_topk = torch.clamp(topk_idxs_size, max=topk_candidates)
221
+ else:
222
+ num_topk = min(topk_idxs_size, topk_candidates)
223
+ pred_scores, idxs = pred_scores.topk(num_topk)
224
+ topk_idxs = topk_idxs[idxs]
225
+
226
+ anchor_idxs, classes_idxs = topk_idxs.unbind(dim=1)
227
+
228
+ pred_boxes = self.box2box_transform.apply_deltas(
229
+ pred_deltas[anchor_idxs], anchors.tensor[anchor_idxs]
230
+ )
231
+ return Instances(
232
+ image_size, pred_boxes=Boxes(pred_boxes), scores=pred_scores, pred_classes=classes_idxs
233
+ )
234
+
235
+ def _decode_multi_level_predictions(
236
+ self,
237
+ anchors: List[Boxes],
238
+ pred_scores: List[Tensor],
239
+ pred_deltas: List[Tensor],
240
+ score_thresh: float,
241
+ topk_candidates: int,
242
+ image_size: Tuple[int, int],
243
+ ) -> Instances:
244
+ """
245
+ Run `_decode_per_level_predictions` for all feature levels and concat the results.
246
+ """
247
+ predictions = [
248
+ self._decode_per_level_predictions(
249
+ anchors_i,
250
+ box_cls_i,
251
+ box_reg_i,
252
+ self.test_score_thresh,
253
+ self.test_topk_candidates,
254
+ image_size,
255
+ )
256
+ # Iterate over every feature level
257
+ for box_cls_i, box_reg_i, anchors_i in zip(pred_scores, pred_deltas, anchors)
258
+ ]
259
+ return predictions[0].cat(predictions) # 'Instances.cat' is not scriptable but this is
260
+
261
+ def visualize_training(self, batched_inputs, results):
262
+ """
263
+ A function used to visualize ground truth images and final network predictions.
264
+ It shows ground truth bounding boxes on the original image and up to 20
265
+ predicted object bounding boxes on the original image.
266
+
267
+ Args:
268
+ batched_inputs (list): a list that contains input to the model.
269
+ results (List[Instances]): a list of #images elements returned by forward_inference().
270
+ """
271
+ from annotator.oneformer.detectron2.utils.visualizer import Visualizer
272
+
273
+ assert len(batched_inputs) == len(
274
+ results
275
+ ), "Cannot visualize inputs and results of different sizes"
276
+ storage = get_event_storage()
277
+ max_boxes = 20
278
+
279
+ image_index = 0 # only visualize a single image
280
+ img = batched_inputs[image_index]["image"]
281
+ img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
282
+ v_gt = Visualizer(img, None)
283
+ v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes)
284
+ anno_img = v_gt.get_image()
285
+ processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1])
286
+ predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy()
287
+
288
+ v_pred = Visualizer(img, None)
289
+ v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes])
290
+ prop_img = v_pred.get_image()
291
+ vis_img = np.vstack((anno_img, prop_img))
292
+ vis_img = vis_img.transpose(2, 0, 1)
293
+ vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results"
294
+ storage.put_image(vis_name, vis_img)
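A shape-only sketch of what `permute_to_N_HWA_K` does; the sizes are arbitrary and the function is assumed importable from this file.

import torch

# One head output level: batch N=2, A=9 anchors x K=4 values per anchor, on a 5x7 feature map.
x = torch.randn(2, 9 * 4, 5, 7)          # (N, A*K, H, W)
y = permute_to_N_HWA_K(x, 4)             # -> (N, H*W*A, K)
print(y.shape)                           # torch.Size([2, 315, 4]), since 5*7*9 = 315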
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/fcos.py ADDED
@@ -0,0 +1,328 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+
3
+ import logging
4
+ from typing import List, Optional, Tuple
5
+ import torch
6
+ from fvcore.nn import sigmoid_focal_loss_jit
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+
10
+ from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms
11
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_point_box_distance
12
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
13
+
14
+ from ..anchor_generator import DefaultAnchorGenerator
15
+ from ..backbone import Backbone
16
+ from ..box_regression import Box2BoxTransformLinear, _dense_box_regression_loss
17
+ from .dense_detector import DenseDetector
18
+ from .retinanet import RetinaNetHead
19
+
20
+ __all__ = ["FCOS"]
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class FCOS(DenseDetector):
26
+ """
27
+ Implement FCOS in :paper:`fcos`.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ *,
33
+ backbone: Backbone,
34
+ head: nn.Module,
35
+ head_in_features: Optional[List[str]] = None,
36
+ box2box_transform=None,
37
+ num_classes,
38
+ center_sampling_radius: float = 1.5,
39
+ focal_loss_alpha=0.25,
40
+ focal_loss_gamma=2.0,
41
+ test_score_thresh=0.2,
42
+ test_topk_candidates=1000,
43
+ test_nms_thresh=0.6,
44
+ max_detections_per_image=100,
45
+ pixel_mean,
46
+ pixel_std,
47
+ ):
48
+ """
49
+ Args:
50
+ center_sampling_radius: radius of the "center" of a groundtruth box,
51
+ within which all anchor points are labeled positive.
52
+ Other arguments mean the same as in :class:`RetinaNet`.
53
+ """
54
+ super().__init__(
55
+ backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
56
+ )
57
+
58
+ self.num_classes = num_classes
59
+
60
+ # FCOS uses one anchor point per location.
61
+ # We represent the anchor point by a box whose size equals the anchor stride.
62
+ feature_shapes = backbone.output_shape()
63
+ fpn_strides = [feature_shapes[k].stride for k in self.head_in_features]
64
+ self.anchor_generator = DefaultAnchorGenerator(
65
+ sizes=[[k] for k in fpn_strides], aspect_ratios=[1.0], strides=fpn_strides
66
+ )
67
+
68
+ # FCOS parameterizes box regression by a linear transform,
69
+ # where predictions are normalized by anchor stride (equal to anchor size).
70
+ if box2box_transform is None:
71
+ box2box_transform = Box2BoxTransformLinear(normalize_by_size=True)
72
+ self.box2box_transform = box2box_transform
73
+
74
+ self.center_sampling_radius = float(center_sampling_radius)
75
+
76
+ # Loss parameters:
77
+ self.focal_loss_alpha = focal_loss_alpha
78
+ self.focal_loss_gamma = focal_loss_gamma
79
+
80
+ # Inference parameters:
81
+ self.test_score_thresh = test_score_thresh
82
+ self.test_topk_candidates = test_topk_candidates
83
+ self.test_nms_thresh = test_nms_thresh
84
+ self.max_detections_per_image = max_detections_per_image
85
+
86
+ def forward_training(self, images, features, predictions, gt_instances):
87
+ # Transpose the Hi*Wi*A dimension to the middle:
88
+ pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
89
+ predictions, [self.num_classes, 4, 1]
90
+ )
91
+ anchors = self.anchor_generator(features)
92
+ gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
93
+ return self.losses(
94
+ anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
95
+ )
96
+
97
+ @torch.no_grad()
98
+ def _match_anchors(self, gt_boxes: Boxes, anchors: List[Boxes]):
99
+ """
100
+ Match ground-truth boxes to a set of multi-level anchors.
101
+
102
+ Args:
103
+ gt_boxes: Ground-truth boxes from instances of an image.
104
+ anchors: List of anchors for each feature map (of different scales).
105
+
106
+ Returns:
107
+ torch.Tensor
108
+ A tensor of shape `(M, R)`, given `M` ground-truth boxes and total
109
+ `R` anchor points from all feature levels, indicating the quality
110
+ of match between m-th box and r-th anchor. Higher value indicates
111
+ better match.
112
+ """
113
+ # Naming convention: (M = ground-truth boxes, R = anchor points)
114
+ # Anchor points are represented as square boxes of size = stride.
115
+ num_anchors_per_level = [len(x) for x in anchors]
116
+ anchors = Boxes.cat(anchors) # (R, 4)
117
+ anchor_centers = anchors.get_centers() # (R, 2)
118
+ anchor_sizes = anchors.tensor[:, 2] - anchors.tensor[:, 0] # (R, )
119
+
120
+ lower_bound = anchor_sizes * 4
121
+ lower_bound[: num_anchors_per_level[0]] = 0
122
+ upper_bound = anchor_sizes * 8
123
+ upper_bound[-num_anchors_per_level[-1] :] = float("inf")
124
+
125
+ gt_centers = gt_boxes.get_centers()
126
+
127
+ # FCOS with center sampling: anchor point must be close enough to
128
+ # ground-truth box center.
129
+ center_dists = (anchor_centers[None, :, :] - gt_centers[:, None, :]).abs_()
130
+ sampling_regions = self.center_sampling_radius * anchor_sizes[None, :]
131
+
132
+ match_quality_matrix = center_dists.max(dim=2).values < sampling_regions
133
+
134
+ pairwise_dist = pairwise_point_box_distance(anchor_centers, gt_boxes)
135
+ pairwise_dist = pairwise_dist.permute(1, 0, 2) # (M, R, 4)
136
+
137
+ # The original FCOS anchor matching rule: anchor point must be inside GT.
138
+ match_quality_matrix &= pairwise_dist.min(dim=2).values > 0
139
+
140
+ # Multilevel anchor matching in FCOS: each anchor is only responsible
141
+ # for certain scale range.
142
+ pairwise_dist = pairwise_dist.max(dim=2).values
143
+ match_quality_matrix &= (pairwise_dist > lower_bound[None, :]) & (
144
+ pairwise_dist < upper_bound[None, :]
145
+ )
146
+ # Match the GT box with minimum area, if there are multiple GT matches.
147
+ gt_areas = gt_boxes.area() # (M, )
148
+
149
+ match_quality_matrix = match_quality_matrix.to(torch.float32)
150
+ match_quality_matrix *= 1e8 - gt_areas[:, None]
151
+ return match_quality_matrix # (M, R)
152
+
153
+ @torch.no_grad()
154
+ def label_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]):
155
+ """
156
+ Same interface as :meth:`RetinaNet.label_anchors`, but implemented with FCOS
157
+ anchor matching rule.
158
+
159
+ Unlike RetinaNet, there are no ignored anchors.
160
+ """
161
+
162
+ gt_labels, matched_gt_boxes = [], []
163
+
164
+ for inst in gt_instances:
165
+ if len(inst) > 0:
166
+ match_quality_matrix = self._match_anchors(inst.gt_boxes, anchors)
167
+
168
+ # Find matched ground-truth box per anchor. Un-matched anchors are
169
+ # assigned -1. This is equivalent to using an anchor matcher as used
170
+ # in R-CNN/RetinaNet: `Matcher(thresholds=[1e-5], labels=[0, 1])`
171
+ match_quality, matched_idxs = match_quality_matrix.max(dim=0)
172
+ matched_idxs[match_quality < 1e-5] = -1
173
+
174
+ matched_gt_boxes_i = inst.gt_boxes.tensor[matched_idxs.clip(min=0)]
175
+ gt_labels_i = inst.gt_classes[matched_idxs.clip(min=0)]
176
+
177
+ # Anchors with matched_idxs = -1 are labeled background.
178
+ gt_labels_i[matched_idxs < 0] = self.num_classes
179
+ else:
180
+ matched_gt_boxes_i = torch.zeros_like(Boxes.cat(anchors).tensor)
181
+ gt_labels_i = torch.full(
182
+ (len(matched_gt_boxes_i),),
183
+ fill_value=self.num_classes,
184
+ dtype=torch.long,
185
+ device=matched_gt_boxes_i.device,
186
+ )
187
+
188
+ gt_labels.append(gt_labels_i)
189
+ matched_gt_boxes.append(matched_gt_boxes_i)
190
+
191
+ return gt_labels, matched_gt_boxes
192
+
193
+ def losses(
194
+ self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes, pred_centerness
195
+ ):
196
+ """
197
+ This method is almost identical to :meth:`RetinaNet.losses`, with an extra
198
+ "loss_centerness" in the returned dict.
199
+ """
200
+ num_images = len(gt_labels)
201
+ gt_labels = torch.stack(gt_labels) # (M, R)
202
+
203
+ pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
204
+ num_pos_anchors = pos_mask.sum().item()
205
+ get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
206
+ normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 300)
207
+
208
+ # classification and regression loss
209
+ gt_labels_target = F.one_hot(gt_labels, num_classes=self.num_classes + 1)[
210
+ :, :, :-1
211
+ ] # no loss for the last (background) class
212
+ loss_cls = sigmoid_focal_loss_jit(
213
+ torch.cat(pred_logits, dim=1),
214
+ gt_labels_target.to(pred_logits[0].dtype),
215
+ alpha=self.focal_loss_alpha,
216
+ gamma=self.focal_loss_gamma,
217
+ reduction="sum",
218
+ )
219
+
220
+ loss_box_reg = _dense_box_regression_loss(
221
+ anchors,
222
+ self.box2box_transform,
223
+ pred_anchor_deltas,
224
+ gt_boxes,
225
+ pos_mask,
226
+ box_reg_loss_type="giou",
227
+ )
228
+
229
+ ctrness_targets = self.compute_ctrness_targets(anchors, gt_boxes) # (M, R)
230
+ pred_centerness = torch.cat(pred_centerness, dim=1).squeeze(dim=2) # (M, R)
231
+ ctrness_loss = F.binary_cross_entropy_with_logits(
232
+ pred_centerness[pos_mask], ctrness_targets[pos_mask], reduction="sum"
233
+ )
234
+ return {
235
+ "loss_fcos_cls": loss_cls / normalizer,
236
+ "loss_fcos_loc": loss_box_reg / normalizer,
237
+ "loss_fcos_ctr": ctrness_loss / normalizer,
238
+ }
239
+
240
+ def compute_ctrness_targets(self, anchors: List[Boxes], gt_boxes: List[torch.Tensor]):
241
+ anchors = Boxes.cat(anchors).tensor # Rx4
242
+ reg_targets = [self.box2box_transform.get_deltas(anchors, m) for m in gt_boxes]
243
+ reg_targets = torch.stack(reg_targets, dim=0) # NxRx4
244
+ if len(reg_targets) == 0:
245
+ return reg_targets.new_zeros(len(reg_targets))
246
+ left_right = reg_targets[:, :, [0, 2]]
247
+ top_bottom = reg_targets[:, :, [1, 3]]
248
+ ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * (
249
+ top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]
250
+ )
251
+ return torch.sqrt(ctrness)
252
+
253
+ def forward_inference(
254
+ self,
255
+ images: ImageList,
256
+ features: List[torch.Tensor],
257
+ predictions: List[List[torch.Tensor]],
258
+ ):
259
+ pred_logits, pred_anchor_deltas, pred_centerness = self._transpose_dense_predictions(
260
+ predictions, [self.num_classes, 4, 1]
261
+ )
262
+ anchors = self.anchor_generator(features)
263
+
264
+ results: List[Instances] = []
265
+ for img_idx, image_size in enumerate(images.image_sizes):
266
+ scores_per_image = [
267
+ # Multiply and sqrt centerness & classification scores
268
+ # (See eqn. 4 in https://arxiv.org/abs/2006.09214)
269
+ torch.sqrt(x[img_idx].sigmoid_() * y[img_idx].sigmoid_())
270
+ for x, y in zip(pred_logits, pred_centerness)
271
+ ]
272
+ deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
273
+ results_per_image = self.inference_single_image(
274
+ anchors, scores_per_image, deltas_per_image, image_size
275
+ )
276
+ results.append(results_per_image)
277
+ return results
278
+
279
+ def inference_single_image(
280
+ self,
281
+ anchors: List[Boxes],
282
+ box_cls: List[torch.Tensor],
283
+ box_delta: List[torch.Tensor],
284
+ image_size: Tuple[int, int],
285
+ ):
286
+ """
287
+ Identical to :meth:`RetinaNet.inference_single_image`.
288
+ """
289
+ pred = self._decode_multi_level_predictions(
290
+ anchors,
291
+ box_cls,
292
+ box_delta,
293
+ self.test_score_thresh,
294
+ self.test_topk_candidates,
295
+ image_size,
296
+ )
297
+ keep = batched_nms(
298
+ pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
299
+ )
300
+ return pred[keep[: self.max_detections_per_image]]
301
+
302
+
303
+ class FCOSHead(RetinaNetHead):
304
+ """
305
+ The head used in :paper:`fcos`. It adds an additional centerness
306
+ prediction branch on top of :class:`RetinaNetHead`.
307
+ """
308
+
309
+ def __init__(self, *, input_shape: List[ShapeSpec], conv_dims: List[int], **kwargs):
310
+ super().__init__(input_shape=input_shape, conv_dims=conv_dims, num_anchors=1, **kwargs)
311
+ # Unlike original FCOS, we do not add an additional learnable scale layer
312
+ # because it's found to have no benefits after normalizing regression targets by stride.
313
+ self._num_features = len(input_shape)
314
+ self.ctrness = nn.Conv2d(conv_dims[-1], 1, kernel_size=3, stride=1, padding=1)
315
+ torch.nn.init.normal_(self.ctrness.weight, std=0.01)
316
+ torch.nn.init.constant_(self.ctrness.bias, 0)
317
+
318
+ def forward(self, features):
319
+ assert len(features) == self._num_features
320
+ logits = []
321
+ bbox_reg = []
322
+ ctrness = []
323
+ for feature in features:
324
+ logits.append(self.cls_score(self.cls_subnet(feature)))
325
+ bbox_feature = self.bbox_subnet(feature)
326
+ bbox_reg.append(self.bbox_pred(bbox_feature))
327
+ ctrness.append(self.ctrness(bbox_feature))
328
+ return logits, bbox_reg, ctrness
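The centerness target used in `compute_ctrness_targets` above is sqrt((min(l,r)/max(l,r)) * (min(t,b)/max(t,b))); a tiny numeric sketch with hand-picked (l, t, r, b) regression targets:

import torch

reg = torch.tensor([[[2.0, 8.0, 6.0, 4.0]]])   # (N=1, R=1, 4): l=2, t=8, r=6, b=4
lr, tb = reg[:, :, [0, 2]], reg[:, :, [1, 3]]
ctrness = torch.sqrt((lr.min(dim=-1).values / lr.max(dim=-1).values)
                     * (tb.min(dim=-1).values / tb.max(dim=-1).values))
print(ctrness)   # tensor([[0.4082]]) == sqrt((2/6) * (4/8))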
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/panoptic_fpn.py ADDED
@@ -0,0 +1,269 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+
4
+ import logging
5
+ from typing import Dict, List
6
+ import torch
7
+ from torch import nn
8
+
9
+ from annotator.oneformer.detectron2.config import configurable
10
+ from annotator.oneformer.detectron2.structures import ImageList
11
+
12
+ from ..postprocessing import detector_postprocess, sem_seg_postprocess
13
+ from .build import META_ARCH_REGISTRY
14
+ from .rcnn import GeneralizedRCNN
15
+ from .semantic_seg import build_sem_seg_head
16
+
17
+ __all__ = ["PanopticFPN"]
18
+
19
+
20
+ @META_ARCH_REGISTRY.register()
21
+ class PanopticFPN(GeneralizedRCNN):
22
+ """
23
+ Implement the paper :paper:`PanopticFPN`.
24
+ """
25
+
26
+ @configurable
27
+ def __init__(
28
+ self,
29
+ *,
30
+ sem_seg_head: nn.Module,
31
+ combine_overlap_thresh: float = 0.5,
32
+ combine_stuff_area_thresh: float = 4096,
33
+ combine_instances_score_thresh: float = 0.5,
34
+ **kwargs,
35
+ ):
36
+ """
37
+ NOTE: this interface is experimental.
38
+
39
+ Args:
40
+ sem_seg_head: a module for the semantic segmentation head.
41
+ combine_overlap_thresh: combine masks into one instance if
42
+ they have enough overlap
43
+ combine_stuff_area_thresh: ignore stuff areas smaller than this threshold
44
+ combine_instances_score_thresh: ignore instances whose score is
45
+ smaller than this threshold
46
+
47
+ Other arguments are the same as :class:`GeneralizedRCNN`.
48
+ """
49
+ super().__init__(**kwargs)
50
+ self.sem_seg_head = sem_seg_head
51
+ # options when combining instance & semantic outputs
52
+ self.combine_overlap_thresh = combine_overlap_thresh
53
+ self.combine_stuff_area_thresh = combine_stuff_area_thresh
54
+ self.combine_instances_score_thresh = combine_instances_score_thresh
55
+
56
+ @classmethod
57
+ def from_config(cls, cfg):
58
+ ret = super().from_config(cfg)
59
+ ret.update(
60
+ {
61
+ "combine_overlap_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH,
62
+ "combine_stuff_area_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT,
63
+ "combine_instances_score_thresh": cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH, # noqa
64
+ }
65
+ )
66
+ ret["sem_seg_head"] = build_sem_seg_head(cfg, ret["backbone"].output_shape())
67
+ logger = logging.getLogger(__name__)
68
+ if not cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED:
69
+ logger.warning(
70
+ "PANOPTIC_FPN.COMBINED.ENABLED is no longer used. "
71
+ " model.inference(do_postprocess=) should be used to toggle postprocessing."
72
+ )
73
+ if cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT != 1.0:
74
+ w = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT
75
+ logger.warning(
76
+ "PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT should be replaced by weights on each ROI head."
77
+ )
78
+
79
+ def update_weight(x):
80
+ if isinstance(x, dict):
81
+ return {k: v * w for k, v in x.items()}
82
+ else:
83
+ return x * w
84
+
85
+ roi_heads = ret["roi_heads"]
86
+ roi_heads.box_predictor.loss_weight = update_weight(roi_heads.box_predictor.loss_weight)
87
+ roi_heads.mask_head.loss_weight = update_weight(roi_heads.mask_head.loss_weight)
88
+ return ret
89
+
90
+ def forward(self, batched_inputs):
91
+ """
92
+ Args:
93
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
94
+ Each item in the list contains the inputs for one image.
95
+
96
+ For now, each item in the list is a dict that contains:
97
+
98
+ * "image": Tensor, image in (C, H, W) format.
99
+ * "instances": Instances
100
+ * "sem_seg": semantic segmentation ground truth.
101
+ * Other information that's included in the original dicts, such as:
102
+ "height", "width" (int): the output resolution of the model, used in inference.
103
+ See :meth:`postprocess` for details.
104
+
105
+ Returns:
106
+ list[dict]:
107
+ each dict has the results for one image. The dict contains the following keys:
108
+
109
+ * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
110
+ * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
111
+ * "panoptic_seg": See the return value of
112
+ :func:`combine_semantic_and_instance_outputs` for its format.
113
+ """
114
+ if not self.training:
115
+ return self.inference(batched_inputs)
116
+ images = self.preprocess_image(batched_inputs)
117
+ features = self.backbone(images.tensor)
118
+
119
+ assert "sem_seg" in batched_inputs[0]
120
+ gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
121
+ gt_sem_seg = ImageList.from_tensors(
122
+ gt_sem_seg,
123
+ self.backbone.size_divisibility,
124
+ self.sem_seg_head.ignore_value,
125
+ self.backbone.padding_constraints,
126
+ ).tensor
127
+ sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg)
128
+
129
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
130
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
131
+ detector_results, detector_losses = self.roi_heads(
132
+ images, features, proposals, gt_instances
133
+ )
134
+
135
+ losses = sem_seg_losses
136
+ losses.update(proposal_losses)
137
+ losses.update(detector_losses)
138
+ return losses
139
+
140
+ def inference(self, batched_inputs: List[Dict[str, torch.Tensor]], do_postprocess: bool = True):
141
+ """
142
+ Run inference on the given inputs.
143
+
144
+ Args:
145
+ batched_inputs (list[dict]): same as in :meth:`forward`
146
+ do_postprocess (bool): whether to apply post-processing on the outputs.
147
+
148
+ Returns:
149
+ When do_postprocess=True, see docs in :meth:`forward`.
150
+ Otherwise, returns a (list[Instances], list[Tensor]) that contains
151
+ the raw detector outputs, and raw semantic segmentation outputs.
152
+ """
153
+ images = self.preprocess_image(batched_inputs)
154
+ features = self.backbone(images.tensor)
155
+ sem_seg_results, sem_seg_losses = self.sem_seg_head(features, None)
156
+ proposals, _ = self.proposal_generator(images, features, None)
157
+ detector_results, _ = self.roi_heads(images, features, proposals, None)
158
+
159
+ if do_postprocess:
160
+ processed_results = []
161
+ for sem_seg_result, detector_result, input_per_image, image_size in zip(
162
+ sem_seg_results, detector_results, batched_inputs, images.image_sizes
163
+ ):
164
+ height = input_per_image.get("height", image_size[0])
165
+ width = input_per_image.get("width", image_size[1])
166
+ sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width)
167
+ detector_r = detector_postprocess(detector_result, height, width)
168
+
169
+ processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r})
170
+
171
+ panoptic_r = combine_semantic_and_instance_outputs(
172
+ detector_r,
173
+ sem_seg_r.argmax(dim=0),
174
+ self.combine_overlap_thresh,
175
+ self.combine_stuff_area_thresh,
176
+ self.combine_instances_score_thresh,
177
+ )
178
+ processed_results[-1]["panoptic_seg"] = panoptic_r
179
+ return processed_results
180
+ else:
181
+ return detector_results, sem_seg_results
182
+
183
+
184
+ def combine_semantic_and_instance_outputs(
185
+ instance_results,
186
+ semantic_results,
187
+ overlap_threshold,
188
+ stuff_area_thresh,
189
+ instances_score_thresh,
190
+ ):
191
+ """
192
+ Implement a simple combining logic following
193
+ "combine_semantic_and_instance_predictions.py" in panopticapi
194
+ to produce panoptic segmentation outputs.
195
+
196
+ Args:
197
+ instance_results: output of :func:`detector_postprocess`.
198
+ semantic_results: an (H, W) tensor, each element is the contiguous semantic
199
+ category id
200
+
201
+ Returns:
202
+ panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
203
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`.
204
+ Each dict contains keys "id", "category_id", "isthing".
205
+ """
206
+ panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32)
207
+
208
+ # sort instance outputs by scores
209
+ sorted_inds = torch.argsort(-instance_results.scores)
210
+
211
+ current_segment_id = 0
212
+ segments_info = []
213
+
214
+ instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device)
215
+
216
+ # Add instances one-by-one, check for overlaps with existing ones
217
+ for inst_id in sorted_inds:
218
+ score = instance_results.scores[inst_id].item()
219
+ if score < instances_score_thresh:
220
+ break
221
+ mask = instance_masks[inst_id] # H,W
222
+ mask_area = mask.sum().item()
223
+
224
+ if mask_area == 0:
225
+ continue
226
+
227
+ intersect = (mask > 0) & (panoptic_seg > 0)
228
+ intersect_area = intersect.sum().item()
229
+
230
+ if intersect_area * 1.0 / mask_area > overlap_threshold:
231
+ continue
232
+
233
+ if intersect_area > 0:
234
+ mask = mask & (panoptic_seg == 0)
235
+
236
+ current_segment_id += 1
237
+ panoptic_seg[mask] = current_segment_id
238
+ segments_info.append(
239
+ {
240
+ "id": current_segment_id,
241
+ "isthing": True,
242
+ "score": score,
243
+ "category_id": instance_results.pred_classes[inst_id].item(),
244
+ "instance_id": inst_id.item(),
245
+ }
246
+ )
247
+
248
+ # Add semantic results to remaining empty areas
249
+ semantic_labels = torch.unique(semantic_results).cpu().tolist()
250
+ for semantic_label in semantic_labels:
251
+ if semantic_label == 0: # 0 is a special "thing" class
252
+ continue
253
+ mask = (semantic_results == semantic_label) & (panoptic_seg == 0)
254
+ mask_area = mask.sum().item()
255
+ if mask_area < stuff_area_thresh:
256
+ continue
257
+
258
+ current_segment_id += 1
259
+ panoptic_seg[mask] = current_segment_id
260
+ segments_info.append(
261
+ {
262
+ "id": current_segment_id,
263
+ "isthing": False,
264
+ "category_id": semantic_label,
265
+ "area": mask_area,
266
+ }
267
+ )
268
+
269
+ return panoptic_seg, segments_info
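A minimal sketch of how the combining step above can be exercised on its own, assuming `combine_semantic_and_instance_outputs` is imported from this panoptic_fpn module and `Instances` from `annotator.oneformer.detectron2.structures`; the tensors and threshold values below are illustrative, not config defaults:

    import torch
    from annotator.oneformer.detectron2.structures import Instances

    # Two made-up instances on a 4x4 image; masks, scores and classes are arbitrary.
    inst = Instances((4, 4))
    inst.scores = torch.tensor([0.9, 0.8])
    inst.pred_classes = torch.tensor([0, 1])
    inst.pred_masks = torch.zeros(2, 4, 4, dtype=torch.bool)
    inst.pred_masks[0, :2, :2] = True
    inst.pred_masks[1, 2:, 2:] = True
    sem_seg = torch.ones(4, 4, dtype=torch.long)  # every pixel labeled as stuff class 1

    panoptic_seg, segments_info = combine_semantic_and_instance_outputs(
        inst,
        sem_seg,
        overlap_threshold=0.5,
        stuff_area_thresh=1,
        instances_score_thresh=0.5,
    )
    print(panoptic_seg.shape, len(segments_info))  # torch.Size([4, 4]) 3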
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/rcnn.py ADDED
@@ -0,0 +1,341 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import numpy as np
4
+ from typing import Dict, List, Optional, Tuple
5
+ import torch
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.data.detection_utils import convert_image_to_rgb
10
+ from annotator.oneformer.detectron2.layers import move_device_like
11
+ from annotator.oneformer.detectron2.structures import ImageList, Instances
12
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
13
+ from annotator.oneformer.detectron2.utils.logger import log_first_n
14
+
15
+ from ..backbone import Backbone, build_backbone
16
+ from ..postprocessing import detector_postprocess
17
+ from ..proposal_generator import build_proposal_generator
18
+ from ..roi_heads import build_roi_heads
19
+ from .build import META_ARCH_REGISTRY
20
+
21
+ __all__ = ["GeneralizedRCNN", "ProposalNetwork"]
22
+
23
+
24
+ @META_ARCH_REGISTRY.register()
25
+ class GeneralizedRCNN(nn.Module):
26
+ """
27
+ Generalized R-CNN. Any model that contains the following three components:
28
+ 1. Per-image feature extraction (aka backbone)
29
+ 2. Region proposal generation
30
+ 3. Per-region feature extraction and prediction
31
+ """
32
+
33
+ @configurable
34
+ def __init__(
35
+ self,
36
+ *,
37
+ backbone: Backbone,
38
+ proposal_generator: nn.Module,
39
+ roi_heads: nn.Module,
40
+ pixel_mean: Tuple[float],
41
+ pixel_std: Tuple[float],
42
+ input_format: Optional[str] = None,
43
+ vis_period: int = 0,
44
+ ):
45
+ """
46
+ Args:
47
+ backbone: a backbone module, must follow detectron2's backbone interface
48
+ proposal_generator: a module that generates proposals using backbone features
49
+ roi_heads: a ROI head that performs per-region computation
50
+ pixel_mean, pixel_std: list or tuple with #channels elements, representing
51
+ the per-channel mean and std to be used to normalize the input image
52
+ input_format: describe the meaning of channels of input. Needed by visualization
53
+ vis_period: the period to run visualization. Set to 0 to disable.
54
+ """
55
+ super().__init__()
56
+ self.backbone = backbone
57
+ self.proposal_generator = proposal_generator
58
+ self.roi_heads = roi_heads
59
+
60
+ self.input_format = input_format
61
+ self.vis_period = vis_period
62
+ if vis_period > 0:
63
+ assert input_format is not None, "input_format is required for visualization!"
64
+
65
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
66
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
67
+ assert (
68
+ self.pixel_mean.shape == self.pixel_std.shape
69
+ ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
70
+
71
+ @classmethod
72
+ def from_config(cls, cfg):
73
+ backbone = build_backbone(cfg)
74
+ return {
75
+ "backbone": backbone,
76
+ "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
77
+ "roi_heads": build_roi_heads(cfg, backbone.output_shape()),
78
+ "input_format": cfg.INPUT.FORMAT,
79
+ "vis_period": cfg.VIS_PERIOD,
80
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
81
+ "pixel_std": cfg.MODEL.PIXEL_STD,
82
+ }
83
+
84
+ @property
85
+ def device(self):
86
+ return self.pixel_mean.device
87
+
88
+ def _move_to_current_device(self, x):
89
+ return move_device_like(x, self.pixel_mean)
90
+
91
+ def visualize_training(self, batched_inputs, proposals):
92
+ """
93
+ A function used to visualize images and proposals. It shows ground truth
94
+ bounding boxes on the original image and up to 20 top-scoring predicted
95
+ object proposals on the same image. Users can implement different
96
+ visualization functions for different models.
97
+
98
+ Args:
99
+ batched_inputs (list): a list that contains input to the model.
100
+ proposals (list): a list that contains predicted proposals. Both
101
+ batched_inputs and proposals should have the same length.
102
+ """
103
+ from annotator.oneformer.detectron2.utils.visualizer import Visualizer
104
+
105
+ storage = get_event_storage()
106
+ max_vis_prop = 20
107
+
108
+ for input, prop in zip(batched_inputs, proposals):
109
+ img = input["image"]
110
+ img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
111
+ v_gt = Visualizer(img, None)
112
+ v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
113
+ anno_img = v_gt.get_image()
114
+ box_size = min(len(prop.proposal_boxes), max_vis_prop)
115
+ v_pred = Visualizer(img, None)
116
+ v_pred = v_pred.overlay_instances(
117
+ boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
118
+ )
119
+ prop_img = v_pred.get_image()
120
+ vis_img = np.concatenate((anno_img, prop_img), axis=1)
121
+ vis_img = vis_img.transpose(2, 0, 1)
122
+ vis_name = "Left: GT bounding boxes; Right: Predicted proposals"
123
+ storage.put_image(vis_name, vis_img)
124
+ break # only visualize one image in a batch
125
+
126
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
127
+ """
128
+ Args:
129
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
130
+ Each item in the list contains the inputs for one image.
131
+ For now, each item in the list is a dict that contains:
132
+
133
+ * image: Tensor, image in (C, H, W) format.
134
+ * instances (optional): groundtruth :class:`Instances`
135
+ * proposals (optional): :class:`Instances`, precomputed proposals.
136
+
137
+ Other information that's included in the original dicts, such as:
138
+
139
+ * "height", "width" (int): the output resolution of the model, used in inference.
140
+ See :meth:`postprocess` for details.
141
+
142
+ Returns:
143
+ list[dict]:
144
+ Each dict is the output for one input image.
145
+ The dict contains one key "instances" whose value is a :class:`Instances`.
146
+ The :class:`Instances` object has the following keys:
147
+ "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
148
+ """
149
+ if not self.training:
150
+ return self.inference(batched_inputs)
151
+
152
+ images = self.preprocess_image(batched_inputs)
153
+ if "instances" in batched_inputs[0]:
154
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
155
+ else:
156
+ gt_instances = None
157
+
158
+ features = self.backbone(images.tensor)
159
+
160
+ if self.proposal_generator is not None:
161
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
162
+ else:
163
+ assert "proposals" in batched_inputs[0]
164
+ proposals = [x["proposals"].to(self.device) for x in batched_inputs]
165
+ proposal_losses = {}
166
+
167
+ _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
168
+ if self.vis_period > 0:
169
+ storage = get_event_storage()
170
+ if storage.iter % self.vis_period == 0:
171
+ self.visualize_training(batched_inputs, proposals)
172
+
173
+ losses = {}
174
+ losses.update(detector_losses)
175
+ losses.update(proposal_losses)
176
+ return losses
177
+
178
+ def inference(
179
+ self,
180
+ batched_inputs: List[Dict[str, torch.Tensor]],
181
+ detected_instances: Optional[List[Instances]] = None,
182
+ do_postprocess: bool = True,
183
+ ):
184
+ """
185
+ Run inference on the given inputs.
186
+
187
+ Args:
188
+ batched_inputs (list[dict]): same as in :meth:`forward`
189
+ detected_instances (None or list[Instances]): if not None, it
190
+ contains an `Instances` object per image. The `Instances`
191
+ object contains "pred_boxes" and "pred_classes" which are
192
+ known boxes in the image.
193
+ The inference will then skip the detection of bounding boxes,
194
+ and only predict other per-ROI outputs.
195
+ do_postprocess (bool): whether to apply post-processing on the outputs.
196
+
197
+ Returns:
198
+ When do_postprocess=True, same as in :meth:`forward`.
199
+ Otherwise, a list[Instances] containing raw network outputs.
200
+ """
201
+ assert not self.training
202
+
203
+ images = self.preprocess_image(batched_inputs)
204
+ features = self.backbone(images.tensor)
205
+
206
+ if detected_instances is None:
207
+ if self.proposal_generator is not None:
208
+ proposals, _ = self.proposal_generator(images, features, None)
209
+ else:
210
+ assert "proposals" in batched_inputs[0]
211
+ proposals = [x["proposals"].to(self.device) for x in batched_inputs]
212
+
213
+ results, _ = self.roi_heads(images, features, proposals, None)
214
+ else:
215
+ detected_instances = [x.to(self.device) for x in detected_instances]
216
+ results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
217
+
218
+ if do_postprocess:
219
+ assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
220
+ return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
221
+ return results
222
+
223
+ def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]]):
224
+ """
225
+ Normalize, pad and batch the input images.
226
+ """
227
+ images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
228
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
229
+ images = ImageList.from_tensors(
230
+ images,
231
+ self.backbone.size_divisibility,
232
+ padding_constraints=self.backbone.padding_constraints,
233
+ )
234
+ return images
235
+
236
+ @staticmethod
237
+ def _postprocess(instances, batched_inputs: List[Dict[str, torch.Tensor]], image_sizes):
238
+ """
239
+ Rescale the output instances to the target size.
240
+ """
241
+ # note: private function; subject to changes
242
+ processed_results = []
243
+ for results_per_image, input_per_image, image_size in zip(
244
+ instances, batched_inputs, image_sizes
245
+ ):
246
+ height = input_per_image.get("height", image_size[0])
247
+ width = input_per_image.get("width", image_size[1])
248
+ r = detector_postprocess(results_per_image, height, width)
249
+ processed_results.append({"instances": r})
250
+ return processed_results
251
+
252
+
253
+ @META_ARCH_REGISTRY.register()
254
+ class ProposalNetwork(nn.Module):
255
+ """
256
+ A meta architecture that only predicts object proposals.
257
+ """
258
+
259
+ @configurable
260
+ def __init__(
261
+ self,
262
+ *,
263
+ backbone: Backbone,
264
+ proposal_generator: nn.Module,
265
+ pixel_mean: Tuple[float],
266
+ pixel_std: Tuple[float],
267
+ ):
268
+ """
269
+ Args:
270
+ backbone: a backbone module, must follow detectron2's backbone interface
271
+ proposal_generator: a module that generates proposals using backbone features
272
+ pixel_mean, pixel_std: list or tuple with #channels elements, representing
273
+ the per-channel mean and std to be used to normalize the input image
274
+ """
275
+ super().__init__()
276
+ self.backbone = backbone
277
+ self.proposal_generator = proposal_generator
278
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
279
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
280
+
281
+ @classmethod
282
+ def from_config(cls, cfg):
283
+ backbone = build_backbone(cfg)
284
+ return {
285
+ "backbone": backbone,
286
+ "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
287
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
288
+ "pixel_std": cfg.MODEL.PIXEL_STD,
289
+ }
290
+
291
+ @property
292
+ def device(self):
293
+ return self.pixel_mean.device
294
+
295
+ def _move_to_current_device(self, x):
296
+ return move_device_like(x, self.pixel_mean)
297
+
298
+ def forward(self, batched_inputs):
299
+ """
300
+ Args:
301
+ Same as in :class:`GeneralizedRCNN.forward`
302
+
303
+ Returns:
304
+ list[dict]:
305
+ Each dict is the output for one input image.
306
+ The dict contains one key "proposals" whose value is a
307
+ :class:`Instances` with keys "proposal_boxes" and "objectness_logits".
308
+ """
309
+ images = [self._move_to_current_device(x["image"]) for x in batched_inputs]
310
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
311
+ images = ImageList.from_tensors(
312
+ images,
313
+ self.backbone.size_divisibility,
314
+ padding_constraints=self.backbone.padding_constraints,
315
+ )
316
+ features = self.backbone(images.tensor)
317
+
318
+ if "instances" in batched_inputs[0]:
319
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
320
+ elif "targets" in batched_inputs[0]:
321
+ log_first_n(
322
+ logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10
323
+ )
324
+ gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
325
+ else:
326
+ gt_instances = None
327
+ proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
328
+ # In training, the proposals are not useful at all but we generate them anyway.
329
+ # This makes RPN-only models about 5% slower.
330
+ if self.training:
331
+ return proposal_losses
332
+
333
+ processed_results = []
334
+ for results_per_image, input_per_image, image_size in zip(
335
+ proposals, batched_inputs, images.image_sizes
336
+ ):
337
+ height = input_per_image.get("height", image_size[0])
338
+ width = input_per_image.get("width", image_size[1])
339
+ r = detector_postprocess(results_per_image, height, width)
340
+ processed_results.append({"proposals": r})
341
+ return processed_results
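The input contract described in `GeneralizedRCNN.forward` can be summarized with a short, hedged sketch; `model` here stands for a `GeneralizedRCNN` built elsewhere (for example with `build_model(cfg)`) and switched to eval mode, which is assumed rather than shown:

    import torch

    # One image in (C, H, W) layout; "height"/"width" request the output resolution.
    image = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)
    batched_inputs = [{"image": image, "height": 480, "width": 640}]

    with torch.no_grad():
        outputs = model(batched_inputs)  # in eval mode this dispatches to model.inference(...)
    print(outputs[0]["instances"].pred_boxes)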
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/retinanet.py ADDED
@@ -0,0 +1,439 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import math
4
+ from typing import List, Tuple
5
+ import torch
6
+ from fvcore.nn import sigmoid_focal_loss_jit
7
+ from torch import Tensor, nn
8
+ from torch.nn import functional as F
9
+
10
+ from annotator.oneformer.detectron2.config import configurable
11
+ from annotator.oneformer.detectron2.layers import CycleBatchNormList, ShapeSpec, batched_nms, cat, get_norm
12
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
13
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
14
+
15
+ from ..anchor_generator import build_anchor_generator
16
+ from ..backbone import Backbone, build_backbone
17
+ from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
18
+ from ..matcher import Matcher
19
+ from .build import META_ARCH_REGISTRY
20
+ from .dense_detector import DenseDetector, permute_to_N_HWA_K # noqa
21
+
22
+ __all__ = ["RetinaNet"]
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ @META_ARCH_REGISTRY.register()
29
+ class RetinaNet(DenseDetector):
30
+ """
31
+ Implement RetinaNet in :paper:`RetinaNet`.
32
+ """
33
+
34
+ @configurable
35
+ def __init__(
36
+ self,
37
+ *,
38
+ backbone: Backbone,
39
+ head: nn.Module,
40
+ head_in_features,
41
+ anchor_generator,
42
+ box2box_transform,
43
+ anchor_matcher,
44
+ num_classes,
45
+ focal_loss_alpha=0.25,
46
+ focal_loss_gamma=2.0,
47
+ smooth_l1_beta=0.0,
48
+ box_reg_loss_type="smooth_l1",
49
+ test_score_thresh=0.05,
50
+ test_topk_candidates=1000,
51
+ test_nms_thresh=0.5,
52
+ max_detections_per_image=100,
53
+ pixel_mean,
54
+ pixel_std,
55
+ vis_period=0,
56
+ input_format="BGR",
57
+ ):
58
+ """
59
+ NOTE: this interface is experimental.
60
+
61
+ Args:
62
+ backbone: a backbone module, must follow detectron2's backbone interface
63
+ head (nn.Module): a module that predicts logits and regression deltas
64
+ for each level from a list of per-level features
65
+ head_in_features (Tuple[str]): Names of the input feature maps to be used in head
66
+ anchor_generator (nn.Module): a module that creates anchors from a
67
+ list of features. Usually an instance of :class:`AnchorGenerator`
68
+ box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to
69
+ instance boxes
70
+ anchor_matcher (Matcher): label the anchors by matching them with ground truth.
71
+ num_classes (int): number of classes. Used to label background proposals.
72
+
73
+ # Loss parameters:
74
+ focal_loss_alpha (float): focal_loss_alpha
75
+ focal_loss_gamma (float): focal_loss_gamma
76
+ smooth_l1_beta (float): smooth_l1_beta
77
+ box_reg_loss_type (str): Options are "smooth_l1", "giou", "diou", "ciou"
78
+
79
+ # Inference parameters:
80
+ test_score_thresh (float): Inference cls score threshold, only anchors with
81
+ score > INFERENCE_TH are considered for inference (to improve speed)
82
+ test_topk_candidates (int): Select topk candidates before NMS
83
+ test_nms_thresh (float): Overlap threshold used for non-maximum suppression
84
+ (suppress boxes with IoU >= this threshold)
85
+ max_detections_per_image (int):
86
+ Maximum number of detections to return per image during inference
87
+ (100 is based on the limit established for the COCO dataset).
88
+
89
+ pixel_mean, pixel_std: see :class:`DenseDetector`.
90
+ """
91
+ super().__init__(
92
+ backbone, head, head_in_features, pixel_mean=pixel_mean, pixel_std=pixel_std
93
+ )
94
+ self.num_classes = num_classes
95
+
96
+ # Anchors
97
+ self.anchor_generator = anchor_generator
98
+ self.box2box_transform = box2box_transform
99
+ self.anchor_matcher = anchor_matcher
100
+
101
+ # Loss parameters:
102
+ self.focal_loss_alpha = focal_loss_alpha
103
+ self.focal_loss_gamma = focal_loss_gamma
104
+ self.smooth_l1_beta = smooth_l1_beta
105
+ self.box_reg_loss_type = box_reg_loss_type
106
+ # Inference parameters:
107
+ self.test_score_thresh = test_score_thresh
108
+ self.test_topk_candidates = test_topk_candidates
109
+ self.test_nms_thresh = test_nms_thresh
110
+ self.max_detections_per_image = max_detections_per_image
111
+ # Vis parameters
112
+ self.vis_period = vis_period
113
+ self.input_format = input_format
114
+
115
+ @classmethod
116
+ def from_config(cls, cfg):
117
+ backbone = build_backbone(cfg)
118
+ backbone_shape = backbone.output_shape()
119
+ feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES]
120
+ head = RetinaNetHead(cfg, feature_shapes)
121
+ anchor_generator = build_anchor_generator(cfg, feature_shapes)
122
+ return {
123
+ "backbone": backbone,
124
+ "head": head,
125
+ "anchor_generator": anchor_generator,
126
+ "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS),
127
+ "anchor_matcher": Matcher(
128
+ cfg.MODEL.RETINANET.IOU_THRESHOLDS,
129
+ cfg.MODEL.RETINANET.IOU_LABELS,
130
+ allow_low_quality_matches=True,
131
+ ),
132
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
133
+ "pixel_std": cfg.MODEL.PIXEL_STD,
134
+ "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
135
+ "head_in_features": cfg.MODEL.RETINANET.IN_FEATURES,
136
+ # Loss parameters:
137
+ "focal_loss_alpha": cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA,
138
+ "focal_loss_gamma": cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA,
139
+ "smooth_l1_beta": cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA,
140
+ "box_reg_loss_type": cfg.MODEL.RETINANET.BBOX_REG_LOSS_TYPE,
141
+ # Inference parameters:
142
+ "test_score_thresh": cfg.MODEL.RETINANET.SCORE_THRESH_TEST,
143
+ "test_topk_candidates": cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST,
144
+ "test_nms_thresh": cfg.MODEL.RETINANET.NMS_THRESH_TEST,
145
+ "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
146
+ # Vis parameters
147
+ "vis_period": cfg.VIS_PERIOD,
148
+ "input_format": cfg.INPUT.FORMAT,
149
+ }
150
+
151
+ def forward_training(self, images, features, predictions, gt_instances):
152
+ # Transpose the Hi*Wi*A dimension to the middle:
153
+ pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
154
+ predictions, [self.num_classes, 4]
155
+ )
156
+ anchors = self.anchor_generator(features)
157
+ gt_labels, gt_boxes = self.label_anchors(anchors, gt_instances)
158
+ return self.losses(anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes)
159
+
160
+ def losses(self, anchors, pred_logits, gt_labels, pred_anchor_deltas, gt_boxes):
161
+ """
162
+ Args:
163
+ anchors (list[Boxes]): a list of #feature level Boxes
164
+ gt_labels, gt_boxes: see output of :meth:`RetinaNet.label_anchors`.
165
+ Their shapes are (N, R) and (N, R, 4), respectively, where R is
166
+ the total number of anchors across levels, i.e. sum(Hi x Wi x Ai)
167
+ pred_logits, pred_anchor_deltas: both are list[Tensor]. Each element in the
168
+ list corresponds to one level and has shape (N, Hi * Wi * Ai, K or 4).
169
+ Where K is the number of classes used in `pred_logits`.
170
+
171
+ Returns:
172
+ dict[str, Tensor]:
173
+ mapping from a named loss to a scalar tensor storing the loss.
174
+ Used during training only. The dict keys are: "loss_cls" and "loss_box_reg"
175
+ """
176
+ num_images = len(gt_labels)
177
+ gt_labels = torch.stack(gt_labels) # (N, R)
178
+
179
+ valid_mask = gt_labels >= 0
180
+ pos_mask = (gt_labels >= 0) & (gt_labels != self.num_classes)
181
+ num_pos_anchors = pos_mask.sum().item()
182
+ get_event_storage().put_scalar("num_pos_anchors", num_pos_anchors / num_images)
183
+ normalizer = self._ema_update("loss_normalizer", max(num_pos_anchors, 1), 100)
184
+
185
+ # classification and regression loss
186
+ gt_labels_target = F.one_hot(gt_labels[valid_mask], num_classes=self.num_classes + 1)[
187
+ :, :-1
188
+ ] # no loss for the last (background) class
189
+ loss_cls = sigmoid_focal_loss_jit(
190
+ cat(pred_logits, dim=1)[valid_mask],
191
+ gt_labels_target.to(pred_logits[0].dtype),
192
+ alpha=self.focal_loss_alpha,
193
+ gamma=self.focal_loss_gamma,
194
+ reduction="sum",
195
+ )
196
+
197
+ loss_box_reg = _dense_box_regression_loss(
198
+ anchors,
199
+ self.box2box_transform,
200
+ pred_anchor_deltas,
201
+ gt_boxes,
202
+ pos_mask,
203
+ box_reg_loss_type=self.box_reg_loss_type,
204
+ smooth_l1_beta=self.smooth_l1_beta,
205
+ )
206
+
207
+ return {
208
+ "loss_cls": loss_cls / normalizer,
209
+ "loss_box_reg": loss_box_reg / normalizer,
210
+ }
211
+
212
+ @torch.no_grad()
213
+ def label_anchors(self, anchors, gt_instances):
214
+ """
215
+ Args:
216
+ anchors (list[Boxes]): A list of #feature level Boxes.
217
+ The Boxes contains anchors of this image on the specific feature level.
218
+ gt_instances (list[Instances]): a list of N `Instances`s. The i-th
219
+ `Instances` contains the ground-truth per-instance annotations
220
+ for the i-th input image.
221
+
222
+ Returns:
223
+ list[Tensor]: List of #img tensors. i-th element is a vector of labels whose length is
224
+ the total number of anchors across all feature maps (sum(Hi * Wi * A)).
225
+ Label values are in {-1, 0, ..., K}, where -1 means ignore and K means background.
226
+
227
+ list[Tensor]: i-th element is a Rx4 tensor, where R is the total number of anchors
228
+ across feature maps. The values are the matched gt boxes for each anchor.
229
+ Values are undefined for those anchors not labeled as foreground.
230
+ """
231
+ anchors = Boxes.cat(anchors) # Rx4
232
+
233
+ gt_labels = []
234
+ matched_gt_boxes = []
235
+ for gt_per_image in gt_instances:
236
+ match_quality_matrix = pairwise_iou(gt_per_image.gt_boxes, anchors)
237
+ matched_idxs, anchor_labels = self.anchor_matcher(match_quality_matrix)
238
+ del match_quality_matrix
239
+
240
+ if len(gt_per_image) > 0:
241
+ matched_gt_boxes_i = gt_per_image.gt_boxes.tensor[matched_idxs]
242
+
243
+ gt_labels_i = gt_per_image.gt_classes[matched_idxs]
244
+ # Anchors with label 0 are treated as background.
245
+ gt_labels_i[anchor_labels == 0] = self.num_classes
246
+ # Anchors with label -1 are ignored.
247
+ gt_labels_i[anchor_labels == -1] = -1
248
+ else:
249
+ matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
250
+ gt_labels_i = torch.zeros_like(matched_idxs) + self.num_classes
251
+
252
+ gt_labels.append(gt_labels_i)
253
+ matched_gt_boxes.append(matched_gt_boxes_i)
254
+
255
+ return gt_labels, matched_gt_boxes
256
+
257
+ def forward_inference(
258
+ self, images: ImageList, features: List[Tensor], predictions: List[List[Tensor]]
259
+ ):
260
+ pred_logits, pred_anchor_deltas = self._transpose_dense_predictions(
261
+ predictions, [self.num_classes, 4]
262
+ )
263
+ anchors = self.anchor_generator(features)
264
+
265
+ results: List[Instances] = []
266
+ for img_idx, image_size in enumerate(images.image_sizes):
267
+ scores_per_image = [x[img_idx].sigmoid_() for x in pred_logits]
268
+ deltas_per_image = [x[img_idx] for x in pred_anchor_deltas]
269
+ results_per_image = self.inference_single_image(
270
+ anchors, scores_per_image, deltas_per_image, image_size
271
+ )
272
+ results.append(results_per_image)
273
+ return results
274
+
275
+ def inference_single_image(
276
+ self,
277
+ anchors: List[Boxes],
278
+ box_cls: List[Tensor],
279
+ box_delta: List[Tensor],
280
+ image_size: Tuple[int, int],
281
+ ):
282
+ """
283
+ Single-image inference. Return bounding-box detection results by thresholding
284
+ on scores and applying non-maximum suppression (NMS).
285
+
286
+ Arguments:
287
+ anchors (list[Boxes]): list of #feature levels. Each entry contains
288
+ a Boxes object, which contains all the anchors in that feature level.
289
+ box_cls (list[Tensor]): list of #feature levels. Each entry contains
290
+ tensor of size (H x W x A, K)
291
+ box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4.
292
+ image_size (tuple(H, W)): a tuple of the image height and width.
293
+
294
+ Returns:
295
+ Same as `inference`, but for only one image.
296
+ """
297
+ pred = self._decode_multi_level_predictions(
298
+ anchors,
299
+ box_cls,
300
+ box_delta,
301
+ self.test_score_thresh,
302
+ self.test_topk_candidates,
303
+ image_size,
304
+ )
305
+ keep = batched_nms( # per-class NMS
306
+ pred.pred_boxes.tensor, pred.scores, pred.pred_classes, self.test_nms_thresh
307
+ )
308
+ return pred[keep[: self.max_detections_per_image]]
309
+
310
+
311
+ class RetinaNetHead(nn.Module):
312
+ """
313
+ The head used in RetinaNet for object classification and box regression.
314
+ It has two subnets for the two tasks, with a common structure but separate parameters.
315
+ """
316
+
317
+ @configurable
318
+ def __init__(
319
+ self,
320
+ *,
321
+ input_shape: List[ShapeSpec],
322
+ num_classes,
323
+ num_anchors,
324
+ conv_dims: List[int],
325
+ norm="",
326
+ prior_prob=0.01,
327
+ ):
328
+ """
329
+ NOTE: this interface is experimental.
330
+
331
+ Args:
332
+ input_shape (List[ShapeSpec]): input shape
333
+ num_classes (int): number of classes. Used to label background proposals.
334
+ num_anchors (int): number of generated anchors
335
+ conv_dims (List[int]): dimensions for each convolution layer
336
+ norm (str or callable):
337
+ Normalization for conv layers except for the two output layers.
338
+ See :func:`detectron2.layers.get_norm` for supported types.
339
+ prior_prob (float): Prior weight for computing bias
340
+ """
341
+ super().__init__()
342
+
343
+ self._num_features = len(input_shape)
344
+ if norm == "BN" or norm == "SyncBN":
345
+ logger.info(
346
+ f"Using domain-specific {norm} in RetinaNetHead with len={self._num_features}."
347
+ )
348
+ bn_class = nn.BatchNorm2d if norm == "BN" else nn.SyncBatchNorm
349
+
350
+ def norm(c):
351
+ return CycleBatchNormList(
352
+ length=self._num_features, bn_class=bn_class, num_features=c
353
+ )
354
+
355
+ else:
356
+ norm_name = str(type(get_norm(norm, 32)))
357
+ if "BN" in norm_name:
358
+ logger.warning(
359
+ f"Shared BatchNorm (type={norm_name}) may not work well in RetinaNetHead."
360
+ )
361
+
362
+ cls_subnet = []
363
+ bbox_subnet = []
364
+ for in_channels, out_channels in zip(
365
+ [input_shape[0].channels] + list(conv_dims), conv_dims
366
+ ):
367
+ cls_subnet.append(
368
+ nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
369
+ )
370
+ if norm:
371
+ cls_subnet.append(get_norm(norm, out_channels))
372
+ cls_subnet.append(nn.ReLU())
373
+ bbox_subnet.append(
374
+ nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
375
+ )
376
+ if norm:
377
+ bbox_subnet.append(get_norm(norm, out_channels))
378
+ bbox_subnet.append(nn.ReLU())
379
+
380
+ self.cls_subnet = nn.Sequential(*cls_subnet)
381
+ self.bbox_subnet = nn.Sequential(*bbox_subnet)
382
+ self.cls_score = nn.Conv2d(
383
+ conv_dims[-1], num_anchors * num_classes, kernel_size=3, stride=1, padding=1
384
+ )
385
+ self.bbox_pred = nn.Conv2d(
386
+ conv_dims[-1], num_anchors * 4, kernel_size=3, stride=1, padding=1
387
+ )
388
+
389
+ # Initialization
390
+ for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
391
+ for layer in modules.modules():
392
+ if isinstance(layer, nn.Conv2d):
393
+ torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
394
+ torch.nn.init.constant_(layer.bias, 0)
395
+
396
+ # Use prior in model initialization to improve stability
397
+ bias_value = -(math.log((1 - prior_prob) / prior_prob))
398
+ torch.nn.init.constant_(self.cls_score.bias, bias_value)
399
+
400
+ @classmethod
401
+ def from_config(cls, cfg, input_shape: List[ShapeSpec]):
402
+ num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
403
+ assert (
404
+ len(set(num_anchors)) == 1
405
+ ), "Using different number of anchors between levels is not currently supported!"
406
+ num_anchors = num_anchors[0]
407
+
408
+ return {
409
+ "input_shape": input_shape,
410
+ "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
411
+ "conv_dims": [input_shape[0].channels] * cfg.MODEL.RETINANET.NUM_CONVS,
412
+ "prior_prob": cfg.MODEL.RETINANET.PRIOR_PROB,
413
+ "norm": cfg.MODEL.RETINANET.NORM,
414
+ "num_anchors": num_anchors,
415
+ }
416
+
417
+ def forward(self, features: List[Tensor]):
418
+ """
419
+ Arguments:
420
+ features (list[Tensor]): FPN feature map tensors in high to low resolution.
421
+ Each tensor in the list correspond to different feature levels.
422
+
423
+ Returns:
424
+ logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi).
425
+ The tensor predicts the classification probability
426
+ at each spatial position for each of the A anchors and K object
427
+ classes.
428
+ bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi).
429
+ The tensor predicts 4-vector (dx,dy,dw,dh) box
430
+ regression values for every anchor. These values are the
431
+ relative offset between the anchor and the ground truth box.
432
+ """
433
+ assert len(features) == self._num_features
434
+ logits = []
435
+ bbox_reg = []
436
+ for feature in features:
437
+ logits.append(self.cls_score(self.cls_subnet(feature)))
438
+ bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature)))
439
+ return logits, bbox_reg
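A self-contained shape check for the head defined above, assuming `RetinaNetHead` is importable from this module; the three feature levels, channel count, anchor count and class count are illustrative values rather than config defaults:

    import torch
    from annotator.oneformer.detectron2.layers import ShapeSpec

    # Three FPN-like levels with 256 channels each, 9 anchors per location, 80 classes.
    shapes = [ShapeSpec(channels=256, stride=s) for s in (8, 16, 32)]
    head = RetinaNetHead(
        input_shape=shapes,
        num_classes=80,
        num_anchors=9,
        conv_dims=[256, 256, 256, 256],
    )
    feats = [torch.randn(2, 256, hw, hw) for hw in (64, 32, 16)]
    logits, bbox_reg = head(feats)
    print(logits[0].shape, bbox_reg[0].shape)  # (2, 9*80, 64, 64) and (2, 9*4, 64, 64)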
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/meta_arch/semantic_seg.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import numpy as np
3
+ from typing import Callable, Dict, Optional, Tuple, Union
4
+ import fvcore.nn.weight_init as weight_init
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from annotator.oneformer.detectron2.config import configurable
10
+ from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
11
+ from annotator.oneformer.detectron2.structures import ImageList
12
+ from annotator.oneformer.detectron2.utils.registry import Registry
13
+
14
+ from ..backbone import Backbone, build_backbone
15
+ from ..postprocessing import sem_seg_postprocess
16
+ from .build import META_ARCH_REGISTRY
17
+
18
+ __all__ = [
19
+ "SemanticSegmentor",
20
+ "SEM_SEG_HEADS_REGISTRY",
21
+ "SemSegFPNHead",
22
+ "build_sem_seg_head",
23
+ ]
24
+
25
+
26
+ SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS")
27
+ SEM_SEG_HEADS_REGISTRY.__doc__ = """
28
+ Registry for semantic segmentation heads, which make semantic segmentation predictions
29
+ from feature maps.
30
+ """
31
+
32
+
33
+ @META_ARCH_REGISTRY.register()
34
+ class SemanticSegmentor(nn.Module):
35
+ """
36
+ Main class for semantic segmentation architectures.
37
+ """
38
+
39
+ @configurable
40
+ def __init__(
41
+ self,
42
+ *,
43
+ backbone: Backbone,
44
+ sem_seg_head: nn.Module,
45
+ pixel_mean: Tuple[float],
46
+ pixel_std: Tuple[float],
47
+ ):
48
+ """
49
+ Args:
50
+ backbone: a backbone module, must follow detectron2's backbone interface
51
+ sem_seg_head: a module that predicts semantic segmentation from backbone features
52
+ pixel_mean, pixel_std: list or tuple with #channels elements, representing
53
+ the per-channel mean and std to be used to normalize the input image
54
+ """
55
+ super().__init__()
56
+ self.backbone = backbone
57
+ self.sem_seg_head = sem_seg_head
58
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
59
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
60
+
61
+ @classmethod
62
+ def from_config(cls, cfg):
63
+ backbone = build_backbone(cfg)
64
+ sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())
65
+ return {
66
+ "backbone": backbone,
67
+ "sem_seg_head": sem_seg_head,
68
+ "pixel_mean": cfg.MODEL.PIXEL_MEAN,
69
+ "pixel_std": cfg.MODEL.PIXEL_STD,
70
+ }
71
+
72
+ @property
73
+ def device(self):
74
+ return self.pixel_mean.device
75
+
76
+ def forward(self, batched_inputs):
77
+ """
78
+ Args:
79
+ batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
80
+ Each item in the list contains the inputs for one image.
81
+
82
+ For now, each item in the list is a dict that contains:
83
+
84
+ * "image": Tensor, image in (C, H, W) format.
85
+ * "sem_seg": semantic segmentation ground truth
86
+ * Other information that's included in the original dicts, such as:
87
+ "height", "width" (int): the output resolution of the model (may be different
88
+ from input resolution), used in inference.
89
+
90
+
91
+ Returns:
92
+ list[dict]:
93
+ Each dict is the output for one input image.
94
+ The dict contains one key "sem_seg" whose value is a
95
+ Tensor that represents the
96
+ per-pixel segmentation predicted by the head.
97
+ The prediction has shape KxHxW that represents the logits of
98
+ each class for each pixel.
99
+ """
100
+ images = [x["image"].to(self.device) for x in batched_inputs]
101
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
102
+ images = ImageList.from_tensors(
103
+ images,
104
+ self.backbone.size_divisibility,
105
+ padding_constraints=self.backbone.padding_constraints,
106
+ )
107
+
108
+ features = self.backbone(images.tensor)
109
+
110
+ if "sem_seg" in batched_inputs[0]:
111
+ targets = [x["sem_seg"].to(self.device) for x in batched_inputs]
112
+ targets = ImageList.from_tensors(
113
+ targets,
114
+ self.backbone.size_divisibility,
115
+ self.sem_seg_head.ignore_value,
116
+ self.backbone.padding_constraints,
117
+ ).tensor
118
+ else:
119
+ targets = None
120
+ results, losses = self.sem_seg_head(features, targets)
121
+
122
+ if self.training:
123
+ return losses
124
+
125
+ processed_results = []
126
+ for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes):
127
+ height = input_per_image.get("height", image_size[0])
128
+ width = input_per_image.get("width", image_size[1])
129
+ r = sem_seg_postprocess(result, image_size, height, width)
130
+ processed_results.append({"sem_seg": r})
131
+ return processed_results
132
+
133
+
134
+ def build_sem_seg_head(cfg, input_shape):
135
+ """
136
+ Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`.
137
+ """
138
+ name = cfg.MODEL.SEM_SEG_HEAD.NAME
139
+ return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape)
140
+
141
+
142
+ @SEM_SEG_HEADS_REGISTRY.register()
143
+ class SemSegFPNHead(nn.Module):
144
+ """
145
+ A semantic segmentation head described in :paper:`PanopticFPN`.
146
+ It takes a list of FPN features as input, and applies a sequence of
147
+ 3x3 convs and upsampling to scale all of them to the stride defined by
148
+ ``common_stride``. Then these features are added and used to make final
149
+ predictions by another 1x1 conv layer.
150
+ """
151
+
152
+ @configurable
153
+ def __init__(
154
+ self,
155
+ input_shape: Dict[str, ShapeSpec],
156
+ *,
157
+ num_classes: int,
158
+ conv_dims: int,
159
+ common_stride: int,
160
+ loss_weight: float = 1.0,
161
+ norm: Optional[Union[str, Callable]] = None,
162
+ ignore_value: int = -1,
163
+ ):
164
+ """
165
+ NOTE: this interface is experimental.
166
+
167
+ Args:
168
+ input_shape: shapes (channels and stride) of the input features
169
+ num_classes: number of classes to predict
170
+ conv_dims: number of output channels for the intermediate conv layers.
171
+ common_stride: the common stride that all features will be upscaled to
172
+ loss_weight: loss weight
173
+ norm (str or callable): normalization for all conv layers
174
+ ignore_value: category id to be ignored during training.
175
+ """
176
+ super().__init__()
177
+ input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
178
+ if not len(input_shape):
179
+ raise ValueError("SemSegFPNHead(input_shape=) cannot be empty!")
180
+ self.in_features = [k for k, v in input_shape]
181
+ feature_strides = [v.stride for k, v in input_shape]
182
+ feature_channels = [v.channels for k, v in input_shape]
183
+
184
+ self.ignore_value = ignore_value
185
+ self.common_stride = common_stride
186
+ self.loss_weight = loss_weight
187
+
188
+ self.scale_heads = []
189
+ for in_feature, stride, channels in zip(
190
+ self.in_features, feature_strides, feature_channels
191
+ ):
192
+ head_ops = []
193
+ head_length = max(1, int(np.log2(stride) - np.log2(self.common_stride)))
194
+ for k in range(head_length):
195
+ norm_module = get_norm(norm, conv_dims)
196
+ conv = Conv2d(
197
+ channels if k == 0 else conv_dims,
198
+ conv_dims,
199
+ kernel_size=3,
200
+ stride=1,
201
+ padding=1,
202
+ bias=not norm,
203
+ norm=norm_module,
204
+ activation=F.relu,
205
+ )
206
+ weight_init.c2_msra_fill(conv)
207
+ head_ops.append(conv)
208
+ if stride != self.common_stride:
209
+ head_ops.append(
210
+ nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
211
+ )
212
+ self.scale_heads.append(nn.Sequential(*head_ops))
213
+ self.add_module(in_feature, self.scale_heads[-1])
214
+ self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
215
+ weight_init.c2_msra_fill(self.predictor)
216
+
217
+ @classmethod
218
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
219
+ return {
220
+ "input_shape": {
221
+ k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
222
+ },
223
+ "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
224
+ "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES,
225
+ "conv_dims": cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM,
226
+ "common_stride": cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE,
227
+ "norm": cfg.MODEL.SEM_SEG_HEAD.NORM,
228
+ "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT,
229
+ }
230
+
231
+ def forward(self, features, targets=None):
232
+ """
233
+ Returns:
234
+ In training, returns (None, dict of losses)
235
+ In inference, returns (CxHxW logits, {})
236
+ """
237
+ x = self.layers(features)
238
+ if self.training:
239
+ return None, self.losses(x, targets)
240
+ else:
241
+ x = F.interpolate(
242
+ x, scale_factor=self.common_stride, mode="bilinear", align_corners=False
243
+ )
244
+ return x, {}
245
+
246
+ def layers(self, features):
247
+ for i, f in enumerate(self.in_features):
248
+ if i == 0:
249
+ x = self.scale_heads[i](features[f])
250
+ else:
251
+ x = x + self.scale_heads[i](features[f])
252
+ x = self.predictor(x)
253
+ return x
254
+
255
+ def losses(self, predictions, targets):
256
+ predictions = predictions.float() # https://github.com/pytorch/pytorch/issues/48163
257
+ predictions = F.interpolate(
258
+ predictions,
259
+ scale_factor=self.common_stride,
260
+ mode="bilinear",
261
+ align_corners=False,
262
+ )
263
+ loss = F.cross_entropy(
264
+ predictions, targets, reduction="mean", ignore_index=self.ignore_value
265
+ )
266
+ losses = {"loss_sem_seg": loss * self.loss_weight}
267
+ return losses
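A small stand-alone check of `SemSegFPNHead`, assuming it is importable from this module; the feature pyramid, class count and conv width below are illustrative choices:

    import torch
    from annotator.oneformer.detectron2.layers import ShapeSpec

    # p2-p5 style features for a 256x256 input, 54 stuff classes, 128-wide convs;
    # predictions are made at 1/4 resolution and upsampled back to the input size.
    shapes = {f"p{i}": ShapeSpec(channels=256, stride=2 ** i) for i in range(2, 6)}
    head = SemSegFPNHead(shapes, num_classes=54, conv_dims=128, common_stride=4)
    head.eval()  # in training mode the head expects targets and returns losses

    feats = {f"p{i}": torch.randn(1, 256, 256 // 2 ** i, 256 // 2 ** i) for i in range(2, 6)}
    logits, _ = head(feats)
    print(logits.shape)  # torch.Size([1, 54, 256, 256])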
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/mmdet_wrapper.py ADDED
@@ -0,0 +1,273 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import itertools
3
+ import logging
4
+ import numpy as np
5
+ from collections import OrderedDict
6
+ from collections.abc import Mapping
7
+ from typing import Dict, List, Optional, Tuple, Union
8
+ import torch
9
+ from omegaconf import DictConfig, OmegaConf
10
+ from torch import Tensor, nn
11
+
12
+ from annotator.oneformer.detectron2.layers import ShapeSpec
13
+ from annotator.oneformer.detectron2.structures import BitMasks, Boxes, ImageList, Instances
14
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
15
+
16
+ from .backbone import Backbone
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def _to_container(cfg):
22
+ """
23
+ mmdet will assert the type of dict/list.
24
+ So convert omegaconf objects to dict/list.
25
+ """
26
+ if isinstance(cfg, DictConfig):
27
+ cfg = OmegaConf.to_container(cfg, resolve=True)
28
+ from mmcv.utils import ConfigDict
29
+
30
+ return ConfigDict(cfg)
31
+
32
+
33
+ class MMDetBackbone(Backbone):
34
+ """
35
+ Wrapper of mmdetection backbones to use in detectron2.
36
+
37
+ mmdet backbones produce list/tuple of tensors, while detectron2 backbones
38
+ produce a dict of tensors. This class wraps the given backbone to produce
39
+ output in detectron2's convention, so it can be used in place of detectron2
40
+ backbones.
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ backbone: Union[nn.Module, Mapping],
46
+ neck: Union[nn.Module, Mapping, None] = None,
47
+ *,
48
+ output_shapes: List[ShapeSpec],
49
+ output_names: Optional[List[str]] = None,
50
+ ):
51
+ """
52
+ Args:
53
+ backbone: either a backbone module or a mmdet config dict that defines a
54
+ backbone. The backbone takes a 4D image tensor and returns a
55
+ sequence of tensors.
56
+ neck: either a neck module or a mmdet config dict that defines a
57
+ neck. The neck takes outputs of backbone and returns a
58
+ sequence of tensors. If None, no neck is used.
59
+ output_shapes: shape for every output of the backbone (or neck, if given).
60
+ stride and channels are often needed.
61
+ output_names: names for every output of the backbone (or neck, if given).
62
+ By default, will use "out0", "out1", ...
63
+ """
64
+ super().__init__()
65
+ if isinstance(backbone, Mapping):
66
+ from mmdet.models import build_backbone
67
+
68
+ backbone = build_backbone(_to_container(backbone))
69
+ self.backbone = backbone
70
+
71
+ if isinstance(neck, Mapping):
72
+ from mmdet.models import build_neck
73
+
74
+ neck = build_neck(_to_container(neck))
75
+ self.neck = neck
76
+
77
+ # "Neck" weights, if any, are part of neck itself. This is the interface
78
+ # of mmdet so we follow it. Reference:
79
+ # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py
80
+ logger.info("Initializing mmdet backbone weights...")
81
+ self.backbone.init_weights()
82
+ # train() in mmdet modules is non-trivial, and has to be explicitly
83
+ # called. Reference:
84
+ # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/backbones/resnet.py
85
+ self.backbone.train()
86
+ if self.neck is not None:
87
+ logger.info("Initializing mmdet neck weights ...")
88
+ if isinstance(self.neck, nn.Sequential):
89
+ for m in self.neck:
90
+ m.init_weights()
91
+ else:
92
+ self.neck.init_weights()
93
+ self.neck.train()
94
+
95
+ self._output_shapes = output_shapes
96
+ if not output_names:
97
+ output_names = [f"out{i}" for i in range(len(output_shapes))]
98
+ self._output_names = output_names
99
+
100
+ def forward(self, x) -> Dict[str, Tensor]:
101
+ outs = self.backbone(x)
102
+ if self.neck is not None:
103
+ outs = self.neck(outs)
104
+ assert isinstance(
105
+ outs, (list, tuple)
106
+ ), "mmdet backbone should return a list/tuple of tensors!"
107
+ if len(outs) != len(self._output_shapes):
108
+ raise ValueError(
109
+ "Length of output_shapes does not match outputs from the mmdet backbone: "
110
+ f"{len(outs)} != {len(self._output_shapes)}"
111
+ )
112
+ return {k: v for k, v in zip(self._output_names, outs)}
113
+
114
+ def output_shape(self) -> Dict[str, ShapeSpec]:
115
+ return {k: v for k, v in zip(self._output_names, self._output_shapes)}
116
+
117
+
118
+ class MMDetDetector(nn.Module):
119
+ """
120
+ Wrapper of a mmdetection detector model, for detection and instance segmentation.
121
+ Input/output formats of this class follow detectron2's convention, so a
122
+ mmdetection model can be trained and evaluated in detectron2.
123
+ """
124
+
125
+ def __init__(
126
+ self,
127
+ detector: Union[nn.Module, Mapping],
128
+ *,
129
+ # Default is 32 regardless of model:
130
+ # https://github.com/open-mmlab/mmdetection/tree/master/configs/_base_/datasets
131
+ size_divisibility=32,
132
+ pixel_mean: Tuple[float],
133
+ pixel_std: Tuple[float],
134
+ ):
135
+ """
136
+ Args:
137
+ detector: a mmdet detector, or a mmdet config dict that defines a detector.
138
+ size_divisibility: pad input images to multiple of this number
139
+ pixel_mean: per-channel mean to normalize input image
140
+ pixel_std: per-channel stddev to normalize input image
141
+ """
142
+ super().__init__()
143
+ if isinstance(detector, Mapping):
144
+ from mmdet.models import build_detector
145
+
146
+ detector = build_detector(_to_container(detector))
147
+ self.detector = detector
148
+ self.detector.init_weights()
149
+ self.size_divisibility = size_divisibility
150
+
151
+ self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
152
+ self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
153
+ assert (
154
+ self.pixel_mean.shape == self.pixel_std.shape
155
+ ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
156
+
157
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
158
+ images = [x["image"].to(self.device) for x in batched_inputs]
159
+ images = [(x - self.pixel_mean) / self.pixel_std for x in images]
160
+ images = ImageList.from_tensors(images, size_divisibility=self.size_divisibility).tensor
161
+ metas = []
162
+ rescale = {"height" in x for x in batched_inputs}
163
+ if len(rescale) != 1:
164
+ raise ValueError("Some inputs have original height/width, but some don't!")
165
+ rescale = list(rescale)[0]
166
+ output_shapes = []
167
+ for input in batched_inputs:
168
+ meta = {}
169
+ c, h, w = input["image"].shape
170
+ meta["img_shape"] = meta["ori_shape"] = (h, w, c)
171
+ if rescale:
172
+ scale_factor = np.array(
173
+ [w / input["width"], h / input["height"]] * 2, dtype="float32"
174
+ )
175
+ ori_shape = (input["height"], input["width"])
176
+ output_shapes.append(ori_shape)
177
+ meta["ori_shape"] = ori_shape + (c,)
178
+ else:
179
+ scale_factor = 1.0
180
+ output_shapes.append((h, w))
181
+ meta["scale_factor"] = scale_factor
182
+ meta["flip"] = False
183
+ padh, padw = images.shape[-2:]
184
+ meta["pad_shape"] = (padh, padw, c)
185
+ metas.append(meta)
186
+
187
+ if self.training:
188
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
189
+ if gt_instances[0].has("gt_masks"):
190
+ from mmdet.core import PolygonMasks as mm_PolygonMasks, BitmapMasks as mm_BitMasks
191
+
192
+ def convert_mask(m, shape):
193
+ # mmdet mask format
194
+ if isinstance(m, BitMasks):
195
+ return mm_BitMasks(m.tensor.cpu().numpy(), shape[0], shape[1])
196
+ else:
197
+ return mm_PolygonMasks(m.polygons, shape[0], shape[1])
198
+
199
+ gt_masks = [convert_mask(x.gt_masks, x.image_size) for x in gt_instances]
200
+ losses_and_metrics = self.detector.forward_train(
201
+ images,
202
+ metas,
203
+ [x.gt_boxes.tensor for x in gt_instances],
204
+ [x.gt_classes for x in gt_instances],
205
+ gt_masks=gt_masks,
206
+ )
207
+ else:
208
+ losses_and_metrics = self.detector.forward_train(
209
+ images,
210
+ metas,
211
+ [x.gt_boxes.tensor for x in gt_instances],
212
+ [x.gt_classes for x in gt_instances],
213
+ )
214
+ return _parse_losses(losses_and_metrics)
215
+ else:
216
+ results = self.detector.simple_test(images, metas, rescale=rescale)
217
+ results = [
218
+ {"instances": _convert_mmdet_result(r, shape)}
219
+ for r, shape in zip(results, output_shapes)
220
+ ]
221
+ return results
222
+
223
+ @property
224
+ def device(self):
225
+ return self.pixel_mean.device
226
+
227
+
228
+ # Reference: show_result() in
229
+ # https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
230
+ def _convert_mmdet_result(result, shape: Tuple[int, int]) -> Instances:
231
+ if isinstance(result, tuple):
232
+ bbox_result, segm_result = result
233
+ if isinstance(segm_result, tuple):
234
+ segm_result = segm_result[0]
235
+ else:
236
+ bbox_result, segm_result = result, None
237
+
238
+ bboxes = torch.from_numpy(np.vstack(bbox_result)) # Nx5
239
+ bboxes, scores = bboxes[:, :4], bboxes[:, -1]
240
+ labels = [
241
+ torch.full((bbox.shape[0],), i, dtype=torch.int32) for i, bbox in enumerate(bbox_result)
242
+ ]
243
+ labels = torch.cat(labels)
244
+ inst = Instances(shape)
245
+ inst.pred_boxes = Boxes(bboxes)
246
+ inst.scores = scores
247
+ inst.pred_classes = labels
248
+
249
+ if segm_result is not None and len(labels) > 0:
250
+ segm_result = list(itertools.chain(*segm_result))
251
+ segm_result = [torch.from_numpy(x) if isinstance(x, np.ndarray) else x for x in segm_result]
252
+ segm_result = torch.stack(segm_result, dim=0)
253
+ inst.pred_masks = segm_result
254
+ return inst
255
+
256
+
257
+ # reference: https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/base.py
258
+ def _parse_losses(losses: Dict[str, Tensor]) -> Dict[str, Tensor]:
259
+ log_vars = OrderedDict()
260
+ for loss_name, loss_value in losses.items():
261
+ if isinstance(loss_value, torch.Tensor):
262
+ log_vars[loss_name] = loss_value.mean()
263
+ elif isinstance(loss_value, list):
264
+ log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
265
+ else:
266
+ raise TypeError(f"{loss_name} is not a tensor or list of tensors")
267
+
268
+ if "loss" not in loss_name:
269
+ # put metrics to storage; don't return them
270
+ storage = get_event_storage()
271
+ value = log_vars.pop(loss_name).cpu().item()
272
+ storage.put_scalar(loss_name, value)
273
+ return log_vars
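The loss-parsing helper above can be illustrated with a toy dict, assuming `_parse_losses` is imported from this module; the key names are made up, and the `EventStorage` context is only needed because one key does not contain "loss" and is therefore logged as a metric:

    import torch
    from annotator.oneformer.detectron2.utils.events import EventStorage

    raw = {
        "loss_cls": torch.tensor([0.7, 0.5]),                  # tensor -> mean, 0.6
        "loss_bbox": [torch.tensor(0.2), torch.tensor(0.3)],   # list   -> sum of means, 0.5
        "acc": torch.tensor(0.9),                              # metric -> logged, not returned
    }
    with EventStorage():
        parsed = _parse_losses(raw)
    print(sorted(parsed))  # ['loss_bbox', 'loss_cls']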
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/poolers.py ADDED
@@ -0,0 +1,263 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import math
3
+ from typing import List, Optional
4
+ import torch
5
+ from torch import nn
6
+ from torchvision.ops import RoIPool
7
+
8
+ from annotator.oneformer.detectron2.layers import ROIAlign, ROIAlignRotated, cat, nonzero_tuple, shapes_to_tensor
9
+ from annotator.oneformer.detectron2.structures import Boxes
10
+ from annotator.oneformer.detectron2.utils.tracing import assert_fx_safe, is_fx_tracing
11
+
12
+ """
13
+ To export ROIPooler to torchscript, in this file, variables that should be annotated with
14
+ `Union[List[Boxes], List[RotatedBoxes]]` are only annotated with `List[Boxes]`.
15
+
16
+ TODO: Correct these annotations when torchscript support `Union`.
17
+ https://github.com/pytorch/pytorch/issues/41412
18
+ """
19
+
20
+ __all__ = ["ROIPooler"]
21
+
22
+
23
+ def assign_boxes_to_levels(
24
+ box_lists: List[Boxes],
25
+ min_level: int,
26
+ max_level: int,
27
+ canonical_box_size: int,
28
+ canonical_level: int,
29
+ ):
30
+ """
31
+ Map each box in `box_lists` to a feature map level index and return the assignment
32
+ vector.
33
+
34
+ Args:
35
+ box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes,
36
+ where N is the number of images in the batch.
37
+ min_level (int): Smallest feature map level index. The input is considered index 0,
38
+ the output of stage 1 is index 1, and so on.
39
+ max_level (int): Largest feature map level index.
40
+ canonical_box_size (int): A canonical box size in pixels (sqrt(box area)).
41
+ canonical_level (int): The feature map level index on which a canonically-sized box
42
+ should be placed.
43
+
44
+ Returns:
45
+ A tensor of length M, where M is the total number of boxes aggregated over all
46
+ N batch images. The memory layout corresponds to the concatenation of boxes
47
+ from all images. Each element is the feature map index, as an offset from
48
+ `self.min_level`, for the corresponding box (so value i means the box is at
49
+ `self.min_level + i`).
50
+ """
51
+ box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists]))
52
+ # Eqn.(1) in FPN paper
53
+ level_assignments = torch.floor(
54
+ canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)
55
+ )
56
+ # clamp level to (min, max), in case the box size is too large or too small
57
+ # for the available feature maps
58
+ level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level)
59
+ return level_assignments.to(torch.int64) - min_level
60
+
61
+
62
+ # script the module to avoid hardcoded device type
63
+ @torch.jit.script_if_tracing
64
+ def _convert_boxes_to_pooler_format(boxes: torch.Tensor, sizes: torch.Tensor) -> torch.Tensor:
65
+ sizes = sizes.to(device=boxes.device)
66
+ indices = torch.repeat_interleave(
67
+ torch.arange(len(sizes), dtype=boxes.dtype, device=boxes.device), sizes
68
+ )
69
+ return cat([indices[:, None], boxes], dim=1)
70
+
71
+
72
+ def convert_boxes_to_pooler_format(box_lists: List[Boxes]):
73
+ """
74
+ Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops
75
+ (see description under Returns).
76
+
77
+ Args:
78
+ box_lists (list[Boxes] | list[RotatedBoxes]):
79
+ A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
80
+
81
+ Returns:
82
+ When input is list[Boxes]:
83
+ A tensor of shape (M, 5), where M is the total number of boxes aggregated over all
84
+ N batch images.
85
+ The 5 columns are (batch index, x0, y0, x1, y1), where batch index
86
+ is the index in [0, N) identifying which batch image the box with corners at
87
+ (x0, y0, x1, y1) comes from.
88
+ When input is list[RotatedBoxes]:
89
+ A tensor of shape (M, 6), where M is the total number of boxes aggregated over all
90
+ N batch images.
91
+ The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees),
92
+ where batch index is the index in [0, N) identifying which batch image the
93
+ rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from.
94
+ """
95
+ boxes = torch.cat([x.tensor for x in box_lists], dim=0)
96
+ # __len__ returns Tensor in tracing.
97
+ sizes = shapes_to_tensor([x.__len__() for x in box_lists])
98
+ return _convert_boxes_to_pooler_format(boxes, sizes)
99
+
100
+
101
+ @torch.jit.script_if_tracing
102
+ def _create_zeros(
103
+ batch_target: Optional[torch.Tensor],
104
+ channels: int,
105
+ height: int,
106
+ width: int,
107
+ like_tensor: torch.Tensor,
108
+ ) -> torch.Tensor:
109
+ batches = batch_target.shape[0] if batch_target is not None else 0
110
+ sizes = (batches, channels, height, width)
111
+ return torch.zeros(sizes, dtype=like_tensor.dtype, device=like_tensor.device)
112
+
113
+
114
+ class ROIPooler(nn.Module):
115
+ """
116
+ Region of interest feature map pooler that supports pooling from one or more
117
+ feature maps.
118
+ """
119
+
120
+ def __init__(
121
+ self,
122
+ output_size,
123
+ scales,
124
+ sampling_ratio,
125
+ pooler_type,
126
+ canonical_box_size=224,
127
+ canonical_level=4,
128
+ ):
129
+ """
130
+ Args:
131
+ output_size (int, tuple[int] or list[int]): output size of the pooled region,
132
+ e.g., 14 x 14. If tuple or list is given, the length must be 2.
133
+ scales (list[float]): The scale for each low-level pooling op relative to
134
+ the input image. For a feature map with stride s relative to the input
135
+ image, scale is defined as 1/s. The stride must be power of 2.
136
+ When there are multiple scales, they must form a pyramid, i.e. they must be
137
+ a monotonically decreasing geometric sequence with a factor of 1/2.
138
+ sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op.
139
+ pooler_type (string): Name of the type of pooling operation that should be applied.
140
+ For instance, "ROIPool" or "ROIAlignV2".
141
+ canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default
142
+ is heuristically defined as 224 pixels in the FPN paper (based on ImageNet
143
+ pre-training).
144
+ canonical_level (int): The feature map level index on which a canonically-sized box
145
+ should be placed. The default is defined as level 4 (stride=16) in the FPN paper,
146
+ i.e., a box of size 224x224 will be placed on the feature with stride=16.
147
+ The box placement for all boxes will be determined from their sizes w.r.t
148
+ canonical_box_size. For example, a box whose area is 4x that of a canonical box
149
+ should be used to pool features from feature level ``canonical_level+1``.
150
+
151
+ Note that the actual input feature maps given to this module may not have
152
+ sufficiently many levels for the input boxes. If the boxes are too large or too
153
+ small for the input feature maps, the closest level will be used.
154
+ """
155
+ super().__init__()
156
+
157
+ if isinstance(output_size, int):
158
+ output_size = (output_size, output_size)
159
+ assert len(output_size) == 2
160
+ assert isinstance(output_size[0], int) and isinstance(output_size[1], int)
161
+ self.output_size = output_size
162
+
163
+ if pooler_type == "ROIAlign":
164
+ self.level_poolers = nn.ModuleList(
165
+ ROIAlign(
166
+ output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False
167
+ )
168
+ for scale in scales
169
+ )
170
+ elif pooler_type == "ROIAlignV2":
171
+ self.level_poolers = nn.ModuleList(
172
+ ROIAlign(
173
+ output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True
174
+ )
175
+ for scale in scales
176
+ )
177
+ elif pooler_type == "ROIPool":
178
+ self.level_poolers = nn.ModuleList(
179
+ RoIPool(output_size, spatial_scale=scale) for scale in scales
180
+ )
181
+ elif pooler_type == "ROIAlignRotated":
182
+ self.level_poolers = nn.ModuleList(
183
+ ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio)
184
+ for scale in scales
185
+ )
186
+ else:
187
+ raise ValueError("Unknown pooler type: {}".format(pooler_type))
188
+
189
+ # Map scale (defined as 1 / stride) to its feature map level under the
190
+ # assumption that stride is a power of 2.
191
+ min_level = -(math.log2(scales[0]))
192
+ max_level = -(math.log2(scales[-1]))
193
+ assert math.isclose(min_level, int(min_level)) and math.isclose(
194
+ max_level, int(max_level)
195
+ ), "Featuremap stride is not power of 2!"
196
+ self.min_level = int(min_level)
197
+ self.max_level = int(max_level)
198
+ assert (
199
+ len(scales) == self.max_level - self.min_level + 1
200
+ ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!"
201
+ assert 0 <= self.min_level and self.min_level <= self.max_level
202
+ self.canonical_level = canonical_level
203
+ assert canonical_box_size > 0
204
+ self.canonical_box_size = canonical_box_size
205
+
206
+ def forward(self, x: List[torch.Tensor], box_lists: List[Boxes]):
207
+ """
208
+ Args:
209
+ x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those
210
+ used to construct this module.
211
+ box_lists (list[Boxes] | list[RotatedBoxes]):
212
+ A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch.
213
+ The box coordinates are defined on the original image and
214
+ will be scaled by the `scales` argument of :class:`ROIPooler`.
215
+
216
+ Returns:
217
+ Tensor:
218
+ A tensor of shape (M, C, output_size, output_size) where M is the total number of
219
+ boxes aggregated over all N batch images and C is the number of channels in `x`.
220
+ """
221
+ num_level_assignments = len(self.level_poolers)
222
+
223
+ if not is_fx_tracing():
224
+ torch._assert(
225
+ isinstance(x, list) and isinstance(box_lists, list),
226
+ "Arguments to pooler must be lists",
227
+ )
228
+ assert_fx_safe(
229
+ len(x) == num_level_assignments,
230
+ "unequal value, num_level_assignments={}, but x is list of {} Tensors".format(
231
+ num_level_assignments, len(x)
232
+ ),
233
+ )
234
+ assert_fx_safe(
235
+ len(box_lists) == x[0].size(0),
236
+ "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format(
237
+ x[0].size(0), len(box_lists)
238
+ ),
239
+ )
240
+ if len(box_lists) == 0:
241
+ return _create_zeros(None, x[0].shape[1], *self.output_size, x[0])
242
+
243
+ pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists)
244
+
245
+ if num_level_assignments == 1:
246
+ return self.level_poolers[0](x[0], pooler_fmt_boxes)
247
+
248
+ level_assignments = assign_boxes_to_levels(
249
+ box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level
250
+ )
251
+
252
+ num_channels = x[0].shape[1]
253
+ output_size = self.output_size[0]
254
+
255
+ output = _create_zeros(pooler_fmt_boxes, num_channels, output_size, output_size, x[0])
256
+
257
+ for level, pooler in enumerate(self.level_poolers):
258
+ inds = nonzero_tuple(level_assignments == level)[0]
259
+ pooler_fmt_boxes_level = pooler_fmt_boxes[inds]
260
+ # Use index_put_ instead of advance indexing, to avoid pytorch/issues/49852
261
+ output.index_put_((inds,), pooler(x[level], pooler_fmt_boxes_level))
262
+
263
+ return output
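For reference, a worked sketch (not part of the diff) of the FPN Eqn.(1) mapping that assign_boxes_to_levels implements: level = floor(canonical_level + log2(sqrt(area) / canonical_box_size)), clamped to the available pyramid levels and returned as an offset from min_level.

import torch

box_sizes = torch.tensor([112.0, 224.0, 448.0, 2000.0])   # sqrt(box area) in pixels
canonical_box_size, canonical_level = 224, 4               # defaults used by ROIPooler
min_level, max_level = 2, 5                                # e.g. an FPN with strides 4..32

levels = torch.floor(canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8))
levels = torch.clamp(levels, min=min_level, max=max_level).to(torch.int64) - min_level
print(levels.tolist())   # [1, 2, 3, 3] -> absolute levels 3, 4, 5, 5 (the 2000px box is clamped)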
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/postprocessing.py ADDED
@@ -0,0 +1,100 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+ from annotator.oneformer.detectron2.structures import Instances, ROIMasks
6
+
7
+
8
+ # perhaps should rename to "resize_instance"
9
+ def detector_postprocess(
10
+ results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5
11
+ ):
12
+ """
13
+ Resize the output instances.
14
+ The input images are often resized when entering an object detector.
15
+ As a result, we often need the outputs of the detector in a different
16
+ resolution from its inputs.
17
+
18
+ This function will resize the raw outputs of an R-CNN detector
19
+ to produce outputs according to the desired output resolution.
20
+
21
+ Args:
22
+ results (Instances): the raw outputs from the detector.
23
+ `results.image_size` contains the input image resolution the detector sees.
24
+ This object might be modified in-place.
25
+ output_height, output_width: the desired output resolution.
26
+ Returns:
27
+ Instances: the resized output from the model, based on the output resolution
28
+ """
29
+ if isinstance(output_width, torch.Tensor):
30
+ # This shape might (but not necessarily) be tensors during tracing.
31
+ # Converts integer tensors to float temporaries to ensure true
32
+ # division is performed when computing scale_x and scale_y.
33
+ output_width_tmp = output_width.float()
34
+ output_height_tmp = output_height.float()
35
+ new_size = torch.stack([output_height, output_width])
36
+ else:
37
+ new_size = (output_height, output_width)
38
+ output_width_tmp = output_width
39
+ output_height_tmp = output_height
40
+
41
+ scale_x, scale_y = (
42
+ output_width_tmp / results.image_size[1],
43
+ output_height_tmp / results.image_size[0],
44
+ )
45
+ results = Instances(new_size, **results.get_fields())
46
+
47
+ if results.has("pred_boxes"):
48
+ output_boxes = results.pred_boxes
49
+ elif results.has("proposal_boxes"):
50
+ output_boxes = results.proposal_boxes
51
+ else:
52
+ output_boxes = None
53
+ assert output_boxes is not None, "Predictions must contain boxes!"
54
+
55
+ output_boxes.scale(scale_x, scale_y)
56
+ output_boxes.clip(results.image_size)
57
+
58
+ results = results[output_boxes.nonempty()]
59
+
60
+ if results.has("pred_masks"):
61
+ if isinstance(results.pred_masks, ROIMasks):
62
+ roi_masks = results.pred_masks
63
+ else:
64
+ # pred_masks is a tensor of shape (N, 1, M, M)
65
+ roi_masks = ROIMasks(results.pred_masks[:, 0, :, :])
66
+ results.pred_masks = roi_masks.to_bitmasks(
67
+ results.pred_boxes, output_height, output_width, mask_threshold
68
+ ).tensor # TODO return ROIMasks/BitMask object in the future
69
+
70
+ if results.has("pred_keypoints"):
71
+ results.pred_keypoints[:, :, 0] *= scale_x
72
+ results.pred_keypoints[:, :, 1] *= scale_y
73
+
74
+ return results
75
+
76
+
77
+ def sem_seg_postprocess(result, img_size, output_height, output_width):
78
+ """
79
+ Return semantic segmentation predictions in the original resolution.
80
+
81
+ The input images are often resized when entering the semantic segmentor. Moreover, in some
82
+ cases, they are also padded inside the segmentor to be divisible by the maximum network stride.
83
+ As a result, we often need the predictions of the segmentor in a different
84
+ resolution from its inputs.
85
+
86
+ Args:
87
+ result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W),
88
+ where C is the number of classes, and H, W are the height and width of the prediction.
89
+ img_size (tuple): image size that segmentor is taking as input.
90
+ output_height, output_width: the desired output resolution.
91
+
92
+ Returns:
93
+ semantic segmentation prediction (Tensor): A tensor of the shape
94
+ (C, output_height, output_width) that contains per-pixel soft predictions.
95
+ """
96
+ result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1)
97
+ result = F.interpolate(
98
+ result, size=(output_height, output_width), mode="bilinear", align_corners=False
99
+ )[0]
100
+ return result
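A small sketch (not part of the diff, dummy shapes) of what sem_seg_postprocess does: crop the logits back to the unpadded size the segmentor saw, then bilinearly resize them to the original image resolution.

import torch
import torch.nn.functional as F

result = torch.randn(19, 512, 704)         # (C, H_pad, W_pad) logits from the segmentor
img_size = (500, 700)                      # unpadded size the segmentor actually saw
output_height, output_width = 1000, 1400   # original image resolution

# The module itself uses expand(1, -1, -1, -1); unsqueeze(0) is equivalent for this purpose.
cropped = result[:, : img_size[0], : img_size[1]].unsqueeze(0)   # (1, C, 500, 700)
resized = F.interpolate(
    cropped, size=(output_height, output_width), mode="bilinear", align_corners=False
)[0]
print(resized.shape)   # torch.Size([19, 1000, 1400])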
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator
3
+ from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN, StandardRPNHead
4
+
5
+ __all__ = list(globals().keys())
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/build.py ADDED
@@ -0,0 +1,24 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from annotator.oneformer.detectron2.utils.registry import Registry
3
+
4
+ PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR")
5
+ PROPOSAL_GENERATOR_REGISTRY.__doc__ = """
6
+ Registry for proposal generator, which produces object proposals from feature maps.
7
+
8
+ The registered object will be called with `obj(cfg, input_shape)`.
9
+ The call should return a `nn.Module` object.
10
+ """
11
+
12
+ from . import rpn, rrpn # noqa F401 isort:skip
13
+
14
+
15
+ def build_proposal_generator(cfg, input_shape):
16
+ """
17
+ Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`.
18
+ The name can be "PrecomputedProposals" to use no proposal generator.
19
+ """
20
+ name = cfg.MODEL.PROPOSAL_GENERATOR.NAME
21
+ if name == "PrecomputedProposals":
22
+ return None
23
+
24
+ return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape)
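The registry above is how new proposal generators get plugged in: register a module that accepts (cfg, input_shape), then select it by name in the config. A hedged sketch following the diff's own import paths (the class below is a hypothetical example, not part of the diff):

import torch.nn as nn
from annotator.oneformer.detectron2.modeling.proposal_generator.build import (
    PROPOSAL_GENERATOR_REGISTRY,
    build_proposal_generator,
)

@PROPOSAL_GENERATOR_REGISTRY.register()
class DummyProposalGenerator(nn.Module):
    """Hypothetical generator that returns no proposals and no losses."""

    def __init__(self, cfg, input_shape):
        super().__init__()

    def forward(self, images, features, gt_instances=None):
        return [], {}

# With cfg.MODEL.PROPOSAL_GENERATOR.NAME = "DummyProposalGenerator":
# proposal_generator = build_proposal_generator(cfg, input_shape)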
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/proposal_utils.py ADDED
@@ -0,0 +1,205 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ import math
4
+ from typing import List, Tuple, Union
5
+ import torch
6
+
7
+ from annotator.oneformer.detectron2.layers import batched_nms, cat, move_device_like
8
+ from annotator.oneformer.detectron2.structures import Boxes, Instances
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ def _is_tracing():
14
+ # (fixed in TORCH_VERSION >= 1.9)
15
+ if torch.jit.is_scripting():
16
+ # https://github.com/pytorch/pytorch/issues/47379
17
+ return False
18
+ else:
19
+ return torch.jit.is_tracing()
20
+
21
+
22
+ def find_top_rpn_proposals(
23
+ proposals: List[torch.Tensor],
24
+ pred_objectness_logits: List[torch.Tensor],
25
+ image_sizes: List[Tuple[int, int]],
26
+ nms_thresh: float,
27
+ pre_nms_topk: int,
28
+ post_nms_topk: int,
29
+ min_box_size: float,
30
+ training: bool,
31
+ ):
32
+ """
33
+ For each feature map, select the `pre_nms_topk` highest scoring proposals,
34
+ apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
35
+ highest scoring proposals among all the feature maps for each image.
36
+
37
+ Args:
38
+ proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4).
39
+ All proposal predictions on the feature maps.
40
+ pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
41
+ image_sizes (list[tuple]): sizes (h, w) for each image
42
+ nms_thresh (float): IoU threshold to use for NMS
43
+ pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
44
+ When RPN is run on multiple feature maps (as in FPN) this number is per
45
+ feature map.
46
+ post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
47
+ When RPN is run on multiple feature maps (as in FPN) this number is total,
48
+ over all feature maps.
49
+ min_box_size (float): minimum proposal box side length in pixels (absolute units
50
+ wrt input images).
51
+ training (bool): True if proposals are to be used in training, otherwise False.
52
+ This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
53
+ comment.
54
+
55
+ Returns:
56
+ list[Instances]: list of N Instances. The i-th Instances
57
+ stores post_nms_topk object proposals for image i, sorted by their
58
+ objectness score in descending order.
59
+ """
60
+ num_images = len(image_sizes)
61
+ device = (
62
+ proposals[0].device
63
+ if torch.jit.is_scripting()
64
+ else ("cpu" if torch.jit.is_tracing() else proposals[0].device)
65
+ )
66
+
67
+ # 1. Select top-k anchor for every level and every image
68
+ topk_scores = [] # #lvl Tensor, each of shape N x topk
69
+ topk_proposals = []
70
+ level_ids = [] # #lvl Tensor, each of shape (topk,)
71
+ batch_idx = move_device_like(torch.arange(num_images, device=device), proposals[0])
72
+ for level_id, (proposals_i, logits_i) in enumerate(zip(proposals, pred_objectness_logits)):
73
+ Hi_Wi_A = logits_i.shape[1]
74
+ if isinstance(Hi_Wi_A, torch.Tensor): # it's a tensor in tracing
75
+ num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
76
+ else:
77
+ num_proposals_i = min(Hi_Wi_A, pre_nms_topk)
78
+
79
+ topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
80
+
81
+ # each is N x topk
82
+ topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 4
83
+
84
+ topk_proposals.append(topk_proposals_i)
85
+ topk_scores.append(topk_scores_i)
86
+ level_ids.append(
87
+ move_device_like(
88
+ torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device),
89
+ proposals[0],
90
+ )
91
+ )
92
+
93
+ # 2. Concat all levels together
94
+ topk_scores = cat(topk_scores, dim=1)
95
+ topk_proposals = cat(topk_proposals, dim=1)
96
+ level_ids = cat(level_ids, dim=0)
97
+
98
+ # 3. For each image, run a per-level NMS, and choose topk results.
99
+ results: List[Instances] = []
100
+ for n, image_size in enumerate(image_sizes):
101
+ boxes = Boxes(topk_proposals[n])
102
+ scores_per_img = topk_scores[n]
103
+ lvl = level_ids
104
+
105
+ valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
106
+ if not valid_mask.all():
107
+ if training:
108
+ raise FloatingPointError(
109
+ "Predicted boxes or scores contain Inf/NaN. Training has diverged."
110
+ )
111
+ boxes = boxes[valid_mask]
112
+ scores_per_img = scores_per_img[valid_mask]
113
+ lvl = lvl[valid_mask]
114
+ boxes.clip(image_size)
115
+
116
+ # filter empty boxes
117
+ keep = boxes.nonempty(threshold=min_box_size)
118
+ if _is_tracing() or keep.sum().item() != len(boxes):
119
+ boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep]
120
+
121
+ keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh)
122
+ # In Detectron1, there was different behavior during training vs. testing.
123
+ # (https://github.com/facebookresearch/Detectron/issues/459)
124
+ # During training, topk is over the proposals from *all* images in the training batch.
125
+ # During testing, it is over the proposals for each image separately.
126
+ # As a result, the training behavior becomes batch-dependent,
127
+ # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
128
+ # This bug is addressed in Detectron2 to make the behavior independent of batch size.
129
+ keep = keep[:post_nms_topk] # keep is already sorted
130
+
131
+ res = Instances(image_size)
132
+ res.proposal_boxes = boxes[keep]
133
+ res.objectness_logits = scores_per_img[keep]
134
+ results.append(res)
135
+ return results
136
+
137
+
138
+ def add_ground_truth_to_proposals(
139
+ gt: Union[List[Instances], List[Boxes]], proposals: List[Instances]
140
+ ) -> List[Instances]:
141
+ """
142
+ Call `add_ground_truth_to_proposals_single_image` for all images.
143
+
144
+ Args:
145
+ gt(Union[List[Instances], List[Boxes]): list of N elements. Element i is a Instances
146
+ representing the ground-truth for image i.
147
+ proposals (list[Instances]): list of N elements. Element i is a Instances
148
+ representing the proposals for image i.
149
+
150
+ Returns:
151
+ list[Instances]: list of N Instances. Each is the proposals for the image,
152
+ with field "proposal_boxes" and "objectness_logits".
153
+ """
154
+ assert gt is not None
155
+
156
+ if len(proposals) != len(gt):
157
+ raise ValueError("proposals and gt should have the same length as the number of images!")
158
+ if len(proposals) == 0:
159
+ return proposals
160
+
161
+ return [
162
+ add_ground_truth_to_proposals_single_image(gt_i, proposals_i)
163
+ for gt_i, proposals_i in zip(gt, proposals)
164
+ ]
165
+
166
+
167
+ def add_ground_truth_to_proposals_single_image(
168
+ gt: Union[Instances, Boxes], proposals: Instances
169
+ ) -> Instances:
170
+ """
171
+ Augment `proposals` with `gt`.
172
+
173
+ Args:
174
+ Same as `add_ground_truth_to_proposals`, but with gt and proposals
175
+ per image.
176
+
177
+ Returns:
178
+ Same as `add_ground_truth_to_proposals`, but for only one image.
179
+ """
180
+ if isinstance(gt, Boxes):
181
+ # convert Boxes to Instances
182
+ gt = Instances(proposals.image_size, gt_boxes=gt)
183
+
184
+ gt_boxes = gt.gt_boxes
185
+ device = proposals.objectness_logits.device
186
+ # Assign all ground-truth boxes an objectness logit corresponding to
187
+ # P(object) = sigmoid(logit) =~ 1.
188
+ gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10)))
189
+ gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device)
190
+
191
+ # Concatenating gt_boxes with proposals requires them to have the same fields
192
+ gt_proposal = Instances(proposals.image_size, **gt.get_fields())
193
+ gt_proposal.proposal_boxes = gt_boxes
194
+ gt_proposal.objectness_logits = gt_logits
195
+
196
+ for key in proposals.get_fields().keys():
197
+ assert gt_proposal.has(
198
+ key
199
+ ), "The attribute '{}' in `proposals` does not exist in `gt`".format(key)
200
+
201
+ # NOTE: Instances.cat only use fields from the first item. Extra fields in latter items
202
+ # will be thrown away.
203
+ new_proposals = Instances.cat([proposals, gt_proposal])
204
+
205
+ return new_proposals
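A quick numeric check (not part of the diff) of the objectness logit assigned to ground-truth boxes above: math.log((1 - 1e-10) / (1 - (1 - 1e-10))) is the inverse sigmoid (logit) of 1 - 1e-10, roughly 23.03, so applying the sigmoid to it gives a probability that is numerically ~1.

import math

p = 1.0 - 1e-10
gt_logit_value = math.log(p / (1.0 - p))
sigmoid = 1.0 / (1.0 + math.exp(-gt_logit_value))
print(round(gt_logit_value, 2), sigmoid)   # ~23.03  1.0 (up to float precision)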
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rpn.py ADDED
@@ -0,0 +1,533 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import Dict, List, Optional, Tuple, Union
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch import nn
6
+
7
+ from annotator.oneformer.detectron2.config import configurable
8
+ from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, cat
9
+ from annotator.oneformer.detectron2.structures import Boxes, ImageList, Instances, pairwise_iou
10
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
11
+ from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom
12
+ from annotator.oneformer.detectron2.utils.registry import Registry
13
+
14
+ from ..anchor_generator import build_anchor_generator
15
+ from ..box_regression import Box2BoxTransform, _dense_box_regression_loss
16
+ from ..matcher import Matcher
17
+ from ..sampling import subsample_labels
18
+ from .build import PROPOSAL_GENERATOR_REGISTRY
19
+ from .proposal_utils import find_top_rpn_proposals
20
+
21
+ RPN_HEAD_REGISTRY = Registry("RPN_HEAD")
22
+ RPN_HEAD_REGISTRY.__doc__ = """
23
+ Registry for RPN heads, which take feature maps and perform
24
+ objectness classification and bounding box regression for anchors.
25
+
26
+ The registered object will be called with `obj(cfg, input_shape)`.
27
+ The call should return a `nn.Module` object.
28
+ """
29
+
30
+
31
+ """
32
+ Shape shorthand in this module:
33
+
34
+ N: number of images in the minibatch
35
+ L: number of feature maps per image on which RPN is run
36
+ A: number of cell anchors (must be the same for all feature maps)
37
+ Hi, Wi: height and width of the i-th feature map
38
+ B: size of the box parameterization
39
+
40
+ Naming convention:
41
+
42
+ objectness: refers to the binary classification of an anchor as object vs. not object.
43
+
44
+ deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
45
+ transform (see :class:`box_regression.Box2BoxTransform`), or 5d for rotated boxes.
46
+
47
+ pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use
48
+ sigmoid(pred_objectness_logits) to estimate P(object).
49
+
50
+ gt_labels: ground-truth binary classification labels for objectness
51
+
52
+ pred_anchor_deltas: predicted box2box transform deltas
53
+
54
+ gt_anchor_deltas: ground-truth box2box transform deltas
55
+ """
56
+
57
+
58
+ def build_rpn_head(cfg, input_shape):
59
+ """
60
+ Build an RPN head defined by `cfg.MODEL.RPN.HEAD_NAME`.
61
+ """
62
+ name = cfg.MODEL.RPN.HEAD_NAME
63
+ return RPN_HEAD_REGISTRY.get(name)(cfg, input_shape)
64
+
65
+
66
+ @RPN_HEAD_REGISTRY.register()
67
+ class StandardRPNHead(nn.Module):
68
+ """
69
+ Standard RPN classification and regression heads described in :paper:`Faster R-CNN`.
70
+ Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts
71
+ objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas
72
+ specifying how to deform each anchor into an object proposal.
73
+ """
74
+
75
+ @configurable
76
+ def __init__(
77
+ self, *, in_channels: int, num_anchors: int, box_dim: int = 4, conv_dims: List[int] = (-1,)
78
+ ):
79
+ """
80
+ NOTE: this interface is experimental.
81
+
82
+ Args:
83
+ in_channels (int): number of input feature channels. When using multiple
84
+ input features, they must have the same number of channels.
85
+ num_anchors (int): number of anchors to predict for *each spatial position*
86
+ on the feature map. The total number of anchors for each
87
+ feature map will be `num_anchors * H * W`.
88
+ box_dim (int): dimension of a box, which is also the number of box regression
89
+ predictions to make for each anchor. An axis aligned box has
90
+ box_dim=4, while a rotated box has box_dim=5.
91
+ conv_dims (list[int]): a list of integers representing the output channels
92
+ of N conv layers. Set it to -1 to use the same number of output channels
93
+ as input channels.
94
+ """
95
+ super().__init__()
96
+ cur_channels = in_channels
97
+ # Keeping the old variable names and structure for backwards compatibility.
98
+ # Otherwise the old checkpoints will fail to load.
99
+ if len(conv_dims) == 1:
100
+ out_channels = cur_channels if conv_dims[0] == -1 else conv_dims[0]
101
+ # 3x3 conv for the hidden representation
102
+ self.conv = self._get_rpn_conv(cur_channels, out_channels)
103
+ cur_channels = out_channels
104
+ else:
105
+ self.conv = nn.Sequential()
106
+ for k, conv_dim in enumerate(conv_dims):
107
+ out_channels = cur_channels if conv_dim == -1 else conv_dim
108
+ if out_channels <= 0:
109
+ raise ValueError(
110
+ f"Conv output channels should be greater than 0. Got {out_channels}"
111
+ )
112
+ conv = self._get_rpn_conv(cur_channels, out_channels)
113
+ self.conv.add_module(f"conv{k}", conv)
114
+ cur_channels = out_channels
115
+ # 1x1 conv for predicting objectness logits
116
+ self.objectness_logits = nn.Conv2d(cur_channels, num_anchors, kernel_size=1, stride=1)
117
+ # 1x1 conv for predicting box2box transform deltas
118
+ self.anchor_deltas = nn.Conv2d(cur_channels, num_anchors * box_dim, kernel_size=1, stride=1)
119
+
120
+ # Keeping the order of weight initialization the same for backwards compatibility.
121
+ for layer in self.modules():
122
+ if isinstance(layer, nn.Conv2d):
123
+ nn.init.normal_(layer.weight, std=0.01)
124
+ nn.init.constant_(layer.bias, 0)
125
+
126
+ def _get_rpn_conv(self, in_channels, out_channels):
127
+ return Conv2d(
128
+ in_channels,
129
+ out_channels,
130
+ kernel_size=3,
131
+ stride=1,
132
+ padding=1,
133
+ activation=nn.ReLU(),
134
+ )
135
+
136
+ @classmethod
137
+ def from_config(cls, cfg, input_shape):
138
+ # Standard RPN is shared across levels:
139
+ in_channels = [s.channels for s in input_shape]
140
+ assert len(set(in_channels)) == 1, "Each level must have the same channel!"
141
+ in_channels = in_channels[0]
142
+
143
+ # RPNHead should take the same input as anchor generator
144
+ # NOTE: it assumes that creating an anchor generator does not have unwanted side effect.
145
+ anchor_generator = build_anchor_generator(cfg, input_shape)
146
+ num_anchors = anchor_generator.num_anchors
147
+ box_dim = anchor_generator.box_dim
148
+ assert (
149
+ len(set(num_anchors)) == 1
150
+ ), "Each level must have the same number of anchors per spatial position"
151
+ return {
152
+ "in_channels": in_channels,
153
+ "num_anchors": num_anchors[0],
154
+ "box_dim": box_dim,
155
+ "conv_dims": cfg.MODEL.RPN.CONV_DIMS,
156
+ }
157
+
158
+ def forward(self, features: List[torch.Tensor]):
159
+ """
160
+ Args:
161
+ features (list[Tensor]): list of feature maps
162
+
163
+ Returns:
164
+ list[Tensor]: A list of L elements.
165
+ Element i is a tensor of shape (N, A, Hi, Wi) representing
166
+ the predicted objectness logits for all anchors. A is the number of cell anchors.
167
+ list[Tensor]: A list of L elements. Element i is a tensor of shape
168
+ (N, A*box_dim, Hi, Wi) representing the predicted "deltas" used to transform anchors
169
+ to proposals.
170
+ """
171
+ pred_objectness_logits = []
172
+ pred_anchor_deltas = []
173
+ for x in features:
174
+ t = self.conv(x)
175
+ pred_objectness_logits.append(self.objectness_logits(t))
176
+ pred_anchor_deltas.append(self.anchor_deltas(t))
177
+ return pred_objectness_logits, pred_anchor_deltas
178
+
179
+
180
+ @PROPOSAL_GENERATOR_REGISTRY.register()
181
+ class RPN(nn.Module):
182
+ """
183
+ Region Proposal Network, introduced by :paper:`Faster R-CNN`.
184
+ """
185
+
186
+ @configurable
187
+ def __init__(
188
+ self,
189
+ *,
190
+ in_features: List[str],
191
+ head: nn.Module,
192
+ anchor_generator: nn.Module,
193
+ anchor_matcher: Matcher,
194
+ box2box_transform: Box2BoxTransform,
195
+ batch_size_per_image: int,
196
+ positive_fraction: float,
197
+ pre_nms_topk: Tuple[float, float],
198
+ post_nms_topk: Tuple[float, float],
199
+ nms_thresh: float = 0.7,
200
+ min_box_size: float = 0.0,
201
+ anchor_boundary_thresh: float = -1.0,
202
+ loss_weight: Union[float, Dict[str, float]] = 1.0,
203
+ box_reg_loss_type: str = "smooth_l1",
204
+ smooth_l1_beta: float = 0.0,
205
+ ):
206
+ """
207
+ NOTE: this interface is experimental.
208
+
209
+ Args:
210
+ in_features (list[str]): list of names of input features to use
211
+ head (nn.Module): a module that predicts logits and regression deltas
212
+ for each level from a list of per-level features
213
+ anchor_generator (nn.Module): a module that creates anchors from a
214
+ list of features. Usually an instance of :class:`AnchorGenerator`
215
+ anchor_matcher (Matcher): label the anchors by matching them with ground truth.
216
+ box2box_transform (Box2BoxTransform): defines the transform from anchors boxes to
217
+ instance boxes
218
+ batch_size_per_image (int): number of anchors per image to sample for training
219
+ positive_fraction (float): fraction of foreground anchors to sample for training
220
+ pre_nms_topk (tuple[float]): (train, test) that represents the
221
+ number of top k proposals to select before NMS, in
222
+ training and testing.
223
+ post_nms_topk (tuple[float]): (train, test) that represents the
224
+ number of top k proposals to select after NMS, in
225
+ training and testing.
226
+ nms_thresh (float): NMS threshold used to de-duplicate the predicted proposals
227
+ min_box_size (float): remove proposal boxes with any side smaller than this threshold,
228
+ in the unit of input image pixels
229
+ anchor_boundary_thresh (float): legacy option
230
+ loss_weight (float|dict): weights to use for losses. Can be single float for weighting
231
+ all rpn losses together, or a dict of individual weightings. Valid dict keys are:
232
+ "loss_rpn_cls" - applied to classification loss
233
+ "loss_rpn_loc" - applied to box regression loss
234
+ box_reg_loss_type (str): Loss type to use. Supported losses: "smooth_l1", "giou".
235
+ smooth_l1_beta (float): beta parameter for the smooth L1 regression loss. Default to
236
+ use L1 loss. Only used when `box_reg_loss_type` is "smooth_l1"
237
+ """
238
+ super().__init__()
239
+ self.in_features = in_features
240
+ self.rpn_head = head
241
+ self.anchor_generator = anchor_generator
242
+ self.anchor_matcher = anchor_matcher
243
+ self.box2box_transform = box2box_transform
244
+ self.batch_size_per_image = batch_size_per_image
245
+ self.positive_fraction = positive_fraction
246
+ # Map from self.training state to train/test settings
247
+ self.pre_nms_topk = {True: pre_nms_topk[0], False: pre_nms_topk[1]}
248
+ self.post_nms_topk = {True: post_nms_topk[0], False: post_nms_topk[1]}
249
+ self.nms_thresh = nms_thresh
250
+ self.min_box_size = float(min_box_size)
251
+ self.anchor_boundary_thresh = anchor_boundary_thresh
252
+ if isinstance(loss_weight, float):
253
+ loss_weight = {"loss_rpn_cls": loss_weight, "loss_rpn_loc": loss_weight}
254
+ self.loss_weight = loss_weight
255
+ self.box_reg_loss_type = box_reg_loss_type
256
+ self.smooth_l1_beta = smooth_l1_beta
257
+
258
+ @classmethod
259
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
260
+ in_features = cfg.MODEL.RPN.IN_FEATURES
261
+ ret = {
262
+ "in_features": in_features,
263
+ "min_box_size": cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE,
264
+ "nms_thresh": cfg.MODEL.RPN.NMS_THRESH,
265
+ "batch_size_per_image": cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE,
266
+ "positive_fraction": cfg.MODEL.RPN.POSITIVE_FRACTION,
267
+ "loss_weight": {
268
+ "loss_rpn_cls": cfg.MODEL.RPN.LOSS_WEIGHT,
269
+ "loss_rpn_loc": cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT,
270
+ },
271
+ "anchor_boundary_thresh": cfg.MODEL.RPN.BOUNDARY_THRESH,
272
+ "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS),
273
+ "box_reg_loss_type": cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE,
274
+ "smooth_l1_beta": cfg.MODEL.RPN.SMOOTH_L1_BETA,
275
+ }
276
+
277
+ ret["pre_nms_topk"] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, cfg.MODEL.RPN.PRE_NMS_TOPK_TEST)
278
+ ret["post_nms_topk"] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, cfg.MODEL.RPN.POST_NMS_TOPK_TEST)
279
+
280
+ ret["anchor_generator"] = build_anchor_generator(cfg, [input_shape[f] for f in in_features])
281
+ ret["anchor_matcher"] = Matcher(
282
+ cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True
283
+ )
284
+ ret["head"] = build_rpn_head(cfg, [input_shape[f] for f in in_features])
285
+ return ret
286
+
287
+ def _subsample_labels(self, label):
288
+ """
289
+ Randomly sample a subset of positive and negative examples, and overwrite
290
+ the label vector to the ignore value (-1) for all elements that are not
291
+ included in the sample.
292
+
293
+ Args:
294
+ labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
295
+ """
296
+ pos_idx, neg_idx = subsample_labels(
297
+ label, self.batch_size_per_image, self.positive_fraction, 0
298
+ )
299
+ # Fill with the ignore label (-1), then set positive and negative labels
300
+ label.fill_(-1)
301
+ label.scatter_(0, pos_idx, 1)
302
+ label.scatter_(0, neg_idx, 0)
303
+ return label
304
+
305
+ @torch.jit.unused
306
+ @torch.no_grad()
307
+ def label_and_sample_anchors(
308
+ self, anchors: List[Boxes], gt_instances: List[Instances]
309
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
310
+ """
311
+ Args:
312
+ anchors (list[Boxes]): anchors for each feature map.
313
+ gt_instances: the ground-truth instances for each image.
314
+
315
+ Returns:
316
+ list[Tensor]:
317
+ List of #img tensors. i-th element is a vector of labels whose length is
318
+ the total number of anchors across all feature maps R = sum(Hi * Wi * A).
319
+ Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative
320
+ class; 1 = positive class.
321
+ list[Tensor]:
322
+ i-th element is a Rx4 tensor. The values are the matched gt boxes for each
323
+ anchor. Values are undefined for those anchors not labeled as 1.
324
+ """
325
+ anchors = Boxes.cat(anchors)
326
+
327
+ gt_boxes = [x.gt_boxes for x in gt_instances]
328
+ image_sizes = [x.image_size for x in gt_instances]
329
+ del gt_instances
330
+
331
+ gt_labels = []
332
+ matched_gt_boxes = []
333
+ for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes):
334
+ """
335
+ image_size_i: (h, w) for the i-th image
336
+ gt_boxes_i: ground-truth boxes for i-th image
337
+ """
338
+
339
+ match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors)
340
+ matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
341
+ # Matching is memory-expensive and may result in CPU tensors. But the result is small
342
+ gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
343
+ del match_quality_matrix
344
+
345
+ if self.anchor_boundary_thresh >= 0:
346
+ # Discard anchors that go out of the boundaries of the image
347
+ # NOTE: This is legacy functionality that is turned off by default in Detectron2
348
+ anchors_inside_image = anchors.inside_box(image_size_i, self.anchor_boundary_thresh)
349
+ gt_labels_i[~anchors_inside_image] = -1
350
+
351
+ # A vector of labels (-1, 0, 1) for each anchor
352
+ gt_labels_i = self._subsample_labels(gt_labels_i)
353
+
354
+ if len(gt_boxes_i) == 0:
355
+ # These values won't be used anyway since the anchor is labeled as background
356
+ matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
357
+ else:
358
+ # TODO wasted indexing computation for ignored boxes
359
+ matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
360
+
361
+ gt_labels.append(gt_labels_i) # N,AHW
362
+ matched_gt_boxes.append(matched_gt_boxes_i)
363
+ return gt_labels, matched_gt_boxes
364
+
365
+ @torch.jit.unused
366
+ def losses(
367
+ self,
368
+ anchors: List[Boxes],
369
+ pred_objectness_logits: List[torch.Tensor],
370
+ gt_labels: List[torch.Tensor],
371
+ pred_anchor_deltas: List[torch.Tensor],
372
+ gt_boxes: List[torch.Tensor],
373
+ ) -> Dict[str, torch.Tensor]:
374
+ """
375
+ Return the losses from a set of RPN predictions and their associated ground-truth.
376
+
377
+ Args:
378
+ anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each
379
+ has shape (Hi*Wi*A, B), where B is box dimension (4 or 5).
380
+ pred_objectness_logits (list[Tensor]): A list of L elements.
381
+ Element i is a tensor of shape (N, Hi*Wi*A) representing
382
+ the predicted objectness logits for all anchors.
383
+ gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
384
+ pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape
385
+ (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors
386
+ to proposals.
387
+ gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`.
388
+
389
+ Returns:
390
+ dict[loss name -> loss value]: A dict mapping from loss name to loss value.
391
+ Loss names are: `loss_rpn_cls` for objectness classification and
392
+ `loss_rpn_loc` for proposal localization.
393
+ """
394
+ num_images = len(gt_labels)
395
+ gt_labels = torch.stack(gt_labels) # (N, sum(Hi*Wi*Ai))
396
+
397
+ # Log the number of positive/negative anchors per-image that's used in training
398
+ pos_mask = gt_labels == 1
399
+ num_pos_anchors = pos_mask.sum().item()
400
+ num_neg_anchors = (gt_labels == 0).sum().item()
401
+ storage = get_event_storage()
402
+ storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images)
403
+ storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images)
404
+
405
+ localization_loss = _dense_box_regression_loss(
406
+ anchors,
407
+ self.box2box_transform,
408
+ pred_anchor_deltas,
409
+ gt_boxes,
410
+ pos_mask,
411
+ box_reg_loss_type=self.box_reg_loss_type,
412
+ smooth_l1_beta=self.smooth_l1_beta,
413
+ )
414
+
415
+ valid_mask = gt_labels >= 0
416
+ objectness_loss = F.binary_cross_entropy_with_logits(
417
+ cat(pred_objectness_logits, dim=1)[valid_mask],
418
+ gt_labels[valid_mask].to(torch.float32),
419
+ reduction="sum",
420
+ )
421
+ normalizer = self.batch_size_per_image * num_images
422
+ losses = {
423
+ "loss_rpn_cls": objectness_loss / normalizer,
424
+ # The original Faster R-CNN paper uses a slightly different normalizer
425
+ # for loc loss. But it doesn't matter in practice
426
+ "loss_rpn_loc": localization_loss / normalizer,
427
+ }
428
+ losses = {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
429
+ return losses
430
+
431
+ def forward(
432
+ self,
433
+ images: ImageList,
434
+ features: Dict[str, torch.Tensor],
435
+ gt_instances: Optional[List[Instances]] = None,
436
+ ):
437
+ """
438
+ Args:
439
+ images (ImageList): input images of length `N`
440
+ features (dict[str, Tensor]): input data as a mapping from feature
441
+ map name to tensor. Axis 0 represents the number of images `N` in
442
+ the input data; axes 1-3 are channels, height, and width, which may
443
+ vary between feature maps (e.g., if a feature pyramid is used).
444
+ gt_instances (list[Instances], optional): a length `N` list of `Instances`s.
445
+ Each `Instances` stores ground-truth instances for the corresponding image.
446
+
447
+ Returns:
448
+ proposals: list[Instances]: contains fields "proposal_boxes", "objectness_logits"
449
+ loss: dict[Tensor] or None
450
+ """
451
+ features = [features[f] for f in self.in_features]
452
+ anchors = self.anchor_generator(features)
453
+
454
+ pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features)
455
+ # Transpose the Hi*Wi*A dimension to the middle:
456
+ pred_objectness_logits = [
457
+ # (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N, Hi*Wi*A)
458
+ score.permute(0, 2, 3, 1).flatten(1)
459
+ for score in pred_objectness_logits
460
+ ]
461
+ pred_anchor_deltas = [
462
+ # (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B) -> (N, Hi*Wi*A, B)
463
+ x.view(x.shape[0], -1, self.anchor_generator.box_dim, x.shape[-2], x.shape[-1])
464
+ .permute(0, 3, 4, 1, 2)
465
+ .flatten(1, -2)
466
+ for x in pred_anchor_deltas
467
+ ]
468
+
469
+ if self.training:
470
+ assert gt_instances is not None, "RPN requires gt_instances in training!"
471
+ gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances)
472
+ losses = self.losses(
473
+ anchors, pred_objectness_logits, gt_labels, pred_anchor_deltas, gt_boxes
474
+ )
475
+ else:
476
+ losses = {}
477
+ proposals = self.predict_proposals(
478
+ anchors, pred_objectness_logits, pred_anchor_deltas, images.image_sizes
479
+ )
480
+ return proposals, losses
481
+
482
+ def predict_proposals(
483
+ self,
484
+ anchors: List[Boxes],
485
+ pred_objectness_logits: List[torch.Tensor],
486
+ pred_anchor_deltas: List[torch.Tensor],
487
+ image_sizes: List[Tuple[int, int]],
488
+ ):
489
+ """
490
+ Decode all the predicted box regression deltas to proposals. Find the top proposals
491
+ by applying NMS and removing boxes that are too small.
492
+
493
+ Returns:
494
+ proposals (list[Instances]): list of N Instances. The i-th Instances
495
+ stores post_nms_topk object proposals for image i, sorted by their
496
+ objectness score in descending order.
497
+ """
498
+ # The proposals are treated as fixed for joint training with roi heads.
499
+ # This approach ignores the derivative w.r.t. the proposal boxes' coordinates that
500
+ # are also network responses.
501
+ with torch.no_grad():
502
+ pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
503
+ return find_top_rpn_proposals(
504
+ pred_proposals,
505
+ pred_objectness_logits,
506
+ image_sizes,
507
+ self.nms_thresh,
508
+ self.pre_nms_topk[self.training],
509
+ self.post_nms_topk[self.training],
510
+ self.min_box_size,
511
+ self.training,
512
+ )
513
+
514
+ def _decode_proposals(self, anchors: List[Boxes], pred_anchor_deltas: List[torch.Tensor]):
515
+ """
516
+ Transform anchors into proposals by applying the predicted anchor deltas.
517
+
518
+ Returns:
519
+ proposals (list[Tensor]): A list of L tensors. Tensor i has shape
520
+ (N, Hi*Wi*A, B)
521
+ """
522
+ N = pred_anchor_deltas[0].shape[0]
523
+ proposals = []
524
+ # For each feature map
525
+ for anchors_i, pred_anchor_deltas_i in zip(anchors, pred_anchor_deltas):
526
+ B = anchors_i.tensor.size(1)
527
+ pred_anchor_deltas_i = pred_anchor_deltas_i.reshape(-1, B)
528
+ # Expand anchors to shape (N*Hi*Wi*A, B)
529
+ anchors_i = anchors_i.tensor.unsqueeze(0).expand(N, -1, -1).reshape(-1, B)
530
+ proposals_i = self.box2box_transform.apply_deltas(pred_anchor_deltas_i, anchors_i)
531
+ # Append feature map proposals with shape (N, Hi*Wi*A, B)
532
+ proposals.append(proposals_i.view(N, -1, B))
533
+ return proposals
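A shape-bookkeeping sketch (not part of the diff) for the reshapes in RPN.forward: objectness logits go from (N, A, Hi, Wi) to (N, Hi*Wi*A), and anchor deltas from (N, A*B, Hi, Wi) to (N, Hi*Wi*A, B), the layout expected by RPN.losses and find_top_rpn_proposals.

import torch

N, A, B, Hi, Wi = 2, 3, 4, 8, 10
score = torch.randn(N, A, Hi, Wi)
delta = torch.randn(N, A * B, Hi, Wi)

score_flat = score.permute(0, 2, 3, 1).flatten(1)   # (N, Hi*Wi*A)
delta_flat = (
    delta.view(N, A, B, Hi, Wi)   # the module uses -1 in place of A
    .permute(0, 3, 4, 1, 2)       # (N, Hi, Wi, A, B)
    .flatten(1, -2)               # (N, Hi*Wi*A, B)
)
print(score_flat.shape, delta_flat.shape)   # torch.Size([2, 240]) torch.Size([2, 240, 4])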
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/proposal_generator/rrpn.py ADDED
@@ -0,0 +1,209 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import itertools
3
+ import logging
4
+ from typing import Dict, List
5
+ import torch
6
+
7
+ from annotator.oneformer.detectron2.config import configurable
8
+ from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms_rotated, cat
9
+ from annotator.oneformer.detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated
10
+ from annotator.oneformer.detectron2.utils.memory import retry_if_cuda_oom
11
+
12
+ from ..box_regression import Box2BoxTransformRotated
13
+ from .build import PROPOSAL_GENERATOR_REGISTRY
14
+ from .proposal_utils import _is_tracing
15
+ from .rpn import RPN
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def find_top_rrpn_proposals(
21
+ proposals,
22
+ pred_objectness_logits,
23
+ image_sizes,
24
+ nms_thresh,
25
+ pre_nms_topk,
26
+ post_nms_topk,
27
+ min_box_size,
28
+ training,
29
+ ):
30
+ """
31
+ For each feature map, select the `pre_nms_topk` highest scoring proposals,
32
+ apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk`
33
+ highest scoring proposals among all the feature maps if `training` is True,
34
+ otherwise, returns the highest `post_nms_topk` scoring proposals for each
35
+ feature map.
36
+
37
+ Args:
38
+ proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5).
39
+ All proposal predictions on the feature maps.
40
+ pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A).
41
+ image_sizes (list[tuple]): sizes (h, w) for each image
42
+ nms_thresh (float): IoU threshold to use for NMS
43
+ pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS.
44
+ When RRPN is run on multiple feature maps (as in FPN) this number is per
45
+ feature map.
46
+ post_nms_topk (int): number of top k scoring proposals to keep after applying NMS.
47
+ When RRPN is run on multiple feature maps (as in FPN) this number is total,
48
+ over all feature maps.
49
+ min_box_size(float): minimum proposal box side length in pixels (absolute units wrt
50
+ input images).
51
+ training (bool): True if proposals are to be used in training, otherwise False.
52
+ This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..."
53
+ comment.
54
+
55
+ Returns:
56
+ proposals (list[Instances]): list of N Instances. The i-th Instances
57
+ stores post_nms_topk object proposals for image i.
58
+ """
59
+ num_images = len(image_sizes)
60
+ device = proposals[0].device
61
+
62
+ # 1. Select top-k anchor for every level and every image
63
+ topk_scores = [] # #lvl Tensor, each of shape N x topk
64
+ topk_proposals = []
65
+ level_ids = [] # #lvl Tensor, each of shape (topk,)
66
+ batch_idx = torch.arange(num_images, device=device)
67
+ for level_id, proposals_i, logits_i in zip(
68
+ itertools.count(), proposals, pred_objectness_logits
69
+ ):
70
+ Hi_Wi_A = logits_i.shape[1]
71
+ if isinstance(Hi_Wi_A, torch.Tensor): # it's a tensor in tracing
72
+ num_proposals_i = torch.clamp(Hi_Wi_A, max=pre_nms_topk)
73
+ else:
74
+ num_proposals_i = min(Hi_Wi_A, pre_nms_topk)
75
+
76
+ topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1)
77
+
78
+ # each is N x topk
79
+ topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 5
80
+
81
+ topk_proposals.append(topk_proposals_i)
82
+ topk_scores.append(topk_scores_i)
83
+ level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device))
84
+
85
+ # 2. Concat all levels together
86
+ topk_scores = cat(topk_scores, dim=1)
87
+ topk_proposals = cat(topk_proposals, dim=1)
88
+ level_ids = cat(level_ids, dim=0)
89
+
90
+ # 3. For each image, run a per-level NMS, and choose topk results.
91
+ results = []
92
+ for n, image_size in enumerate(image_sizes):
93
+ boxes = RotatedBoxes(topk_proposals[n])
94
+ scores_per_img = topk_scores[n]
95
+ lvl = level_ids
96
+
97
+ valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img)
98
+ if not valid_mask.all():
99
+ if training:
100
+ raise FloatingPointError(
101
+ "Predicted boxes or scores contain Inf/NaN. Training has diverged."
102
+ )
103
+ boxes = boxes[valid_mask]
104
+ scores_per_img = scores_per_img[valid_mask]
105
+ lvl = lvl[valid_mask]
106
+ boxes.clip(image_size)
107
+
108
+ # filter empty boxes
109
+ keep = boxes.nonempty(threshold=min_box_size)
110
+ if _is_tracing() or keep.sum().item() != len(boxes):
111
+ boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], lvl[keep])
112
+
113
+ keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh)
114
+ # In Detectron1, there was different behavior during training vs. testing.
115
+ # (https://github.com/facebookresearch/Detectron/issues/459)
116
+ # During training, topk is over the proposals from *all* images in the training batch.
117
+ # During testing, it is over the proposals for each image separately.
118
+ # As a result, the training behavior becomes batch-dependent,
119
+ # and the configuration "POST_NMS_TOPK_TRAIN" ends up relying on the batch size.
120
+ # This bug is addressed in Detectron2 to make the behavior independent of batch size.
121
+ keep = keep[:post_nms_topk]
122
+
123
+ res = Instances(image_size)
124
+ res.proposal_boxes = boxes[keep]
125
+ res.objectness_logits = scores_per_img[keep]
126
+ results.append(res)
127
+ return results
128
+
129
+
130
+ @PROPOSAL_GENERATOR_REGISTRY.register()
131
+ class RRPN(RPN):
132
+ """
133
+ Rotated Region Proposal Network described in :paper:`RRPN`.
134
+ """
135
+
136
+ @configurable
137
+ def __init__(self, *args, **kwargs):
138
+ super().__init__(*args, **kwargs)
139
+ if self.anchor_boundary_thresh >= 0:
140
+ raise NotImplementedError(
141
+ "anchor_boundary_thresh is a legacy option not implemented for RRPN."
142
+ )
143
+
144
+ @classmethod
145
+ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
146
+ ret = super().from_config(cfg, input_shape)
147
+ ret["box2box_transform"] = Box2BoxTransformRotated(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
148
+ return ret
149
+
150
+ @torch.no_grad()
151
+ def label_and_sample_anchors(self, anchors: List[RotatedBoxes], gt_instances: List[Instances]):
152
+ """
153
+ Args:
154
+ anchors (list[RotatedBoxes]): anchors for each feature map.
155
+ gt_instances: the ground-truth instances for each image.
156
+
157
+ Returns:
158
+ list[Tensor]:
159
+ List of #img tensors. i-th element is a vector of labels whose length is
160
+ the total number of anchors across feature maps. Label values are in {-1, 0, 1},
161
+ with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
162
+ list[Tensor]:
163
+ i-th element is a Nx5 tensor, where N is the total number of anchors across
164
+ feature maps. The values are the matched gt boxes for each anchor.
165
+ Values are undefined for those anchors not labeled as 1.
166
+ """
167
+ anchors = RotatedBoxes.cat(anchors)
168
+
169
+ gt_boxes = [x.gt_boxes for x in gt_instances]
170
+ del gt_instances
171
+
172
+ gt_labels = []
173
+ matched_gt_boxes = []
174
+ for gt_boxes_i in gt_boxes:
175
+ """
176
+ gt_boxes_i: ground-truth boxes for i-th image
177
+ """
178
+ match_quality_matrix = retry_if_cuda_oom(pairwise_iou_rotated)(gt_boxes_i, anchors)
179
+ matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix)
180
+ # Matching is memory-expensive and may result in CPU tensors. But the result is small
181
+ gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device)
182
+
183
+ # A vector of labels (-1, 0, 1) for each anchor
184
+ gt_labels_i = self._subsample_labels(gt_labels_i)
185
+
186
+ if len(gt_boxes_i) == 0:
187
+ # These values won't be used anyway since the anchor is labeled as background
188
+ matched_gt_boxes_i = torch.zeros_like(anchors.tensor)
189
+ else:
190
+ # TODO wasted indexing computation for ignored boxes
191
+ matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor
192
+
193
+ gt_labels.append(gt_labels_i) # N,AHW
194
+ matched_gt_boxes.append(matched_gt_boxes_i)
195
+ return gt_labels, matched_gt_boxes
196
+
197
+ @torch.no_grad()
198
+ def predict_proposals(self, anchors, pred_objectness_logits, pred_anchor_deltas, image_sizes):
199
+ pred_proposals = self._decode_proposals(anchors, pred_anchor_deltas)
200
+ return find_top_rrpn_proposals(
201
+ pred_proposals,
202
+ pred_objectness_logits,
203
+ image_sizes,
204
+ self.nms_thresh,
205
+ self.pre_nms_topk[self.training],
206
+ self.post_nms_topk[self.training],
207
+ self.min_box_size,
208
+ self.training,
209
+ )
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/__init__.py ADDED
@@ -0,0 +1,29 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head, FastRCNNConvFCHead
+ from .keypoint_head import (
+ ROI_KEYPOINT_HEAD_REGISTRY,
+ build_keypoint_head,
+ BaseKeypointRCNNHead,
+ KRCNNConvDeconvUpsampleHead,
+ )
+ from .mask_head import (
+ ROI_MASK_HEAD_REGISTRY,
+ build_mask_head,
+ BaseMaskRCNNHead,
+ MaskRCNNConvUpsampleHead,
+ )
+ from .roi_heads import (
+ ROI_HEADS_REGISTRY,
+ ROIHeads,
+ Res5ROIHeads,
+ StandardROIHeads,
+ build_roi_heads,
+ select_foreground_proposals,
+ )
+ from .cascade_rcnn import CascadeROIHeads
+ from .rotated_fast_rcnn import RROIHeads
+ from .fast_rcnn import FastRCNNOutputLayers
+
+ from . import cascade_rcnn # isort:skip
+
+ __all__ = list(globals().keys())
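This package __init__ re-exports the ROI-head registry and builders, so a custom head can be selected by name through the config. A hypothetical sketch of that registry pattern, not part of this commit; the class name MyROIHeads and the commented cfg usage are illustrative assumptions:

from annotator.oneformer.detectron2.modeling.roi_heads import (
    ROI_HEADS_REGISTRY,
    StandardROIHeads,
    build_roi_heads,
)

@ROI_HEADS_REGISTRY.register()
class MyROIHeads(StandardROIHeads):
    # Hypothetical subclass; a real head would override its losses or forward logic.
    pass

# With a detectron2-style config, selecting the head is then just:
#   cfg.MODEL.ROI_HEADS.NAME = "MyROIHeads"
#   roi_heads = build_roi_heads(cfg, backbone.output_shape())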
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/box_head.py ADDED
@@ -0,0 +1,118 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import numpy as np
3
+ from typing import List
4
+ import fvcore.nn.weight_init as weight_init
5
+ import torch
6
+ from torch import nn
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.layers import Conv2d, ShapeSpec, get_norm
10
+ from annotator.oneformer.detectron2.utils.registry import Registry
11
+
12
+ __all__ = ["FastRCNNConvFCHead", "build_box_head", "ROI_BOX_HEAD_REGISTRY"]
13
+
14
+ ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD")
15
+ ROI_BOX_HEAD_REGISTRY.__doc__ = """
16
+ Registry for box heads, which make box predictions from per-region features.
17
+
18
+ The registered object will be called with `obj(cfg, input_shape)`.
19
+ """
20
+
21
+
22
+ # To get torchscript support, we make the head a subclass of `nn.Sequential`.
23
+ # Therefore, to add new layers in this head class, please make sure they are
24
+ # added in the order they will be used in forward().
25
+ @ROI_BOX_HEAD_REGISTRY.register()
26
+ class FastRCNNConvFCHead(nn.Sequential):
27
+ """
28
+ A head with several 3x3 conv layers (each followed by norm & relu) and then
29
+ several fc layers (each followed by relu).
30
+ """
31
+
32
+ @configurable
33
+ def __init__(
34
+ self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm=""
35
+ ):
36
+ """
37
+ NOTE: this interface is experimental.
38
+
39
+ Args:
40
+ input_shape (ShapeSpec): shape of the input feature.
41
+ conv_dims (list[int]): the output dimensions of the conv layers
42
+ fc_dims (list[int]): the output dimensions of the fc layers
43
+ conv_norm (str or callable): normalization for the conv layers.
44
+ See :func:`detectron2.layers.get_norm` for supported types.
45
+ """
46
+ super().__init__()
47
+ assert len(conv_dims) + len(fc_dims) > 0
48
+
49
+ self._output_size = (input_shape.channels, input_shape.height, input_shape.width)
50
+
51
+ self.conv_norm_relus = []
52
+ for k, conv_dim in enumerate(conv_dims):
53
+ conv = Conv2d(
54
+ self._output_size[0],
55
+ conv_dim,
56
+ kernel_size=3,
57
+ padding=1,
58
+ bias=not conv_norm,
59
+ norm=get_norm(conv_norm, conv_dim),
60
+ activation=nn.ReLU(),
61
+ )
62
+ self.add_module("conv{}".format(k + 1), conv)
63
+ self.conv_norm_relus.append(conv)
64
+ self._output_size = (conv_dim, self._output_size[1], self._output_size[2])
65
+
66
+ self.fcs = []
67
+ for k, fc_dim in enumerate(fc_dims):
68
+ if k == 0:
69
+ self.add_module("flatten", nn.Flatten())
70
+ fc = nn.Linear(int(np.prod(self._output_size)), fc_dim)
71
+ self.add_module("fc{}".format(k + 1), fc)
72
+ self.add_module("fc_relu{}".format(k + 1), nn.ReLU())
73
+ self.fcs.append(fc)
74
+ self._output_size = fc_dim
75
+
76
+ for layer in self.conv_norm_relus:
77
+ weight_init.c2_msra_fill(layer)
78
+ for layer in self.fcs:
79
+ weight_init.c2_xavier_fill(layer)
80
+
81
+ @classmethod
82
+ def from_config(cls, cfg, input_shape):
83
+ num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV
84
+ conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM
85
+ num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC
86
+ fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM
87
+ return {
88
+ "input_shape": input_shape,
89
+ "conv_dims": [conv_dim] * num_conv,
90
+ "fc_dims": [fc_dim] * num_fc,
91
+ "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM,
92
+ }
93
+
94
+ def forward(self, x):
95
+ for layer in self:
96
+ x = layer(x)
97
+ return x
98
+
99
+ @property
100
+ @torch.jit.unused
101
+ def output_shape(self):
102
+ """
103
+ Returns:
104
+ ShapeSpec: the output feature shape
105
+ """
106
+ o = self._output_size
107
+ if isinstance(o, int):
108
+ return ShapeSpec(channels=o)
109
+ else:
110
+ return ShapeSpec(channels=o[0], height=o[1], width=o[2])
111
+
112
+
113
+ def build_box_head(cfg, input_shape):
114
+ """
115
+ Build a box head defined by `cfg.MODEL.ROI_BOX_HEAD.NAME`.
116
+ """
117
+ name = cfg.MODEL.ROI_BOX_HEAD.NAME
118
+ return ROI_BOX_HEAD_REGISTRY.get(name)(cfg, input_shape)
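A small sanity-check sketch for FastRCNNConvFCHead, not part of this commit: the head can be built directly with explicit arguments (bypassing the cfg path), and the conv/fc dimensions below are illustrative values, not settings taken from this repository's configs.

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.box_head import FastRCNNConvFCHead

head = FastRCNNConvFCHead(
    ShapeSpec(channels=256, height=7, width=7),  # pooled RoI feature shape
    conv_dims=[256, 256],                        # two 3x3 conv layers
    fc_dims=[1024],                              # one fc layer
)
x = torch.randn(8, 256, 7, 7)                    # a batch of 8 RoIs
out = head(x)
print(out.shape)          # torch.Size([8, 1024])
print(head.output_shape)  # ShapeSpec(channels=1024, ...)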
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/cascade_rcnn.py ADDED
@@ -0,0 +1,299 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import List
3
+ import torch
4
+ from torch import nn
5
+ from torch.autograd.function import Function
6
+
7
+ from annotator.oneformer.detectron2.config import configurable
8
+ from annotator.oneformer.detectron2.layers import ShapeSpec
9
+ from annotator.oneformer.detectron2.structures import Boxes, Instances, pairwise_iou
10
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
11
+
12
+ from ..box_regression import Box2BoxTransform
13
+ from ..matcher import Matcher
14
+ from ..poolers import ROIPooler
15
+ from .box_head import build_box_head
16
+ from .fast_rcnn import FastRCNNOutputLayers, fast_rcnn_inference
17
+ from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads
18
+
19
+
20
+ class _ScaleGradient(Function):
21
+ @staticmethod
22
+ def forward(ctx, input, scale):
23
+ ctx.scale = scale
24
+ return input
25
+
26
+ @staticmethod
27
+ def backward(ctx, grad_output):
28
+ return grad_output * ctx.scale, None
29
+
30
+
31
+ @ROI_HEADS_REGISTRY.register()
32
+ class CascadeROIHeads(StandardROIHeads):
33
+ """
34
+ The ROI heads that implement :paper:`Cascade R-CNN`.
35
+ """
36
+
37
+ @configurable
38
+ def __init__(
39
+ self,
40
+ *,
41
+ box_in_features: List[str],
42
+ box_pooler: ROIPooler,
43
+ box_heads: List[nn.Module],
44
+ box_predictors: List[nn.Module],
45
+ proposal_matchers: List[Matcher],
46
+ **kwargs,
47
+ ):
48
+ """
49
+ NOTE: this interface is experimental.
50
+
51
+ Args:
52
+ box_pooler (ROIPooler): pooler that extracts region features from given boxes
53
+ box_heads (list[nn.Module]): box head for each cascade stage
54
+ box_predictors (list[nn.Module]): box predictor for each cascade stage
55
+ proposal_matchers (list[Matcher]): matcher with different IoU thresholds to
56
+ match boxes with ground truth for each stage. The first matcher matches
57
+ RPN proposals with ground truth, the other matchers use boxes predicted
58
+ by the previous stage as proposals and match them with ground truth.
59
+ """
60
+ assert "proposal_matcher" not in kwargs, (
61
+ "CascadeROIHeads takes 'proposal_matchers=' for each stage instead "
62
+ "of one 'proposal_matcher='."
63
+ )
64
+ # The first matcher matches RPN proposals with ground truth, done in the base class
65
+ kwargs["proposal_matcher"] = proposal_matchers[0]
66
+ num_stages = self.num_cascade_stages = len(box_heads)
67
+ box_heads = nn.ModuleList(box_heads)
68
+ box_predictors = nn.ModuleList(box_predictors)
69
+ assert len(box_predictors) == num_stages, f"{len(box_predictors)} != {num_stages}!"
70
+ assert len(proposal_matchers) == num_stages, f"{len(proposal_matchers)} != {num_stages}!"
71
+ super().__init__(
72
+ box_in_features=box_in_features,
73
+ box_pooler=box_pooler,
74
+ box_head=box_heads,
75
+ box_predictor=box_predictors,
76
+ **kwargs,
77
+ )
78
+ self.proposal_matchers = proposal_matchers
79
+
80
+ @classmethod
81
+ def from_config(cls, cfg, input_shape):
82
+ ret = super().from_config(cfg, input_shape)
83
+ ret.pop("proposal_matcher")
84
+ return ret
85
+
86
+ @classmethod
87
+ def _init_box_head(cls, cfg, input_shape):
88
+ # fmt: off
89
+ in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
90
+ pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
91
+ pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features)
92
+ sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
93
+ pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
94
+ cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
95
+ cascade_ious = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
96
+ assert len(cascade_bbox_reg_weights) == len(cascade_ious)
97
+ assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG, \
98
+ "CascadeROIHeads only support class-agnostic regression now!"
99
+ assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
100
+ # fmt: on
101
+
102
+ in_channels = [input_shape[f].channels for f in in_features]
103
+ # Check all channel counts are equal
104
+ assert len(set(in_channels)) == 1, in_channels
105
+ in_channels = in_channels[0]
106
+
107
+ box_pooler = ROIPooler(
108
+ output_size=pooler_resolution,
109
+ scales=pooler_scales,
110
+ sampling_ratio=sampling_ratio,
111
+ pooler_type=pooler_type,
112
+ )
113
+ pooled_shape = ShapeSpec(
114
+ channels=in_channels, width=pooler_resolution, height=pooler_resolution
115
+ )
116
+
117
+ box_heads, box_predictors, proposal_matchers = [], [], []
118
+ for match_iou, bbox_reg_weights in zip(cascade_ious, cascade_bbox_reg_weights):
119
+ box_head = build_box_head(cfg, pooled_shape)
120
+ box_heads.append(box_head)
121
+ box_predictors.append(
122
+ FastRCNNOutputLayers(
123
+ cfg,
124
+ box_head.output_shape,
125
+ box2box_transform=Box2BoxTransform(weights=bbox_reg_weights),
126
+ )
127
+ )
128
+ proposal_matchers.append(Matcher([match_iou], [0, 1], allow_low_quality_matches=False))
129
+ return {
130
+ "box_in_features": in_features,
131
+ "box_pooler": box_pooler,
132
+ "box_heads": box_heads,
133
+ "box_predictors": box_predictors,
134
+ "proposal_matchers": proposal_matchers,
135
+ }
136
+
137
+ def forward(self, images, features, proposals, targets=None):
138
+ del images
139
+ if self.training:
140
+ proposals = self.label_and_sample_proposals(proposals, targets)
141
+
142
+ if self.training:
143
+ # Need targets to box head
144
+ losses = self._forward_box(features, proposals, targets)
145
+ losses.update(self._forward_mask(features, proposals))
146
+ losses.update(self._forward_keypoint(features, proposals))
147
+ return proposals, losses
148
+ else:
149
+ pred_instances = self._forward_box(features, proposals)
150
+ pred_instances = self.forward_with_given_boxes(features, pred_instances)
151
+ return pred_instances, {}
152
+
153
+ def _forward_box(self, features, proposals, targets=None):
154
+ """
155
+ Args:
156
+ features, targets: the same as in :meth:`ROIHeads.forward`.
158
+ proposals (list[Instances]): the per-image object proposals with
159
+ their matching ground truth.
160
+ Each has fields "proposal_boxes", and "objectness_logits",
161
+ "gt_classes", "gt_boxes".
162
+ """
163
+ features = [features[f] for f in self.box_in_features]
164
+ head_outputs = [] # (predictor, predictions, proposals)
165
+ prev_pred_boxes = None
166
+ image_sizes = [x.image_size for x in proposals]
167
+ for k in range(self.num_cascade_stages):
168
+ if k > 0:
169
+ # The output boxes of the previous stage are used to create the input
170
+ # proposals of the next stage.
171
+ proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes)
172
+ if self.training:
173
+ proposals = self._match_and_label_boxes(proposals, k, targets)
174
+ predictions = self._run_stage(features, proposals, k)
175
+ prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals)
176
+ head_outputs.append((self.box_predictor[k], predictions, proposals))
177
+
178
+ if self.training:
179
+ losses = {}
180
+ storage = get_event_storage()
181
+ for stage, (predictor, predictions, proposals) in enumerate(head_outputs):
182
+ with storage.name_scope("stage{}".format(stage)):
183
+ stage_losses = predictor.losses(predictions, proposals)
184
+ losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()})
185
+ return losses
186
+ else:
187
+ # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1)
188
+ scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs]
189
+
190
+ # Average the scores across heads
191
+ scores = [
192
+ sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages)
193
+ for scores_per_image in zip(*scores_per_stage)
194
+ ]
195
+ # Use the boxes of the last head
196
+ predictor, predictions, proposals = head_outputs[-1]
197
+ boxes = predictor.predict_boxes(predictions, proposals)
198
+ pred_instances, _ = fast_rcnn_inference(
199
+ boxes,
200
+ scores,
201
+ image_sizes,
202
+ predictor.test_score_thresh,
203
+ predictor.test_nms_thresh,
204
+ predictor.test_topk_per_image,
205
+ )
206
+ return pred_instances
207
+
208
+ @torch.no_grad()
209
+ def _match_and_label_boxes(self, proposals, stage, targets):
210
+ """
211
+ Match proposals with groundtruth using the matcher at the given stage.
212
+ Label the proposals as foreground or background based on the match.
213
+
214
+ Args:
215
+ proposals (list[Instances]): One Instances for each image, with
216
+ the field "proposal_boxes".
217
+ stage (int): the current stage
218
+ targets (list[Instances]): the ground truth instances
219
+
220
+ Returns:
221
+ list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes"
222
+ """
223
+ num_fg_samples, num_bg_samples = [], []
224
+ for proposals_per_image, targets_per_image in zip(proposals, targets):
225
+ match_quality_matrix = pairwise_iou(
226
+ targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
227
+ )
228
+ # proposal_labels are 0 or 1
229
+ matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix)
230
+ if len(targets_per_image) > 0:
231
+ gt_classes = targets_per_image.gt_classes[matched_idxs]
232
+ # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
233
+ gt_classes[proposal_labels == 0] = self.num_classes
234
+ gt_boxes = targets_per_image.gt_boxes[matched_idxs]
235
+ else:
236
+ gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
237
+ gt_boxes = Boxes(
238
+ targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4))
239
+ )
240
+ proposals_per_image.gt_classes = gt_classes
241
+ proposals_per_image.gt_boxes = gt_boxes
242
+
243
+ num_fg_samples.append((proposal_labels == 1).sum().item())
244
+ num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1])
245
+
246
+ # Log the number of fg/bg samples in each stage
247
+ storage = get_event_storage()
248
+ storage.put_scalar(
249
+ "stage{}/roi_head/num_fg_samples".format(stage),
250
+ sum(num_fg_samples) / len(num_fg_samples),
251
+ )
252
+ storage.put_scalar(
253
+ "stage{}/roi_head/num_bg_samples".format(stage),
254
+ sum(num_bg_samples) / len(num_bg_samples),
255
+ )
256
+ return proposals
257
+
258
+ def _run_stage(self, features, proposals, stage):
259
+ """
260
+ Args:
261
+ features (list[Tensor]): #lvl input features to ROIHeads
262
+ proposals (list[Instances]): #image Instances, with the field "proposal_boxes"
263
+ stage (int): the current stage
264
+
265
+ Returns:
266
+ Same output as `FastRCNNOutputLayers.forward()`.
267
+ """
268
+ box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
269
+ # The original implementation averages the losses among heads,
270
+ # but scale up the parameter gradients of the heads.
271
+ # This is equivalent to adding the losses among heads,
272
+ # but scale down the gradients on features.
273
+ if self.training:
274
+ box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages)
275
+ box_features = self.box_head[stage](box_features)
276
+ return self.box_predictor[stage](box_features)
277
+
278
+ def _create_proposals_from_boxes(self, boxes, image_sizes):
279
+ """
280
+ Args:
281
+ boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4
282
+ image_sizes (list[tuple]): list of image shapes in (h, w)
283
+
284
+ Returns:
285
+ list[Instances]: per-image proposals with the given boxes.
286
+ """
287
+ # Just like RPN, the proposals should not have gradients
288
+ boxes = [Boxes(b.detach()) for b in boxes]
289
+ proposals = []
290
+ for boxes_per_image, image_size in zip(boxes, image_sizes):
291
+ boxes_per_image.clip(image_size)
292
+ if self.training:
293
+ # do not filter empty boxes at inference time,
294
+ # because the scores from each stage need to be aligned and added later
295
+ boxes_per_image = boxes_per_image[boxes_per_image.nonempty()]
296
+ prop = Instances(image_size)
297
+ prop.proposal_boxes = boxes_per_image
298
+ proposals.append(prop)
299
+ return proposals
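The _ScaleGradient trick used in _run_stage is easy to verify in isolation: the forward pass is the identity, while the gradient flowing back into the shared pooled features is scaled by 1 / num_cascade_stages. A minimal check, not part of this commit, assuming the vendored module path:

import torch
from annotator.oneformer.detectron2.modeling.roi_heads.cascade_rcnn import _ScaleGradient

x = torch.ones(3, requires_grad=True)
y = _ScaleGradient.apply(x, 1.0 / 3)  # e.g. three cascade stages
assert torch.equal(y, x)              # forward is the identity
y.sum().backward()
print(x.grad)                         # tensor([0.3333, 0.3333, 0.3333])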
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/fast_rcnn.py ADDED
@@ -0,0 +1,569 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import logging
3
+ from typing import Callable, Dict, List, Optional, Tuple, Union
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.data.detection_utils import get_fed_loss_cls_weights
10
+ from annotator.oneformer.detectron2.layers import ShapeSpec, batched_nms, cat, cross_entropy, nonzero_tuple
11
+ from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransform, _dense_box_regression_loss
12
+ from annotator.oneformer.detectron2.structures import Boxes, Instances
13
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
14
+
15
+ __all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"]
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ """
21
+ Shape shorthand in this module:
22
+
23
+ N: number of images in the minibatch
24
+ R: number of ROIs, combined over all images, in the minibatch
25
+ Ri: number of ROIs in image i
26
+ K: number of foreground classes. E.g.,there are 80 foreground classes in COCO.
27
+
28
+ Naming convention:
29
+
30
+ deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box
31
+ transform (see :class:`box_regression.Box2BoxTransform`).
32
+
33
+ pred_class_logits: predicted class scores in [-inf, +inf]; use
34
+ softmax(pred_class_logits) to estimate P(class).
35
+
36
+ gt_classes: ground-truth classification labels in [0, K], where [0, K) represent
37
+ foreground object classes and K represents the background class.
38
+
39
+ pred_proposal_deltas: predicted box2box transform deltas for transforming proposals
40
+ to detection box predictions.
41
+
42
+ gt_proposal_deltas: ground-truth box2box transform deltas
43
+ """
44
+
45
+
46
+ def fast_rcnn_inference(
47
+ boxes: List[torch.Tensor],
48
+ scores: List[torch.Tensor],
49
+ image_shapes: List[Tuple[int, int]],
50
+ score_thresh: float,
51
+ nms_thresh: float,
52
+ topk_per_image: int,
53
+ ):
54
+ """
55
+ Call `fast_rcnn_inference_single_image` for all images.
56
+
57
+ Args:
58
+ boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic
59
+ boxes for each image. Element i has shape (Ri, K * 4) if doing
60
+ class-specific regression, or (Ri, 4) if doing class-agnostic
61
+ regression, where Ri is the number of predicted objects for image i.
62
+ This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`.
63
+ scores (list[Tensor]): A list of Tensors of predicted class scores for each image.
64
+ Element i has shape (Ri, K + 1), where Ri is the number of predicted objects
65
+ for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`.
66
+ image_shapes (list[tuple]): A list of (height, width) tuples for each image in the batch.
67
+ score_thresh (float): Only return detections with a confidence score exceeding this
68
+ threshold.
69
+ nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1].
70
+ topk_per_image (int): The number of top scoring detections to return. Set < 0 to return
71
+ all detections.
72
+
73
+ Returns:
74
+ instances: (list[Instances]): A list of N instances, one for each image in the batch,
75
+ that stores the topk most confident detections.
76
+ kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates
77
+ the corresponding boxes/scores index in [0, Ri) from the input, for image i.
78
+ """
79
+ result_per_image = [
80
+ fast_rcnn_inference_single_image(
81
+ boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image
82
+ )
83
+ for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes)
84
+ ]
85
+ return [x[0] for x in result_per_image], [x[1] for x in result_per_image]
86
+
87
+
88
+ def _log_classification_stats(pred_logits, gt_classes, prefix="fast_rcnn"):
89
+ """
90
+ Log the classification metrics to EventStorage.
91
+
92
+ Args:
93
+ pred_logits: Rx(K+1) logits. The last column is for background class.
94
+ gt_classes: R labels
95
+ """
96
+ num_instances = gt_classes.numel()
97
+ if num_instances == 0:
98
+ return
99
+ pred_classes = pred_logits.argmax(dim=1)
100
+ bg_class_ind = pred_logits.shape[1] - 1
101
+
102
+ fg_inds = (gt_classes >= 0) & (gt_classes < bg_class_ind)
103
+ num_fg = fg_inds.nonzero().numel()
104
+ fg_gt_classes = gt_classes[fg_inds]
105
+ fg_pred_classes = pred_classes[fg_inds]
106
+
107
+ num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel()
108
+ num_accurate = (pred_classes == gt_classes).nonzero().numel()
109
+ fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel()
110
+
111
+ storage = get_event_storage()
112
+ storage.put_scalar(f"{prefix}/cls_accuracy", num_accurate / num_instances)
113
+ if num_fg > 0:
114
+ storage.put_scalar(f"{prefix}/fg_cls_accuracy", fg_num_accurate / num_fg)
115
+ storage.put_scalar(f"{prefix}/false_negative", num_false_negative / num_fg)
116
+
117
+
118
+ def fast_rcnn_inference_single_image(
119
+ boxes,
120
+ scores,
121
+ image_shape: Tuple[int, int],
122
+ score_thresh: float,
123
+ nms_thresh: float,
124
+ topk_per_image: int,
125
+ ):
126
+ """
127
+ Single-image inference. Return bounding-box detection results by thresholding
128
+ on scores and applying non-maximum suppression (NMS).
129
+
130
+ Args:
131
+ Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes
132
+ per image.
133
+
134
+ Returns:
135
+ Same as `fast_rcnn_inference`, but for only one image.
136
+ """
137
+ valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
138
+ if not valid_mask.all():
139
+ boxes = boxes[valid_mask]
140
+ scores = scores[valid_mask]
141
+
142
+ scores = scores[:, :-1]
143
+ num_bbox_reg_classes = boxes.shape[1] // 4
144
+ # Convert to Boxes to use the `clip` function ...
145
+ boxes = Boxes(boxes.reshape(-1, 4))
146
+ boxes.clip(image_shape)
147
+ boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4
148
+
149
+ # 1. Filter results based on detection scores. It can make NMS more efficient
150
+ # by filtering out low-confidence detections.
151
+ filter_mask = scores > score_thresh # R x K
152
+ # R' x 2. First column contains indices of the R predictions;
153
+ # Second column contains indices of classes.
154
+ filter_inds = filter_mask.nonzero()
155
+ if num_bbox_reg_classes == 1:
156
+ boxes = boxes[filter_inds[:, 0], 0]
157
+ else:
158
+ boxes = boxes[filter_mask]
159
+ scores = scores[filter_mask]
160
+
161
+ # 2. Apply NMS for each class independently.
162
+ keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
163
+ if topk_per_image >= 0:
164
+ keep = keep[:topk_per_image]
165
+ boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep]
166
+
167
+ result = Instances(image_shape)
168
+ result.pred_boxes = Boxes(boxes)
169
+ result.scores = scores
170
+ result.pred_classes = filter_inds[:, 1]
171
+ return result, filter_inds[:, 0]
172
+
173
+
174
+ class FastRCNNOutputLayers(nn.Module):
175
+ """
176
+ Two linear layers for predicting Fast R-CNN outputs:
177
+
178
+ 1. proposal-to-detection box regression deltas
179
+ 2. classification scores
180
+ """
181
+
182
+ @configurable
183
+ def __init__(
184
+ self,
185
+ input_shape: ShapeSpec,
186
+ *,
187
+ box2box_transform,
188
+ num_classes: int,
189
+ test_score_thresh: float = 0.0,
190
+ test_nms_thresh: float = 0.5,
191
+ test_topk_per_image: int = 100,
192
+ cls_agnostic_bbox_reg: bool = False,
193
+ smooth_l1_beta: float = 0.0,
194
+ box_reg_loss_type: str = "smooth_l1",
195
+ loss_weight: Union[float, Dict[str, float]] = 1.0,
196
+ use_fed_loss: bool = False,
197
+ use_sigmoid_ce: bool = False,
198
+ get_fed_loss_cls_weights: Optional[Callable] = None,
199
+ fed_loss_num_classes: int = 50,
200
+ ):
201
+ """
202
+ NOTE: this interface is experimental.
203
+
204
+ Args:
205
+ input_shape (ShapeSpec): shape of the input feature to this module
206
+ box2box_transform (Box2BoxTransform or Box2BoxTransformRotated):
207
+ num_classes (int): number of foreground classes
208
+ test_score_thresh (float): threshold to filter predictions results.
209
+ test_nms_thresh (float): NMS threshold for prediction results.
210
+ test_topk_per_image (int): number of top predictions to produce per image.
211
+ cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression
212
+ smooth_l1_beta (float): transition point from L1 to L2 loss. Only used if
213
+ `box_reg_loss_type` is "smooth_l1"
214
+ box_reg_loss_type (str): Box regression loss type. One of: "smooth_l1", "giou",
215
+ "diou", "ciou"
216
+ loss_weight (float|dict): weights to use for losses. Can be single float for weighting
217
+ all losses, or a dict of individual weightings. Valid dict keys are:
218
+ * "loss_cls": applied to classification loss
219
+ * "loss_box_reg": applied to box regression loss
220
+ use_fed_loss (bool): whether to use federated loss which samples additional negative
221
+ classes to calculate the loss
222
+ use_sigmoid_ce (bool): whether to calculate the loss using weighted average of binary
223
+ cross entropy with logits. This could be used together with federated loss
224
+ get_fed_loss_cls_weights (Callable): a callable which takes dataset name and frequency
225
+ weight power, and returns the probabilities to sample negative classes for
226
+ federated loss. The implementation can be found in
227
+ detectron2/data/detection_utils.py
228
+ fed_loss_num_classes (int): number of federated classes to keep in total
229
+ """
230
+ super().__init__()
231
+ if isinstance(input_shape, int): # some backward compatibility
232
+ input_shape = ShapeSpec(channels=input_shape)
233
+ self.num_classes = num_classes
234
+ input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1)
235
+ # prediction layer for num_classes foreground classes and one background class (hence + 1)
236
+ self.cls_score = nn.Linear(input_size, num_classes + 1)
237
+ num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes
238
+ box_dim = len(box2box_transform.weights)
239
+ self.bbox_pred = nn.Linear(input_size, num_bbox_reg_classes * box_dim)
240
+
241
+ nn.init.normal_(self.cls_score.weight, std=0.01)
242
+ nn.init.normal_(self.bbox_pred.weight, std=0.001)
243
+ for l in [self.cls_score, self.bbox_pred]:
244
+ nn.init.constant_(l.bias, 0)
245
+
246
+ self.box2box_transform = box2box_transform
247
+ self.smooth_l1_beta = smooth_l1_beta
248
+ self.test_score_thresh = test_score_thresh
249
+ self.test_nms_thresh = test_nms_thresh
250
+ self.test_topk_per_image = test_topk_per_image
251
+ self.box_reg_loss_type = box_reg_loss_type
252
+ if isinstance(loss_weight, float):
253
+ loss_weight = {"loss_cls": loss_weight, "loss_box_reg": loss_weight}
254
+ self.loss_weight = loss_weight
255
+ self.use_fed_loss = use_fed_loss
256
+ self.use_sigmoid_ce = use_sigmoid_ce
257
+ self.fed_loss_num_classes = fed_loss_num_classes
258
+
259
+ if self.use_fed_loss:
260
+ assert self.use_sigmoid_ce, "Please use sigmoid cross entropy loss with federated loss"
261
+ fed_loss_cls_weights = get_fed_loss_cls_weights()
262
+ assert (
263
+ len(fed_loss_cls_weights) == self.num_classes
264
+ ), "Please check the provided fed_loss_cls_weights. Their size should match num_classes"
265
+ self.register_buffer("fed_loss_cls_weights", fed_loss_cls_weights)
266
+
267
+ @classmethod
268
+ def from_config(cls, cfg, input_shape):
269
+ return {
270
+ "input_shape": input_shape,
271
+ "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS),
272
+ # fmt: off
273
+ "num_classes" : cfg.MODEL.ROI_HEADS.NUM_CLASSES,
274
+ "cls_agnostic_bbox_reg" : cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,
275
+ "smooth_l1_beta" : cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA,
276
+ "test_score_thresh" : cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST,
277
+ "test_nms_thresh" : cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
278
+ "test_topk_per_image" : cfg.TEST.DETECTIONS_PER_IMAGE,
279
+ "box_reg_loss_type" : cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_TYPE,
280
+ "loss_weight" : {"loss_box_reg": cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_LOSS_WEIGHT}, # noqa
281
+ "use_fed_loss" : cfg.MODEL.ROI_BOX_HEAD.USE_FED_LOSS,
282
+ "use_sigmoid_ce" : cfg.MODEL.ROI_BOX_HEAD.USE_SIGMOID_CE,
283
+ "get_fed_loss_cls_weights" : lambda: get_fed_loss_cls_weights(dataset_names=cfg.DATASETS.TRAIN, freq_weight_power=cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_FREQ_WEIGHT_POWER), # noqa
284
+ "fed_loss_num_classes" : cfg.MODEL.ROI_BOX_HEAD.FED_LOSS_NUM_CLASSES,
285
+ # fmt: on
286
+ }
287
+
288
+ def forward(self, x):
289
+ """
290
+ Args:
291
+ x: per-region features of shape (N, ...) for N bounding boxes to predict.
292
+
293
+ Returns:
294
+ (Tensor, Tensor):
295
+ First tensor: shape (N,K+1), scores for each of the N boxes. Each row contains the
296
+ scores for K object categories and 1 background class.
297
+
298
+ Second tensor: bounding box regression deltas for each box. Shape is shape (N,Kx4),
299
+ or (N,4) for class-agnostic regression.
300
+ """
301
+ if x.dim() > 2:
302
+ x = torch.flatten(x, start_dim=1)
303
+ scores = self.cls_score(x)
304
+ proposal_deltas = self.bbox_pred(x)
305
+ return scores, proposal_deltas
306
+
307
+ def losses(self, predictions, proposals):
308
+ """
309
+ Args:
310
+ predictions: return values of :meth:`forward()`.
311
+ proposals (list[Instances]): proposals that match the features that were used
312
+ to compute predictions. The fields ``proposal_boxes``, ``gt_boxes``,
313
+ ``gt_classes`` are expected.
314
+
315
+ Returns:
316
+ Dict[str, Tensor]: dict of losses
317
+ """
318
+ scores, proposal_deltas = predictions
319
+
320
+ # parse classification outputs
321
+ gt_classes = (
322
+ cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)
323
+ )
324
+ _log_classification_stats(scores, gt_classes)
325
+
326
+ # parse box regression outputs
327
+ if len(proposals):
328
+ proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0) # Nx4
329
+ assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
330
+ # If "gt_boxes" does not exist, the proposals must be all negative and
331
+ # should not be included in regression loss computation.
332
+ # Here we just use proposal_boxes as an arbitrary placeholder because its
333
+ # value won't be used in self.box_reg_loss().
334
+ gt_boxes = cat(
335
+ [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
336
+ dim=0,
337
+ )
338
+ else:
339
+ proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)
340
+
341
+ if self.use_sigmoid_ce:
342
+ loss_cls = self.sigmoid_cross_entropy_loss(scores, gt_classes)
343
+ else:
344
+ loss_cls = cross_entropy(scores, gt_classes, reduction="mean")
345
+
346
+ losses = {
347
+ "loss_cls": loss_cls,
348
+ "loss_box_reg": self.box_reg_loss(
349
+ proposal_boxes, gt_boxes, proposal_deltas, gt_classes
350
+ ),
351
+ }
352
+ return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}
353
+
354
+ # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/fed_loss.py # noqa
355
+ # with slight modifications
356
+ def get_fed_loss_classes(self, gt_classes, num_fed_loss_classes, num_classes, weight):
357
+ """
358
+ Args:
359
+ gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
360
+ num_fed_loss_classes: minimum number of classes to keep when calculating federated loss.
361
+ Will sample negative classes if number of unique gt_classes is smaller than this value.
362
+ num_classes: number of foreground classes
363
+ weight: probabilities used to sample negative classes
364
+
365
+ Returns:
366
+ Tensor:
367
+ classes to keep when calculating the federated loss, including both unique gt
368
+ classes and sampled negative classes.
369
+ """
370
+ unique_gt_classes = torch.unique(gt_classes)
371
+ prob = unique_gt_classes.new_ones(num_classes + 1).float()
372
+ prob[-1] = 0
373
+ if len(unique_gt_classes) < num_fed_loss_classes:
374
+ prob[:num_classes] = weight.float().clone()
375
+ prob[unique_gt_classes] = 0
376
+ sampled_negative_classes = torch.multinomial(
377
+ prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False
378
+ )
379
+ fed_loss_classes = torch.cat([unique_gt_classes, sampled_negative_classes])
380
+ else:
381
+ fed_loss_classes = unique_gt_classes
382
+ return fed_loss_classes
383
+
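+ # A hedged, standalone illustration (not part of this commit) of what
+ # get_fed_loss_classes does: keep the unique ground-truth classes and, if there
+ # are fewer than num_fed_loss_classes of them, sample extra negative classes
+ # (background excluded) according to the provided weights. All values below are
+ # made up for illustration.
+ #
+ #     import torch
+ #     num_classes, num_fed_loss_classes = 6, 4
+ #     gt_classes = torch.tensor([1, 1, 3])          # unique gt classes: {1, 3}
+ #     weight = torch.ones(num_classes)              # uniform sampling weights
+ #
+ #     unique_gt_classes = torch.unique(gt_classes)
+ #     prob = unique_gt_classes.new_ones(num_classes + 1).float()
+ #     prob[-1] = 0                                  # never sample the background slot
+ #     prob[:num_classes] = weight.float().clone()
+ #     prob[unique_gt_classes] = 0                   # never re-sample the gt classes
+ #     sampled = torch.multinomial(
+ #         prob, num_fed_loss_classes - len(unique_gt_classes), replacement=False
+ #     )
+ #     fed_loss_classes = torch.cat([unique_gt_classes, sampled])
+ #     print(fed_loss_classes)                       # {1, 3} plus two sampled negatives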
384
+ # Implementation from https://github.com/xingyizhou/CenterNet2/blob/master/projects/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py#L113 # noqa
385
+ # with slight modifications
386
+ def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes):
387
+ """
388
+ Args:
389
+ pred_class_logits: shape (N, K+1), scores for each of the N boxes. Each row contains the
390
+ scores for K object categories and 1 background class
391
+ gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
392
+ """
393
+ if pred_class_logits.numel() == 0:
394
+ return pred_class_logits.new_zeros([1])[0]
395
+
396
+ N = pred_class_logits.shape[0]
397
+ K = pred_class_logits.shape[1] - 1
398
+
399
+ target = pred_class_logits.new_zeros(N, K + 1)
400
+ target[range(len(gt_classes)), gt_classes] = 1
401
+ target = target[:, :K]
402
+
403
+ cls_loss = F.binary_cross_entropy_with_logits(
404
+ pred_class_logits[:, :-1], target, reduction="none"
405
+ )
406
+
407
+ if self.use_fed_loss:
408
+ fed_loss_classes = self.get_fed_loss_classes(
409
+ gt_classes,
410
+ num_fed_loss_classes=self.fed_loss_num_classes,
411
+ num_classes=K,
412
+ weight=self.fed_loss_cls_weights,
413
+ )
414
+ fed_loss_classes_mask = fed_loss_classes.new_zeros(K + 1)
415
+ fed_loss_classes_mask[fed_loss_classes] = 1
416
+ fed_loss_classes_mask = fed_loss_classes_mask[:K]
417
+ weight = fed_loss_classes_mask.view(1, K).expand(N, K).float()
418
+ else:
419
+ weight = 1
420
+
421
+ loss = torch.sum(cls_loss * weight) / N
422
+ return loss
423
+
424
+ def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes):
425
+ """
426
+ Args:
427
+ proposal_boxes/gt_boxes are tensors with the same shape (R, 4 or 5).
428
+ pred_deltas has shape (R, 4 or 5), or (R, num_classes * (4 or 5)).
429
+ gt_classes is a long tensor of shape R, the gt class label of each proposal.
430
+ R shall be the number of proposals.
431
+ """
432
+ box_dim = proposal_boxes.shape[1] # 4 or 5
433
+ # Regression loss is only computed for foreground proposals (those matched to a GT)
434
+ fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
435
+ if pred_deltas.shape[1] == box_dim: # cls-agnostic regression
436
+ fg_pred_deltas = pred_deltas[fg_inds]
437
+ else:
438
+ fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
439
+ fg_inds, gt_classes[fg_inds]
440
+ ]
441
+
442
+ loss_box_reg = _dense_box_regression_loss(
443
+ [proposal_boxes[fg_inds]],
444
+ self.box2box_transform,
445
+ [fg_pred_deltas.unsqueeze(0)],
446
+ [gt_boxes[fg_inds]],
447
+ ...,
448
+ self.box_reg_loss_type,
449
+ self.smooth_l1_beta,
450
+ )
451
+
452
+ # The reg loss is normalized using the total number of regions (R), not the number
453
+ # of foreground regions even though the box regression loss is only defined on
454
+ # foreground regions. Why? Because doing so gives equal training influence to
455
+ # each foreground example. To see how, consider two different minibatches:
456
+ # (1) Contains a single foreground region
457
+ # (2) Contains 100 foreground regions
458
+ # If we normalize by the number of foreground regions, the single example in
459
+ # minibatch (1) will be given 100 times as much influence as each foreground
460
+ # example in minibatch (2). Normalizing by the total number of regions, R,
461
+ # means that the single example in minibatch (1) and each of the 100 examples
462
+ # in minibatch (2) are given equal influence.
463
+ return loss_box_reg / max(gt_classes.numel(), 1.0) # return 0 if empty
464
+
465
+ def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
466
+ """
467
+ Args:
468
+ predictions: return values of :meth:`forward()`.
469
+ proposals (list[Instances]): proposals that match the features that were
470
+ used to compute predictions. The ``proposal_boxes`` field is expected.
471
+
472
+ Returns:
473
+ list[Instances]: same as `fast_rcnn_inference`.
474
+ list[Tensor]: same as `fast_rcnn_inference`.
475
+ """
476
+ boxes = self.predict_boxes(predictions, proposals)
477
+ scores = self.predict_probs(predictions, proposals)
478
+ image_shapes = [x.image_size for x in proposals]
479
+ return fast_rcnn_inference(
480
+ boxes,
481
+ scores,
482
+ image_shapes,
483
+ self.test_score_thresh,
484
+ self.test_nms_thresh,
485
+ self.test_topk_per_image,
486
+ )
487
+
488
+ def predict_boxes_for_gt_classes(self, predictions, proposals):
489
+ """
490
+ Args:
491
+ predictions: return values of :meth:`forward()`.
492
+ proposals (list[Instances]): proposals that match the features that were used
493
+ to compute predictions. The fields ``proposal_boxes``, ``gt_classes`` are expected.
494
+
495
+ Returns:
496
+ list[Tensor]:
497
+ A list of Tensors of predicted boxes for GT classes in case of
498
+ class-specific box head. Element i of the list has shape (Ri, B), where Ri is
499
+ the number of proposals for image i and B is the box dimension (4 or 5)
500
+ """
501
+ if not len(proposals):
502
+ return []
503
+ scores, proposal_deltas = predictions
504
+ proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
505
+ N, B = proposal_boxes.shape
506
+ predict_boxes = self.box2box_transform.apply_deltas(
507
+ proposal_deltas, proposal_boxes
508
+ ) # Nx(KxB)
509
+
510
+ K = predict_boxes.shape[1] // B
511
+ if K > 1:
512
+ gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0)
513
+ # Some proposals are ignored or have a background class. Their gt_classes
514
+ # cannot be used as index.
515
+ gt_classes = gt_classes.clamp_(0, K - 1)
516
+
517
+ predict_boxes = predict_boxes.view(N, K, B)[
518
+ torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes
519
+ ]
520
+ num_prop_per_image = [len(p) for p in proposals]
521
+ return predict_boxes.split(num_prop_per_image)
522
+
523
+ def predict_boxes(
524
+ self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
525
+ ):
526
+ """
527
+ Args:
528
+ predictions: return values of :meth:`forward()`.
529
+ proposals (list[Instances]): proposals that match the features that were
530
+ used to compute predictions. The ``proposal_boxes`` field is expected.
531
+
532
+ Returns:
533
+ list[Tensor]:
534
+ A list of Tensors of predicted class-specific or class-agnostic boxes
535
+ for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is
536
+ the number of proposals for image i and B is the box dimension (4 or 5)
537
+ """
538
+ if not len(proposals):
539
+ return []
540
+ _, proposal_deltas = predictions
541
+ num_prop_per_image = [len(p) for p in proposals]
542
+ proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)
543
+ predict_boxes = self.box2box_transform.apply_deltas(
544
+ proposal_deltas,
545
+ proposal_boxes,
546
+ ) # Nx(KxB)
547
+ return predict_boxes.split(num_prop_per_image)
548
+
549
+ def predict_probs(
550
+ self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]
551
+ ):
552
+ """
553
+ Args:
554
+ predictions: return values of :meth:`forward()`.
555
+ proposals (list[Instances]): proposals that match the features that were
556
+ used to compute predictions.
557
+
558
+ Returns:
559
+ list[Tensor]:
560
+ A list of Tensors of predicted class probabilities for each image.
561
+ Element i has shape (Ri, K + 1), where Ri is the number of proposals for image i.
562
+ """
563
+ scores, _ = predictions
564
+ num_inst_per_image = [len(p) for p in proposals]
565
+ if self.use_sigmoid_ce:
566
+ probs = scores.sigmoid()
567
+ else:
568
+ probs = F.softmax(scores, dim=-1)
569
+ return probs.split(num_inst_per_image, dim=0)
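A hedged usage sketch for FastRCNNOutputLayers, not part of this commit: constructing it with explicit, illustrative arguments and pushing a dummy batch of pooled box features through it shows the (K + 1)-way scores and the per-class regression deltas.

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.box_regression import Box2BoxTransform
from annotator.oneformer.detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers

layers = FastRCNNOutputLayers(
    ShapeSpec(channels=1024),  # output shape of the box head
    box2box_transform=Box2BoxTransform(weights=(10.0, 10.0, 5.0, 5.0)),
    num_classes=80,
)
feats = torch.randn(16, 1024)      # 16 RoIs
scores, deltas = layers(feats)
print(scores.shape)                # torch.Size([16, 81]): 80 classes + background
print(deltas.shape)                # torch.Size([16, 320]): per-class (dx, dy, dw, dh)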
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/keypoint_head.py ADDED
@@ -0,0 +1,272 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import List
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from annotator.oneformer.detectron2.config import configurable
8
+ from annotator.oneformer.detectron2.layers import Conv2d, ConvTranspose2d, cat, interpolate
9
+ from annotator.oneformer.detectron2.structures import Instances, heatmaps_to_keypoints
10
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
11
+ from annotator.oneformer.detectron2.utils.registry import Registry
12
+
13
+ _TOTAL_SKIPPED = 0
14
+
15
+
16
+ __all__ = [
17
+ "ROI_KEYPOINT_HEAD_REGISTRY",
18
+ "build_keypoint_head",
19
+ "BaseKeypointRCNNHead",
20
+ "KRCNNConvDeconvUpsampleHead",
21
+ ]
22
+
23
+
24
+ ROI_KEYPOINT_HEAD_REGISTRY = Registry("ROI_KEYPOINT_HEAD")
25
+ ROI_KEYPOINT_HEAD_REGISTRY.__doc__ = """
26
+ Registry for keypoint heads, which make keypoint predictions from per-region features.
27
+
28
+ The registered object will be called with `obj(cfg, input_shape)`.
29
+ """
30
+
31
+
32
+ def build_keypoint_head(cfg, input_shape):
33
+ """
34
+ Build a keypoint head from `cfg.MODEL.ROI_KEYPOINT_HEAD.NAME`.
35
+ """
36
+ name = cfg.MODEL.ROI_KEYPOINT_HEAD.NAME
37
+ return ROI_KEYPOINT_HEAD_REGISTRY.get(name)(cfg, input_shape)
38
+
39
+
40
+ def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer):
41
+ """
42
+ Arguments:
43
+ pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number
44
+ of instances in the batch, K is the number of keypoints, and S is the side length
45
+ of the keypoint heatmap. The values are spatial logits.
46
+ instances (list[Instances]): A list of M Instances, where M is the batch size.
47
+ These instances are predictions from the model
48
+ that are in 1:1 correspondence with pred_keypoint_logits.
49
+ Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint`
50
+ instance.
51
+ normalizer (float): Normalize the loss by this amount.
52
+ If not specified, we normalize by the number of visible keypoints in the minibatch.
53
+
54
+ Returns a scalar tensor containing the loss.
55
+ """
56
+ heatmaps = []
57
+ valid = []
58
+
59
+ keypoint_side_len = pred_keypoint_logits.shape[2]
60
+ for instances_per_image in instances:
61
+ if len(instances_per_image) == 0:
62
+ continue
63
+ keypoints = instances_per_image.gt_keypoints
64
+ heatmaps_per_image, valid_per_image = keypoints.to_heatmap(
65
+ instances_per_image.proposal_boxes.tensor, keypoint_side_len
66
+ )
67
+ heatmaps.append(heatmaps_per_image.view(-1))
68
+ valid.append(valid_per_image.view(-1))
69
+
70
+ if len(heatmaps):
71
+ keypoint_targets = cat(heatmaps, dim=0)
72
+ valid = cat(valid, dim=0).to(dtype=torch.uint8)
73
+ valid = torch.nonzero(valid).squeeze(1)
74
+
75
+ # torch.mean (in binary_cross_entropy_with_logits) doesn't
76
+ # accept empty tensors, so handle it separately
77
+ if len(heatmaps) == 0 or valid.numel() == 0:
78
+ global _TOTAL_SKIPPED
79
+ _TOTAL_SKIPPED += 1
80
+ storage = get_event_storage()
81
+ storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False)
82
+ return pred_keypoint_logits.sum() * 0
83
+
84
+ N, K, H, W = pred_keypoint_logits.shape
85
+ pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W)
86
+
87
+ keypoint_loss = F.cross_entropy(
88
+ pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum"
89
+ )
90
+
91
+ # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch
92
+ if normalizer is None:
93
+ normalizer = valid.numel()
94
+ keypoint_loss /= normalizer
95
+
96
+ return keypoint_loss
97
+
98
+
99
+ def keypoint_rcnn_inference(pred_keypoint_logits: torch.Tensor, pred_instances: List[Instances]):
100
+ """
101
+ Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score)
102
+ and add it to the `pred_instances` as a `pred_keypoints` field.
103
+
104
+ Args:
105
+ pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number
106
+ of instances in the batch, K is the number of keypoints, and S is the side length of
107
+ the keypoint heatmap. The values are spatial logits.
108
+ pred_instances (list[Instances]): A list of N Instances, where N is the number of images.
109
+
110
+ Returns:
111
+ None. Each element in pred_instances will contain extra "pred_keypoints" and
112
+ "pred_keypoint_heatmaps" fields. "pred_keypoints" is a tensor of shape
113
+ (#instance, K, 3) where the last dimension corresponds to (x, y, score).
114
+ The scores are larger than 0. "pred_keypoint_heatmaps" contains the raw
115
+ keypoint logits as passed to this function.
116
+ """
117
+ # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor)
118
+ bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0)
119
+
120
+ pred_keypoint_logits = pred_keypoint_logits.detach()
121
+ keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits, bboxes_flat.detach())
122
+ num_instances_per_image = [len(i) for i in pred_instances]
123
+ keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0)
124
+ heatmap_results = pred_keypoint_logits.split(num_instances_per_image, dim=0)
125
+
126
+ for keypoint_results_per_image, heatmap_results_per_image, instances_per_image in zip(
127
+ keypoint_results, heatmap_results, pred_instances
128
+ ):
129
+ # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score)
130
+ # heatmap_results_per_image is (num instances)x(num keypoints)x(side)x(side)
131
+ instances_per_image.pred_keypoints = keypoint_results_per_image
132
+ instances_per_image.pred_keypoint_heatmaps = heatmap_results_per_image
133
+
134
+
135
+ class BaseKeypointRCNNHead(nn.Module):
136
+ """
137
+ Implement the basic Keypoint R-CNN losses and inference logic described in
138
+ Sec. 5 of :paper:`Mask R-CNN`.
139
+ """
140
+
141
+ @configurable
142
+ def __init__(self, *, num_keypoints, loss_weight=1.0, loss_normalizer=1.0):
143
+ """
144
+ NOTE: this interface is experimental.
145
+
146
+ Args:
147
+ num_keypoints (int): number of keypoints to predict
148
+ loss_weight (float): weight to multiply the keypoint loss by
149
+ loss_normalizer (float or str):
150
+ If float, divide the loss by `loss_normalizer * #images`.
151
+ If 'visible', the loss is normalized by the total number of
152
+ visible keypoints across images.
153
+ """
154
+ super().__init__()
155
+ self.num_keypoints = num_keypoints
156
+ self.loss_weight = loss_weight
157
+ assert loss_normalizer == "visible" or isinstance(loss_normalizer, float), loss_normalizer
158
+ self.loss_normalizer = loss_normalizer
159
+
160
+ @classmethod
161
+ def from_config(cls, cfg, input_shape):
162
+ ret = {
163
+ "loss_weight": cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT,
164
+ "num_keypoints": cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS,
165
+ }
166
+ normalize_by_visible = (
167
+ cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS
168
+ ) # noqa
169
+ if not normalize_by_visible:
170
+ batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE
171
+ positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
172
+ ret["loss_normalizer"] = (
173
+ ret["num_keypoints"] * batch_size_per_image * positive_sample_fraction
174
+ )
175
+ else:
176
+ ret["loss_normalizer"] = "visible"
177
+ return ret
178
+
179
+ def forward(self, x, instances: List[Instances]):
180
+ """
181
+ Args:
182
+ x: input 4D region feature(s) provided by :class:`ROIHeads`.
183
+ instances (list[Instances]): contains the boxes & labels corresponding
184
+ to the input features.
185
+ Exact format is up to its caller to decide.
186
+ Typically, this is the foreground instances in training, with
187
+ "proposal_boxes" field and other gt annotations.
188
+ In inference, it contains boxes that are already predicted.
189
+
190
+ Returns:
191
+ A dict of losses if in training. The predicted "instances" if in inference.
192
+ """
193
+ x = self.layers(x)
194
+ if self.training:
195
+ num_images = len(instances)
196
+ normalizer = (
197
+ None if self.loss_normalizer == "visible" else num_images * self.loss_normalizer
198
+ )
199
+ return {
200
+ "loss_keypoint": keypoint_rcnn_loss(x, instances, normalizer=normalizer)
201
+ * self.loss_weight
202
+ }
203
+ else:
204
+ keypoint_rcnn_inference(x, instances)
205
+ return instances
206
+
207
+ def layers(self, x):
208
+ """
209
+ Neural network layers that makes predictions from regional input features.
210
+ """
211
+ raise NotImplementedError
212
+
213
+
214
+ # To get torchscript support, we make the head a subclass of `nn.Sequential`.
215
+ # Therefore, to add new layers in this head class, please make sure they are
216
+ # added in the order they will be used in forward().
217
+ @ROI_KEYPOINT_HEAD_REGISTRY.register()
218
+ class KRCNNConvDeconvUpsampleHead(BaseKeypointRCNNHead, nn.Sequential):
219
+ """
220
+ A standard keypoint head containing a series of 3x3 convs, followed by
221
+ a transpose convolution and bilinear interpolation for upsampling.
222
+ It is described in Sec. 5 of :paper:`Mask R-CNN`.
223
+ """
224
+
225
+ @configurable
226
+ def __init__(self, input_shape, *, num_keypoints, conv_dims, **kwargs):
227
+ """
228
+ NOTE: this interface is experimental.
229
+
230
+ Args:
231
+ input_shape (ShapeSpec): shape of the input feature
232
+ conv_dims: an iterable of output channel counts for each conv in the head
233
+ e.g. (512, 512, 512) for three convs outputting 512 channels.
234
+ """
235
+ super().__init__(num_keypoints=num_keypoints, **kwargs)
236
+
237
+ # default up_scale to 2.0 (this can be made an option)
238
+ up_scale = 2.0
239
+ in_channels = input_shape.channels
240
+
241
+ for idx, layer_channels in enumerate(conv_dims, 1):
242
+ module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1)
243
+ self.add_module("conv_fcn{}".format(idx), module)
244
+ self.add_module("conv_fcn_relu{}".format(idx), nn.ReLU())
245
+ in_channels = layer_channels
246
+
247
+ deconv_kernel = 4
248
+ self.score_lowres = ConvTranspose2d(
249
+ in_channels, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1
250
+ )
251
+ self.up_scale = up_scale
252
+
253
+ for name, param in self.named_parameters():
254
+ if "bias" in name:
255
+ nn.init.constant_(param, 0)
256
+ elif "weight" in name:
257
+ # Caffe2 implementation uses MSRAFill, which in fact
258
+ # corresponds to kaiming_normal_ in PyTorch
259
+ nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")
260
+
261
+ @classmethod
262
+ def from_config(cls, cfg, input_shape):
263
+ ret = super().from_config(cfg, input_shape)
264
+ ret["input_shape"] = input_shape
265
+ ret["conv_dims"] = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS
266
+ return ret
267
+
268
+ def layers(self, x):
269
+ for layer in self:
270
+ x = layer(x)
271
+ x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False)
272
+ return x
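In KRCNNConvDeconvUpsampleHead, the stride-2 deconv followed by the 2x bilinear upsample turns a 14x14 pooled RoI feature into a 56x56 per-keypoint heatmap, which is what keypoint_rcnn_inference hands to heatmaps_to_keypoints. A hedged sketch checking that arithmetic, not part of this commit; the 17 keypoints and 8-conv tower are the usual COCO settings, assumed here only for illustration:

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.keypoint_head import (
    KRCNNConvDeconvUpsampleHead,
)

head = KRCNNConvDeconvUpsampleHead(
    ShapeSpec(channels=256, height=14, width=14),
    num_keypoints=17,        # COCO person keypoints
    conv_dims=[512] * 8,     # an 8-conv tower
)
x = torch.randn(4, 256, 14, 14)  # 4 RoIs
heatmaps = head.layers(x)
print(heatmaps.shape)            # torch.Size([4, 17, 56, 56])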
extensions/microsoftexcel-controlnet/annotator/oneformer/detectron2/modeling/roi_heads/mask_head.py ADDED
@@ -0,0 +1,298 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ from typing import List
3
+ import fvcore.nn.weight_init as weight_init
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from annotator.oneformer.detectron2.config import configurable
9
+ from annotator.oneformer.detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, get_norm
10
+ from annotator.oneformer.detectron2.layers.wrappers import move_device_like
11
+ from annotator.oneformer.detectron2.structures import Instances
12
+ from annotator.oneformer.detectron2.utils.events import get_event_storage
13
+ from annotator.oneformer.detectron2.utils.registry import Registry
14
+
15
+ __all__ = [
16
+ "BaseMaskRCNNHead",
17
+ "MaskRCNNConvUpsampleHead",
18
+ "build_mask_head",
19
+ "ROI_MASK_HEAD_REGISTRY",
20
+ ]
21
+
22
+
23
+ ROI_MASK_HEAD_REGISTRY = Registry("ROI_MASK_HEAD")
24
+ ROI_MASK_HEAD_REGISTRY.__doc__ = """
25
+ Registry for mask heads, which predict instance masks given
26
+ per-region features.
27
+
28
+ The registered object will be called with `obj(cfg, input_shape)`.
29
+ """
30
+
31
+
32
+ @torch.jit.unused
33
+ def mask_rcnn_loss(pred_mask_logits: torch.Tensor, instances: List[Instances], vis_period: int = 0):
34
+ """
35
+ Compute the mask prediction loss defined in the Mask R-CNN paper.
36
+
37
+ Args:
38
+ pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
39
+ for class-specific or class-agnostic, where B is the total number of predicted masks
40
+ in all images, C is the number of foreground classes, and Hmask, Wmask are the height
41
+ and width of the mask predictions. The values are logits.
42
+ instances (list[Instances]): A list of N Instances, where N is the number of images
43
+ in the batch. These instances are in 1:1
44
+ correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask,
45
+ ...) associated with each instance are stored in fields.
46
+ vis_period (int): the period (in steps) to dump visualization.
47
+
48
+ Returns:
49
+ mask_loss (Tensor): A scalar tensor containing the loss.
50
+ """
51
+ cls_agnostic_mask = pred_mask_logits.size(1) == 1
52
+ total_num_masks = pred_mask_logits.size(0)
53
+ mask_side_len = pred_mask_logits.size(2)
54
+ assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!"
55
+
56
+ gt_classes = []
57
+ gt_masks = []
58
+ for instances_per_image in instances:
59
+ if len(instances_per_image) == 0:
60
+ continue
61
+ if not cls_agnostic_mask:
62
+ gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
63
+ gt_classes.append(gt_classes_per_image)
64
+
65
+ gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
66
+ instances_per_image.proposal_boxes.tensor, mask_side_len
67
+ ).to(device=pred_mask_logits.device)
68
+ # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
69
+ gt_masks.append(gt_masks_per_image)
70
+
71
+ if len(gt_masks) == 0:
72
+ return pred_mask_logits.sum() * 0
73
+
74
+ gt_masks = cat(gt_masks, dim=0)
75
+
76
+ if cls_agnostic_mask:
77
+ pred_mask_logits = pred_mask_logits[:, 0]
78
+ else:
79
+ indices = torch.arange(total_num_masks)
80
+ gt_classes = cat(gt_classes, dim=0)
81
+ pred_mask_logits = pred_mask_logits[indices, gt_classes]
82
+
83
+ if gt_masks.dtype == torch.bool:
84
+ gt_masks_bool = gt_masks
85
+ else:
86
+ # Here we allow gt_masks to be float as well (depend on the implementation of rasterize())
87
+ gt_masks_bool = gt_masks > 0.5
88
+ gt_masks = gt_masks.to(dtype=torch.float32)
89
+
90
+ # Log the training accuracy (using gt classes and 0.5 threshold)
91
+ mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
92
+ mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0))
93
+ num_positive = gt_masks_bool.sum().item()
94
+ false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
95
+ gt_masks_bool.numel() - num_positive, 1.0
96
+ )
97
+ false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0)
98
+
99
+ storage = get_event_storage()
100
+ storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
101
+ storage.put_scalar("mask_rcnn/false_positive", false_positive)
102
+ storage.put_scalar("mask_rcnn/false_negative", false_negative)
103
+ if vis_period > 0 and storage.iter % vis_period == 0:
104
+ pred_masks = pred_mask_logits.sigmoid()
105
+ vis_masks = torch.cat([pred_masks, gt_masks], axis=2)
106
+ name = "Left: mask prediction; Right: mask GT"
107
+ for idx, vis_mask in enumerate(vis_masks):
108
+ vis_mask = torch.stack([vis_mask] * 3, axis=0)
109
+ storage.put_image(name + f" ({idx})", vis_mask)
110
+
111
+ mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits, gt_masks, reduction="mean")
112
+ return mask_loss
113
+
114
+
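The key step in mask_rcnn_loss above is selecting, for every predicted mask, only the logit channel of its ground-truth class before applying binary cross-entropy. A minimal sketch of that indexing on dummy tensors (the shapes are hypothetical; the real function additionally rasterizes the ground-truth masks and logs metrics):

import torch
import torch.nn.functional as F

pred_mask_logits = torch.randn(6, 80, 28, 28)           # (B, C, Hmask, Wmask) logits
gt_classes = torch.randint(0, 80, (6,))                  # gt class id for each predicted mask
gt_masks = (torch.rand(6, 28, 28) > 0.5).float()         # dummy rasterized gt masks

# Keep only the channel of each mask's gt class, as the loss does.
logits = pred_mask_logits[torch.arange(6), gt_classes]   # (6, 28, 28)
loss = F.binary_cross_entropy_with_logits(logits, gt_masks, reduction="mean")
print(loss.item())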
+ def mask_rcnn_inference(pred_mask_logits: torch.Tensor, pred_instances: List[Instances]):
+     """
+     Convert pred_mask_logits to estimated foreground probability masks while also
+     extracting only the masks for the predicted classes in pred_instances. For each
+     predicted box, the mask of the same class is attached to the instance by adding a
+     new "pred_masks" field to pred_instances.
+
+     Args:
+         pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+             for class-specific or class-agnostic, where B is the total number of predicted masks
+             in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+             and width of the mask predictions. The values are logits.
+         pred_instances (list[Instances]): A list of N Instances, where N is the number of images
+             in the batch. Each Instances must have field "pred_classes".
+
+     Returns:
+         None. pred_instances will contain an extra "pred_masks" field storing a mask of size
+             (Hmask, Wmask) for each predicted class. Note that the masks are returned as soft
+             (non-quantized) masks at the resolution predicted by the network; post-processing
+             steps, such as resizing the predicted masks to the original image resolution and/or
+             binarizing them, are left to the caller.
+     """
+     cls_agnostic_mask = pred_mask_logits.size(1) == 1
+
+     if cls_agnostic_mask:
+         mask_probs_pred = pred_mask_logits.sigmoid()
+     else:
+         # Select masks corresponding to the predicted classes
+         num_masks = pred_mask_logits.shape[0]
+         class_pred = cat([i.pred_classes for i in pred_instances])
+         device = (
+             class_pred.device
+             if torch.jit.is_scripting()
+             else ("cpu" if torch.jit.is_tracing() else class_pred.device)
+         )
+         indices = move_device_like(torch.arange(num_masks, device=device), class_pred)
+         mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid()
+     # mask_probs_pred.shape: (B, 1, Hmask, Wmask)
+
+     num_boxes_per_image = [len(i) for i in pred_instances]
+     mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0)
+
+     for prob, instances in zip(mask_probs_pred, pred_instances):
+         instances.pred_masks = prob # (1, Hmask, Wmask)
+
+
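A hedged usage sketch for mask_rcnn_inference (it assumes the vendored import path used in this file resolves at runtime; the image size, class ids, and tensor shapes are made up for illustration):

import torch
from annotator.oneformer.detectron2.structures import Instances
from annotator.oneformer.detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference

inst = Instances((480, 640))               # hypothetical image size
inst.pred_classes = torch.tensor([3, 7])   # two detections with arbitrary class ids
logits = torch.randn(2, 80, 28, 28)        # class-specific mask logits for those two boxes
mask_rcnn_inference(logits, [inst])        # attaches soft masks to the instances in place
print(inst.pred_masks.shape)               # torch.Size([2, 1, 28, 28])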
+ class BaseMaskRCNNHead(nn.Module):
+     """
+     Implement the basic Mask R-CNN losses and inference logic described in :paper:`Mask R-CNN`
+     """
+
+     @configurable
+     def __init__(self, *, loss_weight: float = 1.0, vis_period: int = 0):
+         """
+         NOTE: this interface is experimental.
+
+         Args:
+             loss_weight (float): multiplier of the loss
+             vis_period (int): visualization period
+         """
+         super().__init__()
+         self.vis_period = vis_period
+         self.loss_weight = loss_weight
+
+     @classmethod
+     def from_config(cls, cfg, input_shape):
+         return {"vis_period": cfg.VIS_PERIOD}
+
+     def forward(self, x, instances: List[Instances]):
+         """
+         Args:
+             x: input region feature(s) provided by :class:`ROIHeads`.
+             instances (list[Instances]): contains the boxes & labels corresponding
+                 to the input features.
+                 Exact format is up to its caller to decide.
+                 Typically, this is the foreground instances in training, with
+                 "proposal_boxes" field and other gt annotations.
+                 In inference, it contains boxes that are already predicted.
+
+         Returns:
+             A dict of losses in training. The predicted "instances" in inference.
+         """
+         x = self.layers(x)
+         if self.training:
+             return {"loss_mask": mask_rcnn_loss(x, instances, self.vis_period) * self.loss_weight}
+         else:
+             mask_rcnn_inference(x, instances)
+             return instances
+
+     def layers(self, x):
+         """
+         Neural network layers that make predictions from input features.
+         """
+         raise NotImplementedError
+
+
+ # To get torchscript support, we make the head a subclass of `nn.Sequential`.
+ # Therefore, to add new layers in this head class, please make sure they are
+ # added in the order they will be used in forward().
+ @ROI_MASK_HEAD_REGISTRY.register()
+ class MaskRCNNConvUpsampleHead(BaseMaskRCNNHead, nn.Sequential):
+     """
+     A mask head with several conv layers, plus an upsample layer (with `ConvTranspose2d`).
+     Predictions are made with a final 1x1 conv layer.
+     """
+
+     @configurable
+     def __init__(self, input_shape: ShapeSpec, *, num_classes, conv_dims, conv_norm="", **kwargs):
+         """
+         NOTE: this interface is experimental.
+
+         Args:
+             input_shape (ShapeSpec): shape of the input feature
+             num_classes (int): the number of foreground classes (i.e. background is not
+                 included). 1 if using class agnostic prediction.
+             conv_dims (list[int]): a list of N>0 integers representing the output dimensions
+                 of N-1 conv layers and the last upsample layer.
+             conv_norm (str or callable): normalization for the conv layers.
+                 See :func:`detectron2.layers.get_norm` for supported types.
+         """
+         super().__init__(**kwargs)
+         assert len(conv_dims) >= 1, "conv_dims have to be non-empty!"
+
+         self.conv_norm_relus = []
+
+         cur_channels = input_shape.channels
+         for k, conv_dim in enumerate(conv_dims[:-1]):
+             conv = Conv2d(
+                 cur_channels,
+                 conv_dim,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 bias=not conv_norm,
+                 norm=get_norm(conv_norm, conv_dim),
+                 activation=nn.ReLU(),
+             )
+             self.add_module("mask_fcn{}".format(k + 1), conv)
+             self.conv_norm_relus.append(conv)
+             cur_channels = conv_dim
+
+         self.deconv = ConvTranspose2d(
+             cur_channels, conv_dims[-1], kernel_size=2, stride=2, padding=0
+         )
+         self.add_module("deconv_relu", nn.ReLU())
+         cur_channels = conv_dims[-1]
+
+         self.predictor = Conv2d(cur_channels, num_classes, kernel_size=1, stride=1, padding=0)
+
+         for layer in self.conv_norm_relus + [self.deconv]:
+             weight_init.c2_msra_fill(layer)
+         # use normal distribution initialization for mask prediction layer
+         nn.init.normal_(self.predictor.weight, std=0.001)
+         if self.predictor.bias is not None:
+             nn.init.constant_(self.predictor.bias, 0)
+
+     @classmethod
+     def from_config(cls, cfg, input_shape):
+         ret = super().from_config(cfg, input_shape)
+         conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM
+         num_conv = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV
+         ret.update(
+             conv_dims=[conv_dim] * (num_conv + 1), # +1 for ConvTranspose
+             conv_norm=cfg.MODEL.ROI_MASK_HEAD.NORM,
+             input_shape=input_shape,
+         )
+         if cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK:
+             ret["num_classes"] = 1
+         else:
+             ret["num_classes"] = cfg.MODEL.ROI_HEADS.NUM_CLASSES
+         return ret
+
+     def layers(self, x):
+         for layer in self:
+             x = layer(x)
+         return x
+
+
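Because MaskRCNNConvUpsampleHead is @configurable, it can also be built without a cfg by passing the __init__ arguments directly. A sketch with made-up sizes (256 channels, four 3x3 convs, 80 classes), mirroring what from_config would assemble; the import path assumes this vendored layout:

import torch
from annotator.oneformer.detectron2.layers import ShapeSpec
from annotator.oneformer.detectron2.modeling.roi_heads.mask_head import MaskRCNNConvUpsampleHead

head = MaskRCNNConvUpsampleHead(
    ShapeSpec(channels=256),
    num_classes=80,
    conv_dims=[256] * 5,  # 4 conv layers + the final ConvTranspose2d
)
x = torch.randn(2, 256, 14, 14)  # dummy pooled ROI features
print(head.layers(x).shape)      # torch.Size([2, 80, 28, 28]) -- the deconv doubles 14 -> 28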
+ def build_mask_head(cfg, input_shape):
+     """
+     Build a mask head defined by `cfg.MODEL.ROI_MASK_HEAD.NAME`.
+     """
+     name = cfg.MODEL.ROI_MASK_HEAD.NAME
+     return ROI_MASK_HEAD_REGISTRY.get(name)(cfg, input_shape)
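build_mask_head resolves cfg.MODEL.ROI_MASK_HEAD.NAME through the registry, so a custom head only needs to be registered and then named in the config. A sketch under those assumptions (TinyMaskHead is a hypothetical example, not part of this diff):

from torch import nn
from annotator.oneformer.detectron2.modeling.roi_heads.mask_head import (
    ROI_MASK_HEAD_REGISTRY,
    BaseMaskRCNNHead,
)

@ROI_MASK_HEAD_REGISTRY.register()
class TinyMaskHead(BaseMaskRCNNHead):
    """Single 1x1 conv mask predictor, registered so the config can select it by name."""

    def __init__(self, cfg, input_shape):
        super().__init__(vis_period=cfg.VIS_PERIOD)
        self.predictor = nn.Conv2d(input_shape.channels, cfg.MODEL.ROI_HEADS.NUM_CLASSES, 1)

    def layers(self, x):
        return self.predictor(x)

# Then set cfg.MODEL.ROI_MASK_HEAD.NAME = "TinyMaskHead" and call build_mask_head(cfg, input_shape).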